Ryujinx/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs
riperiperi bf77d1cab9
GPU: Pass SpanOrArray for Texture SetData to avoid copy (#3745)
* GPU: Pass SpanOrArray for Texture SetData to avoid copy

Texture data is often converted before upload, meaning that an array was allocated to perform the conversion into. However, the backend SetData methods were being passed a Span of that data, and the Multithreaded layer does `ToArray()` on it so that it can be stored for later! This method can't extract the original array, so it creates a copy.

This PR changes the type passed for textures to a new ref struct called SpanOrArray, which is backed by either a ReadOnlySpan or an array. The benefit here is that we can have a ToArray method that doesn't copy if it is originally backed by an array.

This will also avoid a copy when running the ASTC decoder.

On NieR this was taking 38% of texture upload time, which it does a _lot_ of when you move between areas, so there should be a 1.6x performance boost when strictly uploading textures. No doubt this will also improve texture streaming performance in UE4 games, and maybe a small reduction with video playback.

From the numbers, it's probably possible to improve the upload rate by a further 1.6x by performing layout conversion on GPU. I'm not sure if we could improve it further than that - multithreading conversion on CPU would probably result in memory bottleneck.

This doesn't extend to buffers, since we don't convert their data on the GPU emulator side.

* Remove implicit cast to array.
2022-10-08 12:04:47 -03:00

455 lines
19 KiB
C#

using Ryujinx.Common;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Gpu.Engine.Threed;
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
namespace Ryujinx.Graphics.Gpu.Engine.Dma
{
/// <summary>
/// Represents a DMA copy engine class.
/// </summary>
class DmaClass : IDeviceState
{
private readonly GpuContext _context;
private readonly GpuChannel _channel;
private readonly ThreedClass _3dEngine;
private readonly DeviceState<DmaClassState> _state;
/// <summary>
/// Copy flags passed on DMA launch.
/// </summary>
[Flags]
private enum CopyFlags
{
SrcLinear = 1 << 7,
DstLinear = 1 << 8,
MultiLineEnable = 1 << 9,
RemapEnable = 1 << 10
}
/// <summary>
/// Creates a new instance of the DMA copy engine class.
/// </summary>
/// <param name="context">GPU context</param>
/// <param name="channel">GPU channel</param>
/// <param name="threedEngine">3D engine</param>
public DmaClass(GpuContext context, GpuChannel channel, ThreedClass threedEngine)
{
_context = context;
_channel = channel;
_3dEngine = threedEngine;
_state = new DeviceState<DmaClassState>(new Dictionary<string, RwCallback>
{
{ nameof(DmaClassState.LaunchDma), new RwCallback(LaunchDma, null) }
});
}
/// <summary>
/// Reads data from the class registers.
/// </summary>
/// <param name="offset">Register byte offset</param>
/// <returns>Data at the specified offset</returns>
public int Read(int offset) => _state.Read(offset);
/// <summary>
/// Writes data to the class registers.
/// </summary>
/// <param name="offset">Register byte offset</param>
/// <param name="data">Data to be written</param>
public void Write(int offset, int data) => _state.Write(offset, data);
/// <summary>
/// Determine if a buffer-to-texture region covers the entirety of a texture.
/// </summary>
/// <param name="tex">Texture to compare</param>
/// <param name="linear">True if the texture is linear, false if block linear</param>
/// <param name="bpp">Texture bytes per pixel</param>
/// <param name="stride">Texture stride</param>
/// <param name="xCount">Number of pixels to be copied</param>
/// <param name="yCount">Number of lines to be copied</param>
/// <returns></returns>
private static bool IsTextureCopyComplete(DmaTexture tex, bool linear, int bpp, int stride, int xCount, int yCount)
{
if (linear)
{
// If the stride is negative, the texture has to be flipped, so
// the fast copy is not trivial, use the slow path.
if (stride <= 0)
{
return false;
}
int alignWidth = Constants.StrideAlignment / bpp;
return stride / bpp == BitUtils.AlignUp(xCount, alignWidth);
}
else
{
int alignWidth = Constants.GobAlignment / bpp;
return tex.RegionX == 0 &&
tex.RegionY == 0 &&
tex.Width == BitUtils.AlignUp(xCount, alignWidth) &&
tex.Height == yCount;
}
}
/// <summary>
/// Releases a semaphore for a given LaunchDma method call.
/// </summary>
/// <param name="argument">The LaunchDma call argument</param>
private void ReleaseSemaphore(int argument)
{
LaunchDmaSemaphoreType type = (LaunchDmaSemaphoreType)((argument >> 3) & 0x3);
if (type != LaunchDmaSemaphoreType.None)
{
ulong address = ((ulong)_state.State.SetSemaphoreA << 32) | _state.State.SetSemaphoreB;
if (type == LaunchDmaSemaphoreType.ReleaseOneWordSemaphore)
{
_channel.MemoryManager.Write(address, _state.State.SetSemaphorePayload);
}
else /* if (type == LaunchDmaSemaphoreType.ReleaseFourWordSemaphore) */
{
_channel.MemoryManager.Write(address + 8, _context.GetTimestamp());
_channel.MemoryManager.Write(address, (ulong)_state.State.SetSemaphorePayload);
}
}
}
/// <summary>
/// Performs a buffer to buffer, or buffer to texture copy.
/// </summary>
/// <param name="argument">The LaunchDma call argument</param>
private void DmaCopy(int argument)
{
var memoryManager = _channel.MemoryManager;
CopyFlags copyFlags = (CopyFlags)argument;
bool srcLinear = copyFlags.HasFlag(CopyFlags.SrcLinear);
bool dstLinear = copyFlags.HasFlag(CopyFlags.DstLinear);
bool copy2D = copyFlags.HasFlag(CopyFlags.MultiLineEnable);
bool remap = copyFlags.HasFlag(CopyFlags.RemapEnable);
uint size = _state.State.LineLengthIn;
if (size == 0)
{
return;
}
ulong srcGpuVa = ((ulong)_state.State.OffsetInUpperUpper << 32) | _state.State.OffsetInLower;
ulong dstGpuVa = ((ulong)_state.State.OffsetOutUpperUpper << 32) | _state.State.OffsetOutLower;
int xCount = (int)_state.State.LineLengthIn;
int yCount = (int)_state.State.LineCount;
_3dEngine.CreatePendingSyncs();
_3dEngine.FlushUboDirty();
if (copy2D)
{
// Buffer to texture copy.
int componentSize = (int)_state.State.SetRemapComponentsComponentSize + 1;
int srcBpp = remap ? ((int)_state.State.SetRemapComponentsNumSrcComponents + 1) * componentSize : 1;
int dstBpp = remap ? ((int)_state.State.SetRemapComponentsNumDstComponents + 1) * componentSize : 1;
var dst = Unsafe.As<uint, DmaTexture>(ref _state.State.SetDstBlockSize);
var src = Unsafe.As<uint, DmaTexture>(ref _state.State.SetSrcBlockSize);
int srcRegionX = 0, srcRegionY = 0, dstRegionX = 0, dstRegionY = 0;
if (!srcLinear)
{
srcRegionX = src.RegionX;
srcRegionY = src.RegionY;
}
if (!dstLinear)
{
dstRegionX = dst.RegionX;
dstRegionY = dst.RegionY;
}
int srcStride = (int)_state.State.PitchIn;
int dstStride = (int)_state.State.PitchOut;
var srcCalculator = new OffsetCalculator(
src.Width,
src.Height,
srcStride,
srcLinear,
src.MemoryLayout.UnpackGobBlocksInY(),
src.MemoryLayout.UnpackGobBlocksInZ(),
srcBpp);
var dstCalculator = new OffsetCalculator(
dst.Width,
dst.Height,
dstStride,
dstLinear,
dst.MemoryLayout.UnpackGobBlocksInY(),
dst.MemoryLayout.UnpackGobBlocksInZ(),
dstBpp);
(int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(srcRegionX, srcRegionY, xCount, yCount);
(int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dstRegionX, dstRegionY, xCount, yCount);
if (srcLinear && srcStride < 0)
{
srcBaseOffset += srcStride * (yCount - 1);
}
if (dstLinear && dstStride < 0)
{
dstBaseOffset += dstStride * (yCount - 1);
}
ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
if (completeSource && completeDest)
{
var target = memoryManager.Physical.TextureCache.FindTexture(
memoryManager,
dstGpuVa,
dstBpp,
dstStride,
dst.Height,
xCount,
yCount,
dstLinear,
dst.MemoryLayout.UnpackGobBlocksInY(),
dst.MemoryLayout.UnpackGobBlocksInZ());
if (target != null)
{
byte[] data;
if (srcLinear)
{
data = LayoutConverter.ConvertLinearStridedToLinear(
target.Info.Width,
target.Info.Height,
1,
1,
xCount * srcBpp,
srcStride,
target.Info.FormatInfo.BytesPerPixel,
srcSpan);
}
else
{
data = LayoutConverter.ConvertBlockLinearToLinear(
src.Width,
src.Height,
src.Depth,
1,
1,
1,
1,
1,
srcBpp,
src.MemoryLayout.UnpackGobBlocksInY(),
src.MemoryLayout.UnpackGobBlocksInZ(),
1,
new SizeInfo((int)target.Size),
srcSpan);
}
target.SynchronizeMemory();
target.SetData(data);
target.SignalModified();
return;
}
else if (srcCalculator.LayoutMatches(dstCalculator))
{
// No layout conversion has to be performed, just copy the data entirely.
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
return;
}
}
unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
{
if (srcLinear && dstLinear && srcBpp == dstBpp)
{
// Optimized path for purely linear copies - we don't need to calculate every single byte offset,
// and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
int srcOffset = srcCalculator.GetOffset(srcRegionX);
int dstOffset = dstCalculator.GetOffset(dstRegionX);
srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
.CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
}
}
else
{
fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
{
byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
byte* srcBase = srcPtr - srcBaseOffset;
for (int y = 0; y < yCount; y++)
{
srcCalculator.SetY(srcRegionY + y);
dstCalculator.SetY(dstRegionY + y);
for (int x = 0; x < xCount; x++)
{
int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
*(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
}
}
}
}
return true;
}
// OPT: This allocates a (potentially) huge temporary array and then copies an existing
// region of memory into it, data that might get overwritten entirely anyways. Ideally this should
// all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
bool _ = srcBpp switch
{
1 => Convert<byte>(dstSpan, srcSpan),
2 => Convert<ushort>(dstSpan, srcSpan),
4 => Convert<uint>(dstSpan, srcSpan),
8 => Convert<ulong>(dstSpan, srcSpan),
12 => Convert<Bpp12Pixel>(dstSpan, srcSpan),
16 => Convert<Vector128<byte>>(dstSpan, srcSpan),
_ => throw new NotSupportedException($"Unable to copy ${srcBpp} bpp pixel format.")
};
memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
}
else
{
if (remap &&
_state.State.SetRemapComponentsDstX == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstY == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstZ == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsDstW == SetRemapComponentsDst.ConstA &&
_state.State.SetRemapComponentsNumSrcComponents == SetRemapComponentsNumComponents.One &&
_state.State.SetRemapComponentsNumDstComponents == SetRemapComponentsNumComponents.One &&
_state.State.SetRemapComponentsComponentSize == SetRemapComponentsComponentSize.Four)
{
// Fast path for clears when remap is enabled.
memoryManager.Physical.BufferCache.ClearBuffer(memoryManager, dstGpuVa, size * 4, _state.State.SetRemapConstA);
}
else
{
// TODO: Implement remap functionality.
// Buffer to buffer copy.
bool srcIsPitchKind = memoryManager.GetKind(srcGpuVa).IsPitch();
bool dstIsPitchKind = memoryManager.GetKind(dstGpuVa).IsPitch();
if (!srcIsPitchKind && dstIsPitchKind)
{
CopyGobBlockLinearToLinear(memoryManager, srcGpuVa, dstGpuVa, size);
}
else if (srcIsPitchKind && !dstIsPitchKind)
{
CopyGobLinearToBlockLinear(memoryManager, srcGpuVa, dstGpuVa, size);
}
else
{
memoryManager.Physical.BufferCache.CopyBuffer(memoryManager, srcGpuVa, dstGpuVa, size);
}
}
}
}
/// <summary>
/// Copies block linear data with block linear GOBs to a block linear destination with linear GOBs.
/// </summary>
/// <param name="memoryManager">GPU memory manager</param>
/// <param name="srcGpuVa">Source GPU virtual address</param>
/// <param name="dstGpuVa">Destination GPU virtual address</param>
/// <param name="size">Size in bytes of the copy</param>
private static void CopyGobBlockLinearToLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
{
if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
{
for (ulong offset = 0; offset < size; offset += 16)
{
Vector128<byte> data = memoryManager.Read<Vector128<byte>>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
memoryManager.Write(dstGpuVa + offset, data);
}
}
else
{
for (ulong offset = 0; offset < size; offset++)
{
byte data = memoryManager.Read<byte>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
memoryManager.Write(dstGpuVa + offset, data);
}
}
}
/// <summary>
/// Copies block linear data with linear GOBs to a block linear destination with block linear GOBs.
/// </summary>
/// <param name="memoryManager">GPU memory manager</param>
/// <param name="srcGpuVa">Source GPU virtual address</param>
/// <param name="dstGpuVa">Destination GPU virtual address</param>
/// <param name="size">Size in bytes of the copy</param>
private static void CopyGobLinearToBlockLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
{
if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
{
for (ulong offset = 0; offset < size; offset += 16)
{
Vector128<byte> data = memoryManager.Read<Vector128<byte>>(srcGpuVa + offset, true);
memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
}
}
else
{
for (ulong offset = 0; offset < size; offset++)
{
byte data = memoryManager.Read<byte>(srcGpuVa + offset, true);
memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
}
}
}
/// <summary>
/// Calculates the GOB block linear address from a linear address.
/// </summary>
/// <param name="address">Linear address</param>
/// <returns>Block linear address</returns>
private static ulong ConvertGobLinearToBlockLinearAddress(ulong address)
{
// y2 y1 y0 x5 x4 x3 x2 x1 x0 -> x5 y2 y1 x4 y0 x3 x2 x1 x0
return (address & ~0x1f0UL) |
((address & 0x40) >> 2) |
((address & 0x10) << 1) |
((address & 0x180) >> 1) |
((address & 0x20) << 3);
}
/// <summary>
/// Performs a buffer to buffer, or buffer to texture copy, then optionally releases a semaphore.
/// </summary>
/// <param name="argument">Method call argument</param>
private void LaunchDma(int argument)
{
DmaCopy(argument);
ReleaseSemaphore(argument);
}
}
}