From 3dcaa84ba442ac173c8b5241049296d8fe8a3fd7 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 2 Mar 2020 18:54:08 -0300
Subject: [PATCH 01/14] shader/transform_feedback: Add host API friendly TFB
 builder

---
 CMakeModules/GenerateSCMRev.cmake            |   2 +
 src/common/CMakeLists.txt                    |   2 +
 src/video_core/CMakeLists.txt                |   2 +
 src/video_core/shader/transform_feedback.cpp | 114 +++++++++++++++++++
 src/video_core/shader/transform_feedback.h   |  22 ++++
 5 files changed, 142 insertions(+)
 create mode 100644 src/video_core/shader/transform_feedback.cpp
 create mode 100644 src/video_core/shader/transform_feedback.h

diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index 8c13a94fb..83e4e9df2 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -102,6 +102,8 @@ set(HASH_FILES
     "${VIDEO_CORE}/shader/shader_ir.cpp"
     "${VIDEO_CORE}/shader/shader_ir.h"
     "${VIDEO_CORE}/shader/track.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.h"
 )
 set(COMBINED "")
 foreach (F IN LISTS HASH_FILES)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 1f621fb1f..fbebed715 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -83,6 +83,8 @@ add_custom_command(OUTPUT scm_rev.cpp
       "${VIDEO_CORE}/shader/shader_ir.cpp"
       "${VIDEO_CORE}/shader/shader_ir.h"
       "${VIDEO_CORE}/shader/track.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.h"
       # and also check that the scm_rev files haven't changed
       "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.cpp.in"
       "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.h"
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 0101e5f0e..91df062d7 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -129,6 +129,8 @@ add_library(video_core STATIC
     shader/shader_ir.cpp
     shader/shader_ir.h
     shader/track.cpp
+    shader/transform_feedback.cpp
+    shader/transform_feedback.h
     surface.cpp
     surface.h
     texture_cache/format_lookup_table.cpp
diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp
new file mode 100644
index 000000000..db86c940f
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.cpp
@@ -0,0 +1,114 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <unordered_map>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/transform_feedback.h"
+
+namespace VideoCommon::Shader {
+
+namespace {
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20
+
+/// Attribute offsets that describe a vector
+constexpr std::array VECTORS = {
+    28,  // gl_Position
+    32,  // Generic 0
+    36,  // Generic 1
+    40,  // Generic 2
+    44,  // Generic 3
+    48,  // Generic 4
+    52,  // Generic 5
+    56,  // Generic 6
+    60,  // Generic 7
+    64,  // Generic 8
+    68,  // Generic 9
+    72,  // Generic 10
+    76,  // Generic 11
+    80,  // Generic 12
+    84,  // Generic 13
+    88,  // Generic 14
+    92,  // Generic 15
+    96,  // Generic 16
+    100, // Generic 17
+    104, // Generic 18
+    108, // Generic 19
+    112, // Generic 20
+    116, // Generic 21
+    120, // Generic 22
+    124, // Generic 23
+    128, // Generic 24
+    132, // Generic 25
+    136, // Generic 26
+    140, // Generic 27
+    144, // Generic 28
+    148, // Generic 29
+    152, // Generic 30
+    156, // Generic 31
+    160, // gl_FrontColor
+    164, // gl_FrontSecondaryColor
+    160, // gl_BackColor
+    164, // gl_BackSecondaryColor
+    192, // gl_TexCoord[0]
+    196, // gl_TexCoord[1]
+    200, // gl_TexCoord[2]
+    204, // gl_TexCoord[3]
+    208, // gl_TexCoord[4]
+    212, // gl_TexCoord[5]
+    216, // gl_TexCoord[6]
+    220, // gl_TexCoord[7]
+};
+} // namespace
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) {
+
+    std::unordered_map<u8, VaryingTFB> tfb;
+
+    for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) {
+        const auto& locations = info.tfb_varying_locs[buffer];
+        const auto& layout = info.tfb_layouts[buffer];
+        const std::size_t varying_count = layout.varying_count;
+
+        std::size_t highest = 0;
+
+        for (std::size_t offset = 0; offset < varying_count; ++offset) {
+            const std::size_t base_offset = offset;
+            const u8 location = locations[offset];
+
+            VaryingTFB varying;
+            varying.buffer = layout.stream;
+            varying.offset = offset * sizeof(u32);
+            varying.components = 1;
+
+            if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) {
+                UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB");
+
+                const u8 base_index = location / 4;
+                while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) {
+                    ++offset;
+                    ++varying.components;
+                }
+            }
+
+            [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second;
+            UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored");
+
+            highest = std::max(highest, (base_offset + varying.components) * sizeof(u32));
+        }
+
+        UNIMPLEMENTED_IF(highest != layout.stride);
+    }
+    return tfb;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h
new file mode 100644
index 000000000..8a8235019
--- /dev/null
+++ b/src/video_core/shader/transform_feedback.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "common/common_types.h"
+#include "video_core/shader/registry.h"
+
+namespace VideoCommon::Shader {
+
+struct VaryingTFB {
+    std::size_t buffer;
+    std::size_t offset;
+    std::size_t components;
+};
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info);
+
+} // namespace VideoCommon::Shader

From 4d711dface5dfc76d5ae0d62c635ec9ba6bd4293 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 2 Mar 2020 18:55:39 -0300
Subject: [PATCH 02/14] gl_shader_decompiler: Decorate output attributes with
 XFB layout

We sometimes have to slice attributes in different parts. This is needed
for example in instances where the game feedbacks 3 components but
writes 4 from the shader (something that is possible with
GL_NV_transform_feedback).
---
 .../renderer_opengl/gl_shader_decompiler.cpp  | 134 ++++++++++++++----
 1 file changed, 105 insertions(+), 29 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 19d6f3dcb..021edf1f6 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -23,6 +23,7 @@
 #include "video_core/shader/ast.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"
 
 namespace OpenGL {
 
@@ -36,6 +37,7 @@ using Tegra::Shader::IpaInterpMode;
 using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::Register;
+using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;
 
 using namespace std::string_literals;
@@ -49,6 +51,11 @@ class ExprDecompiler;
 
 enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat };
 
+constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"};
+
+constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr";
+constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr";
+
 struct TextureOffset {};
 struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
@@ -390,12 +397,19 @@ std::string FlowStackTopName(MetaStackClass stack) {
     return stage == ShaderType::Vertex;
 }
 
+struct GenericVaryingDescription {
+    std::string name;
+    u8 first_element = 0;
+    bool is_scalar = false;
+};
+
 class GLSLDecompiler final {
 public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
         : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {}
+          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()},
+          transform_feedback{BuildTransformFeedback(registry.GetGraphicsInfo())} {}
 
     void Decompile() {
         DeclareHeader();
@@ -403,17 +417,17 @@ public:
         DeclareGeometry();
         DeclareFragment();
         DeclareCompute();
-        DeclareRegisters();
-        DeclareCustomVariables();
-        DeclarePredicates();
-        DeclareLocalMemory();
-        DeclareInternalFlags();
         DeclareInputAttributes();
         DeclareOutputAttributes();
-        DeclareConstantBuffers();
-        DeclareGlobalMemory();
-        DeclareSamplers();
         DeclareImages();
+        DeclareSamplers();
+        DeclareGlobalMemory();
+        DeclareConstantBuffers();
+        DeclareLocalMemory();
+        DeclareRegisters();
+        DeclarePredicates();
+        DeclareInternalFlags();
+        DeclareCustomVariables();
         DeclarePhysicalAttributeReader();
 
         code.AddLine("void main() {{");
@@ -485,7 +499,7 @@ private:
         if (!identifier.empty()) {
             code.AddLine("// {}", identifier);
         }
-        code.AddLine("#version 430 core");
+        code.AddLine("#version 440 core");
         code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
         if (device.HasShaderBallot()) {
             code.AddLine("#extension GL_ARB_shader_ballot : require");
@@ -570,7 +584,13 @@ private:
         code.AddLine("out gl_PerVertex {{");
         ++code.scope;
 
-        code.AddLine("vec4 gl_Position;");
+        auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position);
+        if (!pos_xfb.empty()) {
+            pos_xfb = fmt::format("layout ({}) ", pos_xfb);
+        }
+        const char* pos_type =
+            FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1);
+        code.AddLine("{}{} gl_Position;", pos_xfb, pos_type);
 
         for (const auto attribute : ir.GetOutputAttributes()) {
             if (attribute == Attribute::Index::ClipDistances0123 ||
@@ -703,7 +723,7 @@ private:
     void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
         const u32 location{GetGenericAttributeIndex(index)};
 
-        std::string name{GetInputAttribute(index)};
+        std::string name{GetGenericInputAttribute(index)};
         if (stage == ShaderType::Geometry) {
             name = "gs_" + name + "[]";
         }
@@ -740,9 +760,58 @@ private:
         }
     }
 
+    std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(index) * 4 + element;
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+        return it->second.components;
+    }
+
+    std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(index) * 4 + element;
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+
+        const VaryingTFB& tfb = it->second;
+        return fmt::format("xfb_buffer = {}, xfb_offset = {}", tfb.buffer, tfb.offset);
+    }
+
     void DeclareOutputAttribute(Attribute::Index index) {
-        const u32 location{GetGenericAttributeIndex(index)};
-        code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
+        static constexpr std::string_view swizzle = "xyzw";
+        u8 element = 0;
+        while (element < 4) {
+            auto xfb = GetTransformFeedbackDecoration(index, element);
+            if (!xfb.empty()) {
+                xfb = fmt::format(", {}", xfb);
+            }
+            const std::size_t remainder = 4 - element;
+            const std::size_t num_components = GetNumComponents(index, element).value_or(remainder);
+            const char* const type = FLOAT_TYPES.at(num_components - 1);
+
+            const u32 location = GetGenericAttributeIndex(index);
+
+            GenericVaryingDescription description;
+            description.first_element = static_cast<u8>(element);
+            description.is_scalar = num_components == 1;
+            description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME);
+            if (element != 0 || num_components != 4) {
+                const std::string_view name_swizzle = swizzle.substr(element, num_components);
+                description.name = fmt::format("{}_{}", description.name, name_swizzle);
+            }
+            for (std::size_t i = 0; i < num_components; ++i) {
+                const u8 offset = static_cast<u8>(location * 4 + element + i);
+                varying_description.insert({offset, description});
+            }
+
+            code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element,
+                         xfb, type, description.name);
+
+            element += static_cast<u8>(num_components);
+        }
     }
 
     void DeclareConstantBuffers() {
@@ -1095,7 +1164,7 @@ private:
             return {"0", Type::Int};
         default:
             if (IsGenericAttribute(attribute)) {
-                return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element),
+                return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element),
                         Type::Float};
             }
             break;
@@ -1164,8 +1233,7 @@ private:
             return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
         default:
             if (IsGenericAttribute(attribute)) {
-                return {
-                    {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
+                return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}};
             }
             UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
             return {};
@@ -2376,27 +2444,34 @@ private:
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
     std::string GetRegister(u32 index) const {
-        return GetDeclarationWithSuffix(index, "gpr");
+        return AppendSuffix(index, "gpr");
     }
 
     std::string GetCustomVariable(u32 index) const {
-        return GetDeclarationWithSuffix(index, "custom_var");
+        return AppendSuffix(index, "custom_var");
     }
 
     std::string GetPredicate(Tegra::Shader::Pred pred) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred");
+        return AppendSuffix(static_cast<u32>(pred), "pred");
     }
 
-    std::string GetInputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr");
+    std::string GetGenericInputAttribute(Attribute::Index attribute) const {
+        return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME);
     }
 
-    std::string GetOutputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr");
+    std::unordered_map<u8, GenericVaryingDescription> varying_description;
+
+    std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const {
+        const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element);
+        const auto& description = varying_description.at(offset);
+        if (description.is_scalar) {
+            return description.name;
+        }
+        return fmt::format("{}[{}]", description.name, element - description.first_element);
     }
 
     std::string GetConstBuffer(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf");
+        return AppendSuffix(index, "cbuf");
     }
 
     std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const {
@@ -2409,7 +2484,7 @@ private:
     }
 
     std::string GetConstBufferBlock(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf_block");
+        return AppendSuffix(index, "cbuf_block");
     }
 
     std::string GetLocalMemory() const {
@@ -2434,14 +2509,14 @@ private:
     }
 
     std::string GetSampler(const Sampler& sampler) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
+        return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
     }
 
     std::string GetImage(const Image& image) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image");
+        return AppendSuffix(static_cast<u32>(image.GetIndex()), "image");
     }
 
-    std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const {
+    std::string AppendSuffix(u32 index, std::string_view name) const {
         if (suffix.empty()) {
             return fmt::format("{}{}", name, index);
         } else {
@@ -2477,6 +2552,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
+    const std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
 

From 8e9f23f393763a6d76605206eeb20f6f8885d9a9 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 2 Mar 2020 19:31:26 -0300
Subject: [PATCH 03/14] gl_rasterizer: Implement transform feedback bindings

---
 src/video_core/engines/maxwell_3d.h           |  9 +++
 .../renderer_opengl/gl_rasterizer.cpp         | 70 +++++++++++++++++--
 .../renderer_opengl/gl_rasterizer.h           | 14 +++-
 3 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 8752a1cfb..ba9c76593 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -634,6 +634,11 @@ public:
             u32 address_low;
             s32 buffer_size;
             s32 buffer_offset;
+
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                             address_low);
+            }
         };
         static_assert(sizeof(TransformFeedbackBinding) == 32);
 
@@ -652,6 +657,10 @@ public:
             return shader_config[index].enable != 0;
         }
 
+        bool IsShaderConfigEnabled(Regs::ShaderProgram type) const {
+            return IsShaderConfigEnabled(static_cast<std::size_t>(type));
+        }
+
         union {
             struct {
                 INSERT_UNION_PADDING_WORDS(0x45);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 8a2db8e36..1af4268a4 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -496,7 +496,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
     SyncCullMode();
     SyncPrimitiveRestart();
     SyncScissorTest();
-    SyncTransformFeedback();
     SyncPointState();
     SyncPolygonOffset();
     SyncAlphaTest();
@@ -569,7 +568,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
         glTextureBarrier();
     }
 
-    ++num_queued_commands;
+    BeginTransformFeedback(primitive_mode);
 
     const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
     const GLsizei num_instances =
@@ -608,6 +607,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                                               num_instances, base_instance);
         }
     }
+
+    EndTransformFeedback();
+
+    ++num_queued_commands;
 }
 
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
@@ -1290,11 +1293,6 @@ void RasterizerOpenGL::SyncScissorTest() {
     }
 }
 
-void RasterizerOpenGL::SyncTransformFeedback() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-    UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented");
-}
-
 void RasterizerOpenGL::SyncPointState() {
     auto& gpu = system.GPU().Maxwell3D();
     auto& flags = gpu.dirty.flags;
@@ -1370,4 +1368,62 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
     oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }
 
+void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            if (enabled_transform_feedback_buffers[index]) {
+                glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
+                                  0);
+            }
+            enabled_transform_feedback_buffers[index] = false;
+            continue;
+        }
+        enabled_transform_feedback_buffers[index] = true;
+
+        auto& tfb_buffer = transform_feedback_buffers[index];
+        tfb_buffer.Create();
+
+        const GLuint handle = tfb_buffer.handle;
+        const std::size_t size = binding.buffer_size;
+        glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
+                          static_cast<GLsizeiptr>(size));
+    }
+
+    glBeginTransformFeedback(GL_POINTS);
+}
+
+void RasterizerOpenGL::EndTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    glEndTransformFeedback();
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            continue;
+        }
+        UNIMPLEMENTED_IF(binding.buffer_offset != 0);
+
+        const GLuint handle = transform_feedback_buffers[index].handle;
+        const GPUVAddr gpu_addr = binding.Address();
+        const std::size_t size = binding.buffer_size;
+        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+    }
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index e6424f5d2..2d3be2437 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -168,9 +168,6 @@ private:
     /// Syncs the scissor test state to match the guest state
     void SyncScissorTest();
 
-    /// Syncs the transform feedback state to match the guest state
-    void SyncTransformFeedback();
-
     /// Syncs the point state to match the guest state
     void SyncPointState();
 
@@ -192,6 +189,12 @@ private:
     /// Syncs the framebuffer sRGB state to match the guest state
     void SyncFramebufferSRGB();
 
+    /// Begin a transform feedback
+    void BeginTransformFeedback(GLenum primitive_mode);
+
+    /// End a transform feedback
+    void EndTransformFeedback();
+
     /// Check for extension that are not strictly required but are needed for correct emulation
     void CheckExtensions();
 
@@ -229,6 +232,11 @@ private:
     BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
     BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
 
+    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        transform_feedback_buffers;
+    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        enabled_transform_feedback_buffers;
+
     /// Number of commands queued to the OpenGL driver. Reseted on flush.
     std::size_t num_queued_commands = 0;
 

From 7acebd7eb67032e51dd2985d847e40c216944f92 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 2 Mar 2020 21:36:25 -0300
Subject: [PATCH 04/14] vk_shader_decompiler: Use registry for specialization

---
 .../renderer_vulkan/vk_pipeline_cache.cpp     | 17 ++++------
 .../renderer_vulkan/vk_pipeline_cache.h       |  4 +++
 .../renderer_vulkan/vk_shader_decompiler.cpp  | 34 ++++++++++++-------
 .../renderer_vulkan/vk_shader_decompiler.h    | 13 +++----
 4 files changed, 37 insertions(+), 31 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index ebf85f311..056ef495c 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -273,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
     specialization.workgroup_size = key.workgroup_size;
     specialization.shared_memory_size = key.shared_memory_size;
 
-    const SPIRVShader spirv_shader{
-        Decompile(device, shader->GetIR(), ShaderType::Compute, specialization),
-        shader->GetEntries()};
+    const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute,
+                                             shader->GetRegistry(), specialization),
+                                   shader->GetEntries()};
     entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool,
                                                 update_descriptor_queue, spirv_shader);
     return *entry;
@@ -324,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
     const auto& gpu = system.GPU().Maxwell3D();
 
     Specialization specialization;
-    specialization.primitive_topology = fixed_state.input_assembly.topology;
-    if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) {
+    if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) {
         ASSERT(fixed_state.input_assembly.point_size != 0.0f);
         specialization.point_size = fixed_state.input_assembly.point_size;
     }
@@ -333,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type;
     }
     specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
-    specialization.tessellation.primitive = fixed_state.tessellation.primitive;
-    specialization.tessellation.spacing = fixed_state.tessellation.spacing;
-    specialization.tessellation.clockwise = fixed_state.tessellation.clockwise;
 
     SPIRVProgram program;
     std::vector<vk::DescriptorSetLayoutBinding> bindings;
@@ -356,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
         const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
         const auto program_type = GetShaderType(program_enum);
         const auto& entries = shader->GetEntries();
-        program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization),
-                          entries};
+        program[stage] = {
+            Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
+            entries};
 
         if (program_enum == Maxwell::ShaderProgram::VertexA) {
             // VertexB was combined with VertexA, so we skip the VertexB iteration
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index e292526bb..21340c9a4 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -132,6 +132,10 @@ public:
         return shader_ir;
     }
 
+    const VideoCommon::Shader::Registry& GetRegistry() const {
+        return registry;
+    }
+
     const VideoCommon::Shader::ShaderIR& GetIR() const {
         return shader_ir;
     }
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index cfcca5af0..699a538d6 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -24,6 +24,7 @@
 #include "video_core/renderer_vulkan/vk_shader_decompiler.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"
 
 namespace Vulkan {
 
@@ -266,9 +267,10 @@ bool IsPrecise(Operation operand) {
 class SPIRVDecompiler final : public Sirit::Module {
 public:
     explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage,
-                             const Specialization& specialization)
+                             const Registry& registry, const Specialization& specialization)
         : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()},
-          specialization{specialization} {
+          registry{registry}, specialization{specialization},
+          transform_feedback{BuildTransformFeedback(registry.GetGraphicsInfo())} {
         AddCapability(spv::Capability::Shader);
         AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess);
         AddCapability(spv::Capability::ImageQuery);
@@ -318,25 +320,29 @@ public:
             AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
                              header.common2.threads_per_input_primitive);
             break;
-        case ShaderType::TesselationEval:
+        case ShaderType::TesselationEval: {
+            const auto& info = registry.GetGraphicsInfo();
             AddCapability(spv::Capability::Tessellation);
             AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces);
-            AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive));
-            AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing));
-            AddExecutionMode(main, specialization.tessellation.clockwise
+            AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive));
+            AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing));
+            AddExecutionMode(main, info.tessellation_clockwise
                                        ? spv::ExecutionMode::VertexOrderCw
                                        : spv::ExecutionMode::VertexOrderCcw);
             break;
-        case ShaderType::Geometry:
+        }
+        case ShaderType::Geometry: {
+            const auto& info = registry.GetGraphicsInfo();
             AddCapability(spv::Capability::Geometry);
             AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces);
-            AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology));
+            AddExecutionMode(main, GetExecutionMode(info.primitive_topology));
             AddExecutionMode(main, GetExecutionMode(header.common3.output_topology));
             AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
                              header.common4.max_output_vertices);
             // TODO(Rodrigo): Where can we get this info from?
             AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U);
             break;
+        }
         case ShaderType::Fragment:
             AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces);
             AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft);
@@ -545,7 +551,8 @@ private:
         if (stage != ShaderType::Geometry) {
             return;
         }
-        const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology);
+        const auto& info = registry.GetGraphicsInfo();
+        const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology);
         DeclareInputVertexArray(num_input);
         DeclareOutputVertex();
     }
@@ -898,7 +905,7 @@ private:
     u32 GetNumInputVertices() const {
         switch (stage) {
         case ShaderType::Geometry:
-            return GetNumPrimitiveTopologyVertices(specialization.primitive_topology);
+            return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology);
         case ShaderType::TesselationControl:
         case ShaderType::TesselationEval:
             return NumInputPatches;
@@ -2495,7 +2502,9 @@ private:
     const ShaderIR& ir;
     const ShaderType stage;
     const Tegra::Shader::Header header;
+    const Registry& registry;
     const Specialization& specialization;
+    const std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     const Id t_void = Name(TypeVoid(), "void");
 
@@ -2870,8 +2879,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
 }
 
 std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
-                           ShaderType stage, const Specialization& specialization) {
-    return SPIRVDecompiler(device, ir, stage, specialization).Assemble();
+                           ShaderType stage, const VideoCommon::Shader::Registry& registry,
+                           const Specialization& specialization) {
+    return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble();
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
index f5dc14d9e..ffea4709e 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -15,6 +15,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 
 namespace Vulkan {
@@ -91,17 +92,9 @@ struct Specialization final {
     u32 shared_memory_size{};
 
     // Graphics specific
-    Maxwell::PrimitiveTopology primitive_topology{};
     std::optional<float> point_size{};
     std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
     bool ndc_minus_one_to_one{};
-
-    // Tessellation specific
-    struct {
-        Maxwell::TessellationPrimitive primitive{};
-        Maxwell::TessellationSpacing spacing{};
-        bool clockwise{};
-    } tessellation;
 };
 // Old gcc versions don't consider this trivially copyable.
 // static_assert(std::is_trivially_copyable_v<Specialization>);
@@ -114,6 +107,8 @@ struct SPIRVShader {
 ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir);
 
 std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
-                           Tegra::Engines::ShaderType stage, const Specialization& specialization);
+                           Tegra::Engines::ShaderType stage,
+                           const VideoCommon::Shader::Registry& registry,
+                           const Specialization& specialization);
 
 } // namespace Vulkan

From ae6189d7c2d5b5bf7daa4cc5a3ec34805cec7b7e Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 6 Mar 2020 05:03:13 -0300
Subject: [PATCH 05/14] shader/transform_feedback: Expose buffer stride

---
 src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 3 ++-
 src/video_core/shader/transform_feedback.cpp            | 1 +
 src/video_core/shader/transform_feedback.h              | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 021edf1f6..175145cc1 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -777,7 +777,8 @@ private:
         }
 
         const VaryingTFB& tfb = it->second;
-        return fmt::format("xfb_buffer = {}, xfb_offset = {}", tfb.buffer, tfb.offset);
+        return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer,
+                           tfb.offset, tfb.stride);
     }
 
     void DeclareOutputAttribute(Attribute::Index index) {
diff --git a/src/video_core/shader/transform_feedback.cpp b/src/video_core/shader/transform_feedback.cpp
index db86c940f..22a933761 100644
--- a/src/video_core/shader/transform_feedback.cpp
+++ b/src/video_core/shader/transform_feedback.cpp
@@ -87,6 +87,7 @@ std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& in
 
             VaryingTFB varying;
             varying.buffer = layout.stream;
+            varying.stride = layout.stride;
             varying.offset = offset * sizeof(u32);
             varying.components = 1;
 
diff --git a/src/video_core/shader/transform_feedback.h b/src/video_core/shader/transform_feedback.h
index 8a8235019..77d05f64c 100644
--- a/src/video_core/shader/transform_feedback.h
+++ b/src/video_core/shader/transform_feedback.h
@@ -13,6 +13,7 @@ namespace VideoCommon::Shader {
 
 struct VaryingTFB {
     std::size_t buffer;
+    std::size_t stride;
     std::size_t offset;
     std::size_t components;
 };

From c320702092c81c8acc9b5ff0c18f0613a5f64f04 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 6 Mar 2020 05:06:02 -0300
Subject: [PATCH 06/14] vk_device: Shrink formatless capability name size

---
 src/video_core/renderer_vulkan/vk_device.cpp  |  6 +--
 src/video_core/renderer_vulkan/vk_device.h    | 39 +++++++++----------
 .../renderer_vulkan/vk_shader_decompiler.cpp  |  4 +-
 3 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index 886bde3b9..ddcdb05e6 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
     features.occlusionQueryPrecise = true;
     features.fragmentStoresAndAtomics = true;
     features.shaderImageGatherExtended = true;
-    features.shaderStorageImageReadWithoutFormat =
-        is_shader_storage_img_read_without_format_supported;
+    features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported;
     features.shaderStorageImageWriteWithoutFormat = true;
     features.textureCompressionASTC_LDR = is_optimal_astc_supported;
 
@@ -467,8 +466,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK
 
 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
     const auto supported_features{physical.getFeatures(dldi)};
-    is_shader_storage_img_read_without_format_supported =
-        supported_features.shaderStorageImageReadWithoutFormat;
+    is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat;
     is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 2c27ad730..8c4ccfefd 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,11 +122,6 @@ public:
         return properties.limits.maxPushConstantsSize;
     }
 
-    /// Returns true if Shader storage Image Read Without Format supported.
-    bool IsShaderStorageImageReadWithoutFormatSupported() const {
-        return is_shader_storage_img_read_without_format_supported;
-    }
-
     /// Returns true if ASTC is natively supported.
     bool IsOptimalAstcSupported() const {
         return is_optimal_astc_supported;
@@ -147,6 +142,11 @@ public:
         return (guest_warp_stages & stage) != vk::ShaderStageFlags{};
     }
 
+    /// Returns true if formatless image load is supported.
+    bool IsFormatlessImageLoadSupported() const {
+        return is_formatless_image_load_supported;
+    }
+
     /// Returns true if the device supports VK_EXT_scalar_block_layout.
     bool IsKhrUniformBufferStandardLayoutSupported() const {
         return khr_uniform_buffer_standard_layout;
@@ -214,26 +214,25 @@ private:
     static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties(
         const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical);
 
-    const vk::PhysicalDevice physical;         ///< Physical device.
-    vk::DispatchLoaderDynamic dld;             ///< Device function pointers.
-    vk::PhysicalDeviceProperties properties;   ///< Device properties.
-    UniqueDevice logical;                      ///< Logical device.
-    vk::Queue graphics_queue;                  ///< Main graphics queue.
-    vk::Queue present_queue;                   ///< Main present queue.
-    u32 graphics_family{};                     ///< Main graphics queue family index.
-    u32 present_family{};                      ///< Main present queue family index.
-    vk::DriverIdKHR driver_id{};               ///< Driver ID.
-    vk::ShaderStageFlags guest_warp_stages{};  ///< Stages where the guest warp size can be forced.
-    bool is_optimal_astc_supported{};          ///< Support for native ASTC.
-    bool is_float16_supported{};               ///< Support for float16 arithmetics.
-    bool is_warp_potentially_bigger{};         ///< Host warp size can be bigger than guest.
+    const vk::PhysicalDevice physical;        ///< Physical device.
+    vk::DispatchLoaderDynamic dld;            ///< Device function pointers.
+    vk::PhysicalDeviceProperties properties;  ///< Device properties.
+    UniqueDevice logical;                     ///< Logical device.
+    vk::Queue graphics_queue;                 ///< Main graphics queue.
+    vk::Queue present_queue;                  ///< Main present queue.
+    u32 graphics_family{};                    ///< Main graphics queue family index.
+    u32 present_family{};                     ///< Main present queue family index.
+    vk::DriverIdKHR driver_id{};              ///< Driver ID.
+    vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.ed
+    bool is_optimal_astc_supported{};         ///< Support for native ASTC.
+    bool is_float16_supported{};              ///< Support for float16 arithmetics.
+    bool is_warp_potentially_bigger{};        ///< Host warp size can be bigger than guest.
+    bool is_formatless_image_load_supported{}; ///< Support for shader image read without format.
     bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs.
     bool ext_index_type_uint8{};               ///< Support for VK_EXT_index_type_uint8.
     bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
     bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
     bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
-    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
-                                                                ///< image read without format
 
     // Telemetry parameters
     std::string vendor_name;                      ///< Device's driver name.
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 699a538d6..802fe8747 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -298,7 +298,7 @@ public:
             }
         }
 
-        if (device.IsShaderStorageImageReadWithoutFormatSupported()) {
+        if (device.IsFormatlessImageLoadSupported()) {
             AddCapability(spv::Capability::StorageImageReadWithoutFormat);
         }
 
@@ -1800,7 +1800,7 @@ private:
     }
 
     Expression ImageLoad(Operation operation) {
-        if (!device.IsShaderStorageImageReadWithoutFormatSupported()) {
+        if (!device.IsFormatlessImageLoadSupported()) {
             return {v_float_zero, Type::Float};
         }
 

From 8d5bdcb17b732124e478f067bd449f76a46c547a Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 6 Mar 2020 05:09:06 -0300
Subject: [PATCH 07/14] vk_device: Enable VK_EXT_transform_feedback when
 available

---
 src/video_core/renderer_vulkan/vk_device.cpp | 41 ++++++++++++++++----
 src/video_core/renderer_vulkan/vk_device.h   |  6 +++
 2 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp
index ddcdb05e6..3847bd722 100644
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -147,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
         LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes");
     }
 
+    vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback;
+    if (ext_transform_feedback) {
+        transform_feedback.transformFeedback = true;
+        transform_feedback.geometryStreams = true;
+        SetNext(next, transform_feedback);
+    } else {
+        LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks");
+    }
+
     if (!ext_depth_range_unrestricted) {
         LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
     }
@@ -384,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
         }
     };
 
-    extensions.reserve(14);
+    extensions.reserve(15);
     extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
     extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
     extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -396,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
 
     [[maybe_unused]] const bool nsight =
         std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
-    bool khr_shader_float16_int8{};
-    bool ext_subgroup_size_control{};
+    bool has_khr_shader_float16_int8{};
+    bool has_ext_subgroup_size_control{};
+    bool has_ext_transform_feedback{};
     for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
         Test(extension, khr_uniform_buffer_standard_layout,
              VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true);
-        Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false);
+        Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
+             false);
         Test(extension, ext_depth_range_unrestricted,
              VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
         Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
         Test(extension, ext_shader_viewport_index_layer,
              VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true);
-        Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+        Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+             false);
+        Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME,
              false);
         if (Settings::values.renderer_debug) {
             Test(extension, nv_device_diagnostic_checkpoints,
@@ -415,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
         }
     }
 
-    if (khr_shader_float16_int8) {
+    if (has_khr_shader_float16_int8) {
         is_float16_supported =
             GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16;
         extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
     }
 
-    if (ext_subgroup_size_control) {
+    if (has_ext_subgroup_size_control) {
         const auto features =
             GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi);
         const auto properties =
@@ -438,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
         is_warp_potentially_bigger = true;
     }
 
+    if (has_ext_transform_feedback) {
+        const auto features =
+            GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi);
+        const auto properties =
+            GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi);
+
+        if (features.transformFeedback && features.geometryStreams &&
+            properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers &&
+            properties.transformFeedbackQueries && properties.transformFeedbackDraw) {
+            extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME);
+            ext_transform_feedback = true;
+        }
+    }
+
     return extensions;
 }
 
diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h
index 8c4ccfefd..6e656517f 100644
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -167,6 +167,11 @@ public:
         return ext_shader_viewport_index_layer;
     }
 
+    /// Returns true if the device supports VK_EXT_transform_feedback.
+    bool IsExtTransformFeedbackSupported() const {
+        return ext_transform_feedback;
+    }
+
     /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints.
     bool IsNvDeviceDiagnosticCheckpoints() const {
         return nv_device_diagnostic_checkpoints;
@@ -232,6 +237,7 @@ private:
     bool ext_index_type_uint8{};               ///< Support for VK_EXT_index_type_uint8.
     bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
     bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
+    bool ext_transform_feedback{};             ///< Support for VK_EXT_transform_feedback.
     bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
 
     // Telemetry parameters

From b67360c0f86c8acc1e56547382f07f35039fbcbf Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 6 Mar 2020 05:10:39 -0300
Subject: [PATCH 08/14] vk_shader_decompiler: Add XFB decorations to generic
 varyings

---
 .../renderer_vulkan/vk_shader_decompiler.cpp  | 105 +++++++++++++++---
 1 file changed, 89 insertions(+), 16 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 802fe8747..3117a8d74 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -5,7 +5,9 @@
 #include <functional>
 #include <limits>
 #include <map>
+#include <optional>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>
 
 #include <fmt/format.h>
@@ -94,6 +96,12 @@ struct VertexIndices {
     std::optional<u32> clip_distances;
 };
 
+struct GenericVaryingDescription {
+    Id id = nullptr;
+    u32 first_element = 0;
+    bool is_scalar = false;
+};
+
 spv::Dim GetSamplerDim(const Sampler& sampler) {
     ASSERT(!sampler.IsBuffer());
     switch (sampler.GetType()) {
@@ -288,6 +296,15 @@ public:
         AddExtension("SPV_KHR_variable_pointers");
         AddExtension("SPV_KHR_shader_draw_parameters");
 
+        if (!transform_feedback.empty()) {
+            if (device.IsExtTransformFeedbackSupported()) {
+                AddCapability(spv::Capability::TransformFeedback);
+            } else {
+                LOG_ERROR(Render_Vulkan, "Shader requires transform feedbacks but these are not "
+                                         "supported on this device");
+            }
+        }
+
         if (ir.UsesLayer() || ir.UsesViewportIndex()) {
             if (ir.UsesViewportIndex()) {
                 AddCapability(spv::Capability::MultiViewport);
@@ -406,7 +423,7 @@ private:
             // Clear Position to avoid reading trash on the Z conversion.
             const auto position_index = out_indices.position.value();
             const Id position = AccessElement(t_out_float4, out_vertex, position_index);
-            OpStore(position, v_varying_default);
+            OpStore(position, ConstantNull(t_float4));
 
             if (specialization.point_size) {
                 const u32 point_size_index = out_indices.point_size.value();
@@ -749,13 +766,35 @@ private:
     }
 
     void DeclareOutputAttributes() {
+        if (stage == ShaderType::Compute || stage == ShaderType::Fragment) {
+            return;
+        }
+
+        UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex);
         for (const auto index : ir.GetOutputAttributes()) {
             if (!IsGenericAttribute(index)) {
                 continue;
             }
-            const u32 location = GetGenericAttributeLocation(index);
-            Id type = t_float4;
-            Id varying_default = v_varying_default;
+            DeclareOutputAttribute(index);
+        }
+    }
+
+    void DeclareOutputAttribute(Attribute::Index index) {
+        static constexpr std::string_view swizzle = "xyzw";
+
+        const u32 location = GetGenericAttributeLocation(index);
+        u8 element = 0;
+        while (element < 4) {
+            const std::size_t remainder = 4 - element;
+
+            std::size_t num_components = remainder;
+            const std::optional tfb = GetTransformFeedbackInfo(index, element);
+            if (tfb) {
+                num_components = tfb->components;
+            }
+
+            Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1);
+            Id varying_default = ConstantNull(type);
             if (IsOutputAttributeArray()) {
                 const u32 num = GetNumOutputVertices();
                 type = TypeArray(type, Constant(t_uint, num));
@@ -767,15 +806,47 @@ private:
             }
             type = TypePointer(spv::StorageClass::Output, type);
 
+            std::string name = fmt::format("out_attr{}", location);
+            if (num_components < 4 || element > 0) {
+                name = fmt::format("{}_{}", name, swizzle.substr(element, num_components));
+            }
+
             const Id id = OpVariable(type, spv::StorageClass::Output, varying_default);
-            Name(AddGlobalVariable(id), fmt::format("out_attr{}", location));
-            output_attributes.emplace(index, id);
+            Name(AddGlobalVariable(id), name);
+
+            GenericVaryingDescription description;
+            description.id = id;
+            description.first_element = element;
+            description.is_scalar = num_components == 1;
+            for (u32 i = 0; i < num_components; ++i) {
+                const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i);
+                output_attributes.emplace(offset, description);
+            }
             interfaces.push_back(id);
 
             Decorate(id, spv::Decoration::Location, location);
+            if (element > 0) {
+                Decorate(id, spv::Decoration::Component, static_cast<u32>(element));
+            }
+            if (tfb && device.IsExtTransformFeedbackSupported()) {
+                Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer));
+                Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride));
+                Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset));
+            }
+
+            element += static_cast<u8>(num_components);
         }
     }
 
+    std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) {
+        const u8 location = static_cast<u8>(index) * 4 + element;
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+        return it->second;
+    }
+
     u32 DeclareConstantBuffers(u32 binding) {
         for (const auto& [index, size] : ir.GetConstantBuffers()) {
             const Id type = device.IsKhrUniformBufferStandardLayoutSupported() ? t_cbuf_scalar_ubo
@@ -1353,8 +1424,14 @@ private:
                 }
                 default:
                     if (IsGenericAttribute(attribute)) {
-                        const Id composite = output_attributes.at(attribute);
-                        return {ArrayPass(t_out_float, composite, {element}), Type::Float};
+                        const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element);
+                        const GenericVaryingDescription description = output_attributes.at(offset);
+                        const Id composite = description.id;
+                        std::vector<u32> indices;
+                        if (!description.is_scalar) {
+                            indices.push_back(element - description.first_element);
+                        }
+                        return {ArrayPass(t_out_float, composite, indices), Type::Float};
                     }
                     UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
                                       static_cast<u32>(attribute));
@@ -2265,11 +2342,11 @@ private:
     std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const {
         switch (type) {
         case Type::Float:
-            return {nullptr, t_float2, t_float3, t_float4};
+            return {t_float, t_float2, t_float3, t_float4};
         case Type::Int:
-            return {nullptr, t_int2, t_int3, t_int4};
+            return {t_int, t_int2, t_int3, t_int4};
         case Type::Uint:
-            return {nullptr, t_uint2, t_uint3, t_uint4};
+            return {t_uint, t_uint2, t_uint3, t_uint4};
         default:
             UNIMPLEMENTED();
             return {};
@@ -2573,10 +2650,6 @@ private:
     const Id v_float_zero = Constant(t_float, 0.0f);
     const Id v_float_one = Constant(t_float, 1.0f);
 
-    // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
-    const Id v_varying_default =
-        ConstantComposite(t_float4, v_float_zero, v_float_zero, v_float_zero, v_float_one);
-
     const Id v_true = ConstantTrue(t_bool);
     const Id v_false = ConstantFalse(t_bool);
 
@@ -2593,7 +2666,7 @@ private:
     Id shared_memory{};
     std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{};
     std::map<Attribute::Index, Id> input_attributes;
-    std::map<Attribute::Index, Id> output_attributes;
+    std::unordered_map<u8, GenericVaryingDescription> output_attributes;
     std::map<u32, Id> constant_buffers;
     std::map<GlobalMemoryBase, Id> global_buffers;
     std::map<u32, TexelBuffer> texel_buffers;

From 2fae1e6205c16903625c993c8d46a6c0905a6d41 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Fri, 6 Mar 2020 05:11:18 -0300
Subject: [PATCH 09/14] vk_rasterizer: Implement transform feedback binding
 zero

---
 .../renderer_vulkan/vk_rasterizer.cpp         | 42 +++++++++++++++++++
 .../renderer_vulkan/vk_rasterizer.h           |  4 ++
 2 files changed, 46 insertions(+)

diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 2bcb17b56..f889019c1 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -347,6 +347,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
             [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); });
     }
 
+    BeginTransformFeedback();
+
     const auto pipeline_layout = pipeline.GetLayout();
     const auto descriptor_set = pipeline.CommitDescriptorSet();
     scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) {
@@ -356,6 +358,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
         }
         draw_params.Draw(cmdbuf, dld);
     });
+
+    EndTransformFeedback();
 }
 
 void RasterizerVulkan::Clear() {
@@ -738,6 +742,44 @@ void RasterizerVulkan::UpdateDynamicStates() {
     UpdateStencilFaces(regs);
 }
 
+void RasterizerVulkan::BeginTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
+
+    UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable);
+    UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
+    UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
+
+    const auto& binding = regs.tfb_bindings[0];
+    UNIMPLEMENTED_IF(binding.buffer_enable == 0);
+    UNIMPLEMENTED_IF(binding.buffer_offset != 0);
+
+    const GPUVAddr gpu_addr = binding.Address();
+    const std::size_t size = binding.buffer_size;
+    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+
+    scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) {
+        cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld);
+        cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld);
+    });
+}
+
+void RasterizerVulkan::EndTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    scheduler.Record(
+        [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); });
+}
+
 void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
                                          BufferBindings& buffer_bindings) {
     const auto& regs = system.GPU().Maxwell3D().regs;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index 96ea05f0a..b2e73d98d 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -169,6 +169,10 @@ private:
 
     void UpdateDynamicStates();
 
+    void BeginTransformFeedback();
+
+    void EndTransformFeedback();
+
     bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment);
 
     void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,

From 47459f6a36f5bd539b6aedf9b07ffbebb0a2be32 Mon Sep 17 00:00:00 2001
From: Rodrigo Locatti <reinuseslisp@airmail.cc>
Date: Mon, 9 Mar 2020 20:08:48 -0300
Subject: [PATCH 10/14] vk_shader_decompiler: Fix implicit type conversion

Co-Authored-By: Mat M. <mathew1800@gmail.com>
---
 src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 3117a8d74..ed78bba02 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -839,7 +839,7 @@ private:
     }
 
     std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) {
-        const u8 location = static_cast<u8>(index) * 4 + element;
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
         const auto it = transform_feedback.find(location);
         if (it == transform_feedback.end()) {
             return {};

From 4bc4851d457c6f14feca665d4d729b9df444ec05 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Mon, 9 Mar 2020 20:46:16 -0300
Subject: [PATCH 11/14] gl_shader_decompiler: Fix implicit conversion errors

---
 src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 175145cc1..973d3fd11 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -761,7 +761,7 @@ private:
     }
 
     std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const {
-        const u8 location = static_cast<u8>(index) * 4 + element;
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
         const auto it = transform_feedback.find(location);
         if (it == transform_feedback.end()) {
             return {};
@@ -770,7 +770,7 @@ private:
     }
 
     std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const {
-        const u8 location = static_cast<u8>(index) * 4 + element;
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
         const auto it = transform_feedback.find(location);
         if (it == transform_feedback.end()) {
             return {};
@@ -811,7 +811,7 @@ private:
             code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element,
                          xfb, type, description.name);
 
-            element += static_cast<u8>(num_components);
+            element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
         }
     }
 

From afebdda2031ca0a39f2816cf56388c04c63ed336 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Wed, 11 Mar 2020 01:08:28 -0300
Subject: [PATCH 12/14] maxwell_3d: Add padding words to XFB entries

Use INSERT_UNION_PADDING_WORDS instead of alignas to ensure a size
requirement.
---
 src/video_core/engines/maxwell_3d.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index ba9c76593..8a9e9992e 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -628,12 +628,13 @@ public:
             float depth_range_far;
         };
 
-        struct alignas(32) TransformFeedbackBinding {
+        struct TransformFeedbackBinding {
             u32 buffer_enable;
             u32 address_high;
             u32 address_low;
             s32 buffer_size;
             s32 buffer_offset;
+            INSERT_UNION_PADDING_WORDS(3);
 
             GPUVAddr Address() const {
                 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
@@ -642,10 +643,11 @@ public:
         };
         static_assert(sizeof(TransformFeedbackBinding) == 32);
 
-        struct alignas(16) TransformFeedbackLayout {
+        struct TransformFeedbackLayout {
             u32 stream;
             u32 varying_count;
             u32 stride;
+            INSERT_UNION_PADDING_WORDS(1);
         };
         static_assert(sizeof(TransformFeedbackLayout) == 16);
 

From 62560f1e6356747e7a2723eab12528e657a76a4f Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 12 Mar 2020 02:35:31 -0300
Subject: [PATCH 13/14] vk_shader_decompiler: Fix default varying regression

---
 src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index ed78bba02..7d51bf9af 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -423,7 +423,7 @@ private:
             // Clear Position to avoid reading trash on the Z conversion.
             const auto position_index = out_indices.position.value();
             const Id position = AccessElement(t_out_float4, out_vertex, position_index);
-            OpStore(position, ConstantNull(t_float4));
+            OpStore(position, v_varying_default);
 
             if (specialization.point_size) {
                 const u32 point_size_index = out_indices.point_size.value();
@@ -794,7 +794,7 @@ private:
             }
 
             Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1);
-            Id varying_default = ConstantNull(type);
+            Id varying_default = v_varying_default;
             if (IsOutputAttributeArray()) {
                 const u32 num = GetNumOutputVertices();
                 type = TypeArray(type, Constant(t_uint, num));
@@ -2650,6 +2650,10 @@ private:
     const Id v_float_zero = Constant(t_float, 0.0f);
     const Id v_float_one = Constant(t_float, 1.0f);
 
+    // Nvidia uses these defaults for varyings (e.g. position and generic attributes)
+    const Id v_varying_default =
+        ConstantComposite(t_float4, v_float_zero, v_float_zero, v_float_zero, v_float_one);
+
     const Id v_true = ConstantTrue(t_bool);
     const Id v_false = ConstantFalse(t_bool);
 

From 69c7a01f88a1839a3d950cab968accfa5100ea18 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 12 Mar 2020 03:27:29 -0300
Subject: [PATCH 14/14] vk/gl_shader_decompiler: Silence assertion on compute

---
 src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 9 ++++++---
 src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 9 ++++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 973d3fd11..3adf7f0cb 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -408,8 +408,11 @@ public:
     explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                             ShaderType stage, std::string_view identifier, std::string_view suffix)
         : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()},
-          transform_feedback{BuildTransformFeedback(registry.GetGraphicsInfo())} {}
+          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        if (stage != ShaderType::Compute) {
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+    }
 
     void Decompile() {
         DeclareHeader();
@@ -2553,7 +2556,7 @@ private:
     const std::string_view identifier;
     const std::string_view suffix;
     const Header header;
-    const std::unordered_map<u8, VaryingTFB> transform_feedback;
+    std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     ShaderWriter code;
 
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 7d51bf9af..b2c298051 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -277,8 +277,11 @@ public:
     explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage,
                              const Registry& registry, const Specialization& specialization)
         : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()},
-          registry{registry}, specialization{specialization},
-          transform_feedback{BuildTransformFeedback(registry.GetGraphicsInfo())} {
+          registry{registry}, specialization{specialization} {
+        if (stage != ShaderType::Compute) {
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+
         AddCapability(spv::Capability::Shader);
         AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess);
         AddCapability(spv::Capability::ImageQuery);
@@ -2581,7 +2584,7 @@ private:
     const Tegra::Shader::Header header;
     const Registry& registry;
     const Specialization& specialization;
-    const std::unordered_map<u8, VaryingTFB> transform_feedback;
+    std::unordered_map<u8, VaryingTFB> transform_feedback;
 
     const Id t_void = Name(TypeVoid(), "void");