From a5a94f52ffcbf3119d272a9369021a213ea6dad2 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 9 Feb 2022 15:00:05 +0100
Subject: [PATCH] MacroHLE: Add MultidrawIndirect HLE Macro.

---
 src/video_core/buffer_cache/buffer_cache.h    | 22 +++++++
 src/video_core/dma_pusher.cpp                 | 14 +++--
 src/video_core/dma_pusher.h                   |  1 +
 src/video_core/engines/draw_manager.cpp       | 21 +++++++
 src/video_core/engines/draw_manager.h         | 20 +++++++
 src/video_core/engines/engine_interface.h     |  2 +
 src/video_core/macro/macro_hle.cpp            | 53 ++++++++---------
 src/video_core/rasterizer_interface.h         |  3 +
 .../renderer_vulkan/vk_rasterizer.cpp         | 57 ++++++++++++++-----
 .../renderer_vulkan/vk_rasterizer.h           |  4 ++
 .../vulkan_common/vulkan_device.cpp           |  2 +-
 .../vulkan_common/vulkan_wrapper.cpp          |  2 +
 src/video_core/vulkan_common/vulkan_wrapper.h | 15 +++++
 13 files changed, 169 insertions(+), 47 deletions(-)

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index f1c60d1f38..99abe0edf3 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -170,6 +170,9 @@ public:
     void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format,
                                   bool is_written, bool is_image);
 
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
+                                                       bool synchronize, bool mark_as_written);
+
     void FlushCachedWrites();
 
     /// Return true when there are uncommitted buffers to be downloaded
@@ -790,6 +793,25 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add
     compute_texture_buffers[tbo_index] = GetTextureBufferBinding(gpu_addr, size, format);
 }
 
+template <class P>
+std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_addr, u32 size,
+                                                                 bool synchronize,
+                                                                 bool mark_as_written) {
+    const std::optional<VAddr> cpu_address = gpu_memory->GpuToCpuAddress(gpu_addr);
+    if (!cpu_address.has_value()) {
+        return {&slot_buffers[NULL_BUFFER_ID], 0}; // no CPU address: use the null buffer
+    }
+    const BufferId id = FindBuffer(*cpu_address, size);
+    Buffer& found = slot_buffers[id];
+    if (synchronize) {
+        SynchronizeBuffer(found, *cpu_address, size);
+    }
+    if (mark_as_written) {
+        MarkWrittenBuffer(id, *cpu_address, size);
+    }
+    return {&found, found.Offset(*cpu_address)};
+}
+
 template <class P>
 void BufferCache<P>::FlushCachedWrites() {
     for (const BufferId buffer_id : cached_write_buffer_ids) {
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 322de2606b..eb13716123 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -61,7 +61,7 @@ bool DmaPusher::Step() {
     } else {
         const CommandListHeader command_list_header{
             command_list.command_lists[dma_pushbuffer_subindex++]};
-        const GPUVAddr dma_get = command_list_header.addr;
+        dma_state.dma_get = command_list_header.addr;
 
         if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
             // We've gone through the current list, remove it from the queue
@@ -75,11 +75,11 @@ bool DmaPusher::Step() {
 
         // Push buffer non-empty, read a word
         command_headers.resize_destructive(command_list_header.size);
-        if (Settings::IsGPULevelHigh()) {
-            memory_manager.ReadBlock(dma_get, command_headers.data(),
+        if (Settings::IsGPULevelExtreme()) {
+            memory_manager.ReadBlock(dma_state.dma_get, command_headers.data(),
                                      command_list_header.size * sizeof(u32));
         } else {
-            memory_manager.ReadBlockUnsafe(dma_get, command_headers.data(),
+            memory_manager.ReadBlockUnsafe(dma_state.dma_get, command_headers.data(),
                                            command_list_header.size * sizeof(u32));
         }
         ProcessCommands(command_headers);
@@ -174,8 +174,10 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
         puller.CallMultiMethod(dma_state.method, dma_state.subchannel, base_start, num_methods,
                                dma_state.method_count);
     } else {
-        subchannels[dma_state.subchannel]->CallMultiMethod(dma_state.method, base_start,
-                                                           num_methods, dma_state.method_count);
+        auto subchannel = subchannels[dma_state.subchannel];
+        subchannel->current_dma_segment = dma_state.dma_get;
+        subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
+                                    dma_state.method_count);
     }
 }
 
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index 6f00de937e..ca0899ba71 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -156,6 +156,7 @@ private:
         u32 subchannel;        ///< Current subchannel
         u32 method_count;      ///< Current method count
         u32 length_pending;    ///< Large NI command length pending
+        GPUVAddr dma_get;      ///< Currently read segment
         bool non_incrementing; ///< Current command's NI flag
         bool is_last_call;
     };
diff --git a/src/video_core/engines/draw_manager.cpp b/src/video_core/engines/draw_manager.cpp
index 3a78421f6b..4fa77b6843 100644
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -91,6 +91,16 @@ void DrawManager::DrawIndex(PrimitiveTopology topology, u32 index_first, u32 ind
     ProcessDraw(true, num_instances);
 }
 
+void DrawManager::DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first,
+                                      u32 index_count) {
+    // Seed the index buffer state from the current registers, then override the drawn range.
+    draw_state.topology = topology;
+    draw_state.index_buffer = maxwell3d->regs.index_buffer;
+    draw_state.index_buffer.first = index_first;
+    draw_state.index_buffer.count = index_count;
+    ProcessDrawIndirect(true);
+}
+
 void DrawManager::SetInlineIndexBuffer(u32 index) {
     draw_state.inline_index_draw_indexes.push_back(static_cast<u8>(index & 0x000000ff));
     draw_state.inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x0000ff00) >> 8));
@@ -198,4 +208,15 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) {
         maxwell3d->rasterizer->Draw(draw_indexed, instance_count);
     }
 }
+
+void DrawManager::ProcessDrawIndirect(bool draw_indexed) {
+    LOG_TRACE(HW_GPU, "called, topology={}, count={}", draw_state.topology,
+              draw_indexed ? draw_state.index_buffer.count : draw_state.vertex_buffer.count);
+    UpdateTopology();
+    // The rasterizer is only reached when the engine reports the draw should execute.
+    if (!maxwell3d->ShouldExecute()) {
+        return;
+    }
+    maxwell3d->rasterizer->DrawIndirect(draw_indexed);
+}
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/draw_manager.h b/src/video_core/engines/draw_manager.h
index 0e6930a9cc..0cdb37f83d 100644
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -32,6 +32,13 @@ public:
         std::vector<u8> inline_index_draw_indexes;
     };
 
+    struct IndirectParams {
+        GPUVAddr start_address; ///< GPU address where the indirect buffer begins
+        size_t buffer_size;     ///< Size of the indirect buffer in bytes
+        size_t max_draw_counts; ///< Upper bound on the number of draws to issue
+        size_t stride;          ///< Byte distance between consecutive draw entries
+    };
+
     explicit DrawManager(Maxwell3D* maxwell_3d);
 
     void ProcessMethodCall(u32 method, u32 argument);
@@ -46,10 +53,20 @@ public:
     void DrawIndex(PrimitiveTopology topology, u32 index_first, u32 index_count, u32 base_index,
                    u32 base_instance, u32 num_instances);
 
+    void DrawIndexedIndirect(PrimitiveTopology topology, u32 index_first, u32 index_count);
+
     const State& GetDrawState() const {
         return draw_state;
     }
 
+    IndirectParams& GetIndirectParams() {
+        return indirect_state;
+    }
+
+    const IndirectParams& GetIndirectParams() const {
+        return indirect_state;
+    }
+
 private:
     void SetInlineIndexBuffer(u32 index);
 
@@ -63,7 +80,10 @@ private:
 
     void ProcessDraw(bool draw_indexed, u32 instance_count);
 
+    void ProcessDrawIndirect(bool draw_indexed);
+
     Maxwell3D* maxwell3d{};
     State draw_state{};
+    IndirectParams indirect_state{};
 };
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h
index 26cde85841..76630272d3 100644
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -17,6 +17,8 @@ public:
     /// Write multiple values to the register identified by method.
     virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                  u32 methods_pending) = 0;
+
+    GPUVAddr current_dma_segment{}; ///< DMA segment address, set by the DMA pusher
 };
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 8549db2e4e..1cc202cc7f 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -53,42 +53,43 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
 
 // Multidraw Indirect
 void HLE_3F5E74B9C9A50164(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
-    SCOPE_EXIT({
-        // Clean everything.
-        maxwell3d.regs.vertex_id_base = 0x0;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, 0x0, true);
-        maxwell3d.CallMethod(0x8e5, 0x0, true);
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-    });
     const u32 start_indirect = parameters[0];
     const u32 end_indirect = parameters[1];
     if (start_indirect >= end_indirect) {
         // Nothing to do.
         return;
     }
-    const u32 padding = parameters[3];
-    const std::size_t max_draws = parameters[4];
+    const auto topology =
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+    const u32 padding = parameters[3]; // padding is in words
 
+    // Size of each indirect segment, in words.
     const u32 indirect_words = 5 + padding;
-    const std::size_t first_draw = start_indirect;
-    const std::size_t effective_draws = end_indirect - start_indirect;
-    const std::size_t last_draw = start_indirect + std::min(effective_draws, max_draws);
-
-    for (std::size_t index = first_draw; index < last_draw; index++) {
+    const u32 stride = indirect_words * sizeof(u32);
+    // The indirect data is read straight from the DMA segment, 4 words past its start.
+    const GPUVAddr start_address = maxwell3d.current_dma_segment + 4 * sizeof(u32);
+    const std::size_t draw_count = end_indirect - start_indirect;
+    // End (exclusive) of the widest index range used by any segment; becomes the index count.
+    u32 highest_limit = 0;
+    for (std::size_t index = 0; index < draw_count; index++) {
         const std::size_t base = index * indirect_words + 5;
-        const u32 base_vertex = parameters[base + 3];
-        const u32 base_instance = parameters[base + 4];
-        maxwell3d.regs.vertex_id_base = base_vertex;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, base_vertex, true);
-        maxwell3d.CallMethod(0x8e5, base_instance, true);
-        maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.draw_manager->DrawIndex(
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[2]),
-            parameters[base + 2], parameters[base], base_vertex, base_instance,
-            parameters[base + 1]);
+        const u32 count = parameters[base];
+        const u32 first_index = parameters[base + 2];
+        highest_limit = std::max(highest_limit, first_index + count);
     }
+
+    const u32 base_vertex = parameters[8];
+    const u32 base_instance = parameters[9];
+    maxwell3d.CallMethod(0x8e3, 0x640, true);
+    maxwell3d.CallMethod(0x8e4, base_vertex, true);
+    maxwell3d.CallMethod(0x8e5, base_instance, true);
+    auto& params = maxwell3d.draw_manager->GetIndirectParams();
+    params.start_address = start_address;
+    params.buffer_size = sizeof(u32) + stride * draw_count; // count word + draw segments
+    params.max_draw_counts = draw_count;
+    params.stride = stride;
+    maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
+    maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, highest_limit);
 }
 
 // Multi-layer Clear
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index b6907463c5..a2a651f341 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -42,6 +42,9 @@ public:
     /// Dispatches a draw invocation
     virtual void Draw(bool is_indexed, u32 instance_count) = 0;
 
+    /// Dispatches an indirect draw invocation (no-op unless a backend overrides it)
+    virtual void DrawIndirect(bool is_indexed) {}
+
     /// Clear the current framebuffer
     virtual void Clear(u32 layer_count) = 0;
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index ac1eb9895b..9b75f33dd9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -180,7 +180,8 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
 
 RasterizerVulkan::~RasterizerVulkan() = default;
 
-void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
+template <typename Func>
+void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
     MICROPROFILE_SCOPE(Vulkan_Drawing);
 
     SCOPE_EXIT({ gpu.TickWork(); });
@@ -201,22 +202,50 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
 
     UpdateDynamicStates();
 
-    const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
-    const u32 num_instances{instance_count};
-    const DrawParams draw_params{MakeDrawParams(draw_state, num_instances, is_indexed)};
-    scheduler.Record([draw_params](vk::CommandBuffer cmdbuf) {
-        if (draw_params.is_indexed) {
-            cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances,
-                               draw_params.first_index, draw_params.base_vertex,
-                               draw_params.base_instance);
-        } else {
-            cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances,
-                        draw_params.base_vertex, draw_params.base_instance);
-        }
-    });
+    draw_func();
+
     EndTransformFeedback();
 }
 
+void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
+    // PrepareDraw invokes the callback once its own setup is done, so the callback only has
+    // to record the direct draw command.
+    PrepareDraw(is_indexed, [this, is_indexed, instance_count] {
+        const auto& state = maxwell3d->draw_manager->GetDrawState();
+        const DrawParams params{MakeDrawParams(state, instance_count, is_indexed)};
+        scheduler.Record([params](vk::CommandBuffer cmdbuf) {
+            if (!params.is_indexed) {
+                cmdbuf.Draw(params.num_vertices, params.num_instances, params.base_vertex,
+                            params.base_instance);
+                return;
+            }
+            cmdbuf.DrawIndexed(params.num_vertices, params.num_instances, params.first_index,
+                               params.base_vertex, params.base_instance);
+        });
+    });
+}
+
+void RasterizerVulkan::DrawIndirect(bool is_indexed) {
+    PrepareDraw(is_indexed, [this, is_indexed] {
+        // Reads the parameters the macro HLE stored in the draw manager.
+        const auto& indirect = maxwell3d->draw_manager->GetIndirectParams();
+        const auto [buffer, offset] = buffer_cache.ObtainBuffer(
+            indirect.start_address, static_cast<u32>(indirect.buffer_size), true, false);
+        // The draw count sits at `offset`; the draw commands follow one u32 later.
+        scheduler.Record([buf = buffer->Handle(), count_at = static_cast<VkDeviceSize>(offset),
+                          limit = static_cast<u32>(indirect.max_draw_counts),
+                          stride = static_cast<u32>(indirect.stride),
+                          is_indexed](vk::CommandBuffer cmdbuf) {
+            const VkDeviceSize cmds_at = count_at + sizeof(u32);
+            if (is_indexed) {
+                cmdbuf.DrawIndexedIndirectCount(buf, cmds_at, buf, count_at, limit, stride);
+            } else {
+                cmdbuf.DrawIndirectCount(buf, cmds_at, buf, count_at, limit, stride);
+            }
+        });
+    });
+}
+
 void RasterizerVulkan::Clear(u32 layer_count) {
     MICROPROFILE_SCOPE(Vulkan_Clearing);
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index ee483cfd9e..bc43a8a1f3 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -65,6 +65,7 @@ public:
     ~RasterizerVulkan() override;
 
     void Draw(bool is_indexed, u32 instance_count) override;
+    void DrawIndirect(bool is_indexed) override;
     void Clear(u32 layer_count) override;
     void DispatchCompute() override;
     void ResetCounter(VideoCore::QueryType type) override;
@@ -114,6 +115,9 @@ private:
 
     static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
 
+    template <typename Func>
+    void PrepareDraw(bool is_indexed, Func&&);
+
     void FlushWork();
 
     void UpdateDynamicStates();
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index c4d31681aa..477fc428bc 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -350,7 +350,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
             .sampleRateShading = true,
             .dualSrcBlend = true,
             .logicOp = true,
-            .multiDrawIndirect = false,
+            .multiDrawIndirect = true,
             .drawIndirectFirstInstance = false,
             .depthClamp = true,
             .depthBiasClamp = true,
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp
index 7dca7341cd..c58c4c1c49 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.cpp
+++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp
@@ -94,6 +94,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
     X(vkCmdDispatch);
     X(vkCmdDraw);
     X(vkCmdDrawIndexed);
+    X(vkCmdDrawIndirectCount);
+    X(vkCmdDrawIndexedIndirectCount);
     X(vkCmdEndQuery);
     X(vkCmdEndRenderPass);
     X(vkCmdEndTransformFeedbackEXT);
diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h
index 8bd4fd4d99..9bd158dce5 100644
--- a/src/video_core/vulkan_common/vulkan_wrapper.h
+++ b/src/video_core/vulkan_common/vulkan_wrapper.h
@@ -213,6 +213,8 @@ struct DeviceDispatch : InstanceDispatch {
     PFN_vkCmdDispatch vkCmdDispatch{};
     PFN_vkCmdDraw vkCmdDraw{};
     PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
+    PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{};
+    PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{};
     PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
     PFN_vkCmdEndQuery vkCmdEndQuery{};
     PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
@@ -1019,6 +1021,19 @@ public:
                               first_instance);
     }
 
+    void DrawIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset, VkBuffer count_buffer,
+                           VkDeviceSize count_offset, u32 draw_count, u32 stride) const noexcept {
+        dld->vkCmdDrawIndirectCount(handle, src_buffer, src_offset, count_buffer, count_offset,
+                                    draw_count, stride);
+    }
+
+    void DrawIndexedIndirectCount(VkBuffer src_buffer, VkDeviceSize src_offset,
+                                  VkBuffer count_buffer, VkDeviceSize count_offset, u32 draw_count,
+                                  u32 stride) const noexcept {
+        dld->vkCmdDrawIndexedIndirectCount(handle, src_buffer, src_offset, count_buffer,
+                                           count_offset, draw_count, stride);
+    }
+
     void ClearAttachments(Span<VkClearAttachment> attachments,
                           Span<VkClearRect> rects) const noexcept {
         dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(),