From 93cf2b3ca8edeb1e8f1e00182f920b8d50664ed5 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Tue, 7 Feb 2023 21:33:57 -0500
Subject: [PATCH] texture_cache: OpenGL: Implement MSAA uploads and copies

---
 src/video_core/host_shaders/CMakeLists.txt    |  2 ++
 .../convert_msaa_to_non_msaa.comp             | 30 +++++++++++++++++
 .../convert_non_msaa_to_msaa.comp             | 29 ++++++++++++++++
 .../renderer_opengl/gl_texture_cache.cpp      |  8 +++++
 .../renderer_opengl/gl_texture_cache.h        |  9 ++++-
 .../renderer_opengl/util_shaders.cpp          | 33 ++++++++++++++++++-
 src/video_core/renderer_opengl/util_shaders.h |  5 +++
 .../renderer_vulkan/vk_texture_cache.cpp      |  5 +++
 .../renderer_vulkan/vk_texture_cache.h        |  7 ++++
 src/video_core/texture_cache/formatter.cpp    |  3 ++
 src/video_core/texture_cache/texture_cache.h  | 14 ++++----
 src/video_core/texture_cache/util.cpp         |  5 ---
 12 files changed, 136 insertions(+), 14 deletions(-)
 create mode 100644 src/video_core/host_shaders/convert_msaa_to_non_msaa.comp
 create mode 100644 src/video_core/host_shaders/convert_non_msaa_to_msaa.comp

diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 52cd5bb811..2442c3c294 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -22,6 +22,8 @@ set(SHADER_FILES
     convert_d24s8_to_abgr8.frag
     convert_depth_to_float.frag
     convert_float_to_depth.frag
+    convert_msaa_to_non_msaa.comp
+    convert_non_msaa_to_msaa.comp
     convert_s8d24_to_abgr8.frag
     full_screen_triangle.vert
     fxaa.frag
diff --git a/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp
new file mode 100644
index 0000000000..fc3854d18a
--- /dev/null
+++ b/src/video_core/host_shaders/convert_msaa_to_non_msaa.comp
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 450 core
+layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+layout (binding = 0, rgba8) uniform readonly restrict image2DMSArray msaa_in;
+layout (binding = 1, rgba8) uniform writeonly restrict image2DArray output_img;
+
+// Writes each sample of an MSAA texel into the 2x2 single-sample block at twice its position.
+void main() {
+    const ivec3 texel = ivec3(gl_GlobalInvocationID);
+    if (any(greaterThanEqual(texel, imageSize(msaa_in)))) {
+        return;
+    }
+    // The destination bounds are the same for every sample, so query them once.
+    const ivec3 dst_size = imageSize(output_img);
+
+    // TODO: Specialization constants for num_samples?
+    const int sample_count = imageSamples(msaa_in);
+    for (int index = 0; index < sample_count; ++index) {
+        // Bit 0 of the sample index selects the column of the block, bit 1 selects the row.
+        const ivec2 cell = ivec2(index & 1, (index / 2) & 1);
+        const ivec3 dst = ivec3(2 * texel.xy + cell, texel.z);
+        // Samples that would land outside the destination image are dropped.
+        if (all(lessThan(dst, dst_size))) {
+            imageStore(output_img, dst, imageLoad(msaa_in, texel, index));
+        }
+    }
+}
diff --git a/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp
new file mode 100644
index 0000000000..dedd962f1c
--- /dev/null
+++ b/src/video_core/host_shaders/convert_non_msaa_to_msaa.comp
@@ -0,0 +1,29 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#version 450 core
+layout (local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
+
+layout (binding = 0, rgba8) uniform readonly restrict image2DArray img_in;
+layout (binding = 1, rgba8) uniform writeonly restrict image2DMSArray output_msaa;
+
+// Fills each sample of an MSAA texel from the 2x2 single-sample block at twice its position.
+void main() {
+    const ivec3 texel = ivec3(gl_GlobalInvocationID);
+    if (any(greaterThanEqual(texel, imageSize(output_msaa)))) {
+        return;
+    }
+    // The source bounds are the same for every sample, so query them once.
+    const ivec3 src_size = imageSize(img_in);
+    // TODO: Specialization constants for num_samples?
+    const int sample_count = imageSamples(output_msaa);
+    for (int index = 0; index < sample_count; ++index) {
+        // Bit 0 of the sample index selects the column of the block, bit 1 selects the row.
+        const ivec2 cell = ivec2(index & 1, (index / 2) & 1);
+        const ivec3 src = ivec3(2 * texel.xy + cell, texel.z);
+        // Samples whose source texel lies outside the input image are left unwritten.
+        if (all(lessThan(src, src_size))) {
+            imageStore(output_msaa, texel, index, imageLoad(img_in, src));
+        }
+    }
+}
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 9f7ce7414b..eb6e43a080 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -557,6 +557,14 @@ void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image,
     }
 }
 
+void TextureCacheRuntime::CopyImageMSAA(Image& dst_image, Image& src_image,
+                                        std::span<const VideoCommon::ImageCopy> copies) {
+    LOG_DEBUG(Render_OpenGL, "Copying from {} samples to {} samples", src_image.info.num_samples,
+              dst_image.info.num_samples);
+    // Handled by UtilShaders. TODO: Leverage the format conversion pass if possible/accurate.
+    util_shaders.CopyMSAA(dst_image, src_image, copies);
+}
+
 void TextureCacheRuntime::ReinterpretImage(Image& dst, Image& src,
                                            std::span<const VideoCommon::ImageCopy> copies) {
     LOG_DEBUG(Render_OpenGL, "Converting {} to {}", src.info.format, dst.info.format);
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 5d9d370f28..e308754963 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -93,12 +93,19 @@ public:
         return device.CanReportMemoryUsage();
     }
 
-    bool ShouldReinterpret([[maybe_unused]] Image& dst, [[maybe_unused]] Image& src) {
+    bool ShouldReinterpret([[maybe_unused]] Image& dst,
+                           [[maybe_unused]] Image& src) const noexcept {
+        return true;
+    }
+
+    bool CanUploadMSAA() const noexcept {
         return true;
     }
 
     void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
 
+    void CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
+
     void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
 
     void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) {
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 404def62e3..2c7ac210bc 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -12,6 +12,8 @@
 #include "video_core/host_shaders/astc_decoder_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
+#include "video_core/host_shaders/convert_msaa_to_non_msaa_comp.h"
+#include "video_core/host_shaders/convert_non_msaa_to_msaa_comp.h"
 #include "video_core/host_shaders/opengl_convert_s8d24_comp.h"
 #include "video_core/host_shaders/opengl_copy_bc4_comp.h"
 #include "video_core/host_shaders/pitch_unswizzle_comp.h"
@@ -51,7 +53,9 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
       copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)),
-      convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) {
+      convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)),
+      convert_ms_to_nonms_program(MakeProgram(CONVERT_MSAA_TO_NON_MSAA_COMP)),
+      convert_nonms_to_ms_program(MakeProgram(CONVERT_NON_MSAA_TO_MSAA_COMP)) {
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
@@ -269,6 +273,33 @@ void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copi
     program_manager.RestoreGuestCompute();
 }
 
+// Runs the MSAA <-> single-sample conversion compute pass once for every entry in copies.
+void UtilShaders::CopyMSAA(Image& dst_image, Image& src_image,
+                           std::span<const VideoCommon::ImageCopy> copies) {
+    // NOTE(review): anything but MSAA -> single-sample takes the other shader, even MSAA -> MSAA.
+    const bool resolve = src_image.info.num_samples > 1 && dst_image.info.num_samples == 1;
+    const auto& shader = resolve ? convert_ms_to_nonms_program : convert_nonms_to_ms_program;
+    program_manager.BindComputeProgram(shader.handle);
+
+    constexpr u32 group_size = 8; // local_size_x and local_size_y of both conversion shaders
+    for (const ImageCopy& copy : copies) {
+        // Every copy has to address exactly layer 0 of both images.
+        ASSERT(copy.src_subresource.base_layer == 0);
+        ASSERT(copy.src_subresource.num_layers == 1);
+        ASSERT(copy.dst_subresource.base_layer == 0);
+        ASSERT(copy.dst_subresource.num_layers == 1);
+        const auto src_level = copy.src_subresource.base_level;
+        const auto dst_level = copy.dst_subresource.base_level;
+        const auto src_handle = src_image.StorageHandle();
+        glBindImageTexture(0, src_handle, src_level, GL_TRUE, 0, GL_READ_ONLY, GL_RGBA8);
+        const auto dst_handle = dst_image.StorageHandle();
+        glBindImageTexture(1, dst_handle, dst_level, GL_TRUE, 0, GL_WRITE_ONLY, GL_RGBA8);
+        glDispatchCompute(Common::DivCeil(copy.extent.width, group_size),
+                          Common::DivCeil(copy.extent.height, group_size), copy.extent.depth);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
 GLenum StoreFormat(u32 bytes_per_block) {
     switch (bytes_per_block) {
     case 1:
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 44efb6ecff..9013808e71 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -40,6 +40,9 @@ public:
 
     void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies);
 
+    void CopyMSAA(Image& dst_image, Image& src_image,
+                  std::span<const VideoCommon::ImageCopy> copies);
+
 private:
     ProgramManager& program_manager;
 
@@ -51,6 +54,8 @@ private:
     OGLProgram pitch_unswizzle_program;
     OGLProgram copy_bc4_program;
     OGLProgram convert_s8d24_program;
+    OGLProgram convert_ms_to_nonms_program;
+    OGLProgram convert_nonms_to_ms_program;
 };
 
 GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
index d39372ec4d..9b85dfb5ee 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -1230,6 +1230,11 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
     });
 }
 
+void TextureCacheRuntime::CopyImageMSAA(Image&, Image&, std::span<const VideoCommon::ImageCopy>) {
+    // Stub: the arguments are ignored and the call is only reported as unimplemented.
+    UNIMPLEMENTED_MSG("Copying images with different samples is not implemented in Vulkan.");
+}
+
 u64 TextureCacheRuntime::GetDeviceLocalMemory() const {
     return device.GetDeviceLocalMemory();
 }
diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h
index 1f27a35896..b9ee83de78 100644
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -70,6 +70,8 @@ public:
 
     void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
 
+    void CopyImageMSAA(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
+
     bool ShouldReinterpret(Image& dst, Image& src);
 
     void ReinterpretImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
@@ -80,6 +82,11 @@ public:
         return false;
     }
 
+    bool CanUploadMSAA() const noexcept {
+        // TODO: Implement buffer to MSAA uploads; the texture cache skips them while this is false
+        return false;
+    }
+
     void AccelerateImageUpload(Image&, const StagingBufferRef&,
                                std::span<const VideoCommon::SwizzleParameters>);
 
diff --git a/src/video_core/texture_cache/formatter.cpp b/src/video_core/texture_cache/formatter.cpp
index 4188901268..30f72361d1 100644
--- a/src/video_core/texture_cache/formatter.cpp
+++ b/src/video_core/texture_cache/formatter.cpp
@@ -22,6 +22,9 @@ std::string Name(const ImageBase& image) {
     const u32 num_layers = image.info.resources.layers;
     const u32 num_levels = image.info.resources.levels;
     std::string resource;
+    if (image.info.num_samples > 1) {
+        resource += fmt::format(":{}xMSAA", image.info.num_samples);
+    }
     if (num_layers > 1) {
         resource += fmt::format(":L{}", num_layers);
     }
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 1b01990a44..3e2cbb0b0c 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -773,7 +773,7 @@ void TextureCache<P>::RefreshContents(Image& image, ImageId image_id) {
     image.flags &= ~ImageFlagBits::CpuModified;
     TrackImage(image, image_id);
 
-    if (image.info.num_samples > 1) {
+    if (image.info.num_samples > 1 && !runtime.CanUploadMSAA()) {
         LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
         return;
     }
@@ -1167,14 +1167,14 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
         if (True(overlap.flags & ImageFlagBits::GpuModified)) {
             new_image.flags |= ImageFlagBits::GpuModified;
         }
+        const auto& resolution = Settings::values.resolution_info;
+        const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
+        const u32 up_scale = can_rescale ? resolution.up_scale : 1;
+        const u32 down_shift = can_rescale ? resolution.down_shift : 0;
+        auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
         if (overlap.info.num_samples != new_image.info.num_samples) {
-            LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented");
+            runtime.CopyImageMSAA(new_image, overlap, std::move(copies));
         } else {
-            const auto& resolution = Settings::values.resolution_info;
-            const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value();
-            const u32 up_scale = can_rescale ? resolution.up_scale : 1;
-            const u32 down_shift = can_rescale ? resolution.down_shift : 0;
-            auto copies = MakeShrinkImageCopies(new_info, overlap.info, base, up_scale, down_shift);
             runtime.CopyImage(new_image, overlap, std::move(copies));
         }
         if (True(overlap.flags & ImageFlagBits::Tracked)) {
diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp
index 03acc68d9d..697f866411 100644
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@@ -573,10 +573,6 @@ u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept {
     if (info.type == ImageType::Buffer) {
         return info.size.width * BytesPerBlock(info.format);
     }
-    if (info.num_samples > 1) {
-        // Multisample images can't be uploaded or downloaded to the host
-        return 0;
-    }
     if (info.type == ImageType::Linear) {
         return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format));
     }
@@ -703,7 +699,6 @@ ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept {
 std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src,
                                              SubresourceBase base, u32 up_scale, u32 down_shift) {
     ASSERT(dst.resources.levels >= src.resources.levels);
-    ASSERT(dst.num_samples == src.num_samples);
 
     const bool is_dst_3d = dst.type == ImageType::e3D;
     if (is_dst_3d) {