From 7ff5851608031baca2adceb9f72e7c75eda9b3a9 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Mon, 10 May 2021 22:35:16 -0400
Subject: [PATCH] glasm: Implement Storage atomics

StorageAtomicExchangeU64 is failing its test, seemingly because the 64-bit
result is not being stored into the register correctly.
---
 .../backend/glasm/emit_glasm.cpp              |  7 ++
 .../backend/glasm/emit_glasm_atomic.cpp       | 60 ++++++++++++
 .../backend/glasm/emit_glasm_instructions.h   | 38 +++++---
 .../glasm/emit_glasm_not_implemented.cpp      | 96 -------------------
 .../glasm/emit_glasm_shared_memory.cpp        | 64 +++++++++++++
 5 files changed, 156 insertions(+), 109 deletions(-)

diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
index 047b2f89c..056d8cbf8 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
@@ -193,6 +193,9 @@ void SetupOptions(std::string& header, Info info) {
     if (info.uses_subgroup_shuffles) {
         header += "OPTION NV_shader_thread_shuffle;";
     }
+    // TODO: Track the shared atomic ops
+    header +=
+        "OPTION NV_shader_storage_buffer;OPTION NV_gpu_program_fp64;OPTION NV_bindless_texture;";
 }
 } // Anonymous namespace
 
@@ -214,6 +217,10 @@ std::string EmitGLASM(const Profile&, IR::Program& program, Bindings&) {
     default:
         break;
     }
+    if (program.shared_memory_size > 0) {
+        header += fmt::format("SHARED_MEMORY {};", program.shared_memory_size);
+        header += fmt::format("SHARED shared_mem[]={{program.sharedmem}};");
+    }
     header += "TEMP ";
     for (size_t index = 0; index < ctx.reg_alloc.NumUsedRegisters(); ++index) {
         header += fmt::format("R{},", index);
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_atomic.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_atomic.cpp
index fe44c3d15..e72b252a3 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_atomic.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_atomic.cpp
@@ -35,6 +35,66 @@ void Atom(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU32
 }
 } // namespace
 
+void EmitSharedAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value) { // Emits ATOMS.ADD.U32 on shared_mem.
+    ctx.Add("ATOMS.ADD.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicSMin32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarS32 value) { // Emits ATOMS.MIN.S32 on shared_mem.
+    ctx.Add("ATOMS.MIN.S32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicUMin32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value) { // Emits ATOMS.MIN.U32 on shared_mem.
+    ctx.Add("ATOMS.MIN.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicSMax32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarS32 value) { // Emits ATOMS.MAX.S32 on shared_mem.
+    ctx.Add("ATOMS.MAX.S32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicUMax32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value) { // Emits ATOMS.MAX.U32 on shared_mem.
+    ctx.Add("ATOMS.MAX.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicInc32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value) { // Emits ATOMS.IWRAP.U32 on shared_mem.
+    ctx.Add("ATOMS.IWRAP.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicDec32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value) { // Emits ATOMS.DWRAP.U32 on shared_mem.
+    ctx.Add("ATOMS.DWRAP.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicAnd32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value) { // Emits ATOMS.AND.U32 on shared_mem.
+    ctx.Add("ATOMS.AND.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicOr32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                          ScalarU32 value) { // Emits ATOMS.OR.U32 on shared_mem.
+    ctx.Add("ATOMS.OR.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicXor32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value) { // Emits ATOMS.XOR.U32 on shared_mem.
+    ctx.Add("ATOMS.XOR.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicExchange32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                                ScalarU32 value) { // Emits ATOMS.EXCH.U32 on shared_mem.
+    ctx.Add("ATOMS.EXCH.U32 {},{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
+void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                                Register value) { // Emits ATOMS.EXCH.U64 via LongAdd, dest .x
+    ctx.LongAdd("ATOMS.EXCH.U64 {}.x,{},shared_mem[{}];", inst, value, pointer_offset);
+}
+
 void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                              ScalarU32 offset, ScalarU32 value) {
     Atom(ctx, inst, binding, offset, value, "ADD", "U32");
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
index 75613571f..8202354fe 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_instructions.h
@@ -129,7 +129,7 @@ void EmitLoadSharedS8(EmitContext& ctx, ScalarU32 offset);
 void EmitLoadSharedU16(EmitContext& ctx, ScalarU32 offset);
 void EmitLoadSharedS16(EmitContext& ctx, ScalarU32 offset);
 void EmitLoadSharedU32(EmitContext& ctx, ScalarU32 offset);
-void EmitLoadSharedU64(EmitContext& ctx, ScalarU32 offset);
+void EmitLoadSharedU64(EmitContext& ctx, IR::Inst& inst, ScalarU32 offset);
 void EmitLoadSharedU128(EmitContext& ctx, ScalarU32 offset);
 void EmitWriteSharedU8(EmitContext& ctx, ScalarU32 offset, ScalarU32 value);
 void EmitWriteSharedU16(EmitContext& ctx, ScalarU32 offset, ScalarU32 value);
@@ -345,18 +345,30 @@ void EmitUGreaterThan(EmitContext& ctx, IR::Inst& inst, ScalarU32 lhs, ScalarU32
 void EmitINotEqual(EmitContext& ctx, IR::Inst& inst, ScalarS32 lhs, ScalarS32 rhs);
 void EmitSGreaterThanEqual(EmitContext& ctx, IR::Inst& inst, ScalarS32 lhs, ScalarS32 rhs);
 void EmitUGreaterThanEqual(EmitContext& ctx, IR::Inst& inst, ScalarU32 lhs, ScalarU32 rhs);
-void EmitSharedAtomicIAdd32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicSMin32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarS32 value);
-void EmitSharedAtomicUMin32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicSMax32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarS32 value);
-void EmitSharedAtomicUMax32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicInc32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicDec32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicAnd32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicOr32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicXor32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicExchange32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value);
-void EmitSharedAtomicExchange64(EmitContext& ctx, ScalarU32 pointer_offset, Register value);
+void EmitSharedAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value);
+void EmitSharedAtomicSMin32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarS32 value);
+void EmitSharedAtomicUMin32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value);
+void EmitSharedAtomicSMax32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarS32 value);
+void EmitSharedAtomicUMax32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                            ScalarU32 value);
+void EmitSharedAtomicInc32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value);
+void EmitSharedAtomicDec32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value);
+void EmitSharedAtomicAnd32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value);
+void EmitSharedAtomicOr32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                          ScalarU32 value);
+void EmitSharedAtomicXor32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                           ScalarU32 value);
+void EmitSharedAtomicExchange32(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                                ScalarU32 value);
+void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
+                                Register value);
 void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                              ScalarU32 offset, ScalarU32 value);
 void EmitStorageAtomicSMin32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
index 3c0a74e3c..b40d09f8c 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_not_implemented.cpp
@@ -199,54 +199,6 @@ void EmitUndefU64(EmitContext& ctx) {
     NotImplemented();
 }
 
-void EmitLoadSharedU8(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedS8(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedU16(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedS16(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedU32(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedU64(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitLoadSharedU128(EmitContext& ctx, ScalarU32 offset) {
-    NotImplemented();
-}
-
-void EmitWriteSharedU8(EmitContext& ctx, ScalarU32 offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitWriteSharedU16(EmitContext& ctx, ScalarU32 offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitWriteSharedU32(EmitContext& ctx, ScalarU32 offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitWriteSharedU64(EmitContext& ctx, ScalarU32 offset, Register value) {
-    NotImplemented();
-}
-
-void EmitWriteSharedU128(EmitContext& ctx, ScalarU32 offset, Register value) {
-    NotImplemented();
-}
-
 void EmitGetZeroFromOp(EmitContext& ctx) {
     NotImplemented();
 }
@@ -271,54 +223,6 @@ void EmitGetInBoundsFromOp(EmitContext& ctx) {
     NotImplemented();
 }
 
-void EmitSharedAtomicIAdd32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicSMin32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarS32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicUMin32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicSMax32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarS32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicUMax32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicInc32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicDec32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicAnd32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicOr32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicXor32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicExchange32(EmitContext& ctx, ScalarU32 pointer_offset, ScalarU32 value) {
-    NotImplemented();
-}
-
-void EmitSharedAtomicExchange64(EmitContext& ctx, ScalarU32 pointer_offset, Register value) {
-    NotImplemented();
-}
-
 void EmitLogicalOr(EmitContext& ctx, IR::Inst& inst, ScalarS32 a, ScalarS32 b) {
     ctx.Add("OR.S {},{},{};", inst, a, b);
 }
diff --git a/src/shader_recompiler/backend/glasm/emit_glasm_shared_memory.cpp b/src/shader_recompiler/backend/glasm/emit_glasm_shared_memory.cpp
index e69de29bb..32cc5d92c 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm_shared_memory.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm_shared_memory.cpp
@@ -0,0 +1,64 @@
+
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "shader_recompiler/backend/glasm/emit_context.h"
+#include "shader_recompiler/backend/glasm/emit_glasm_instructions.h"
+#include "shader_recompiler/frontend/ir/value.h"
+
+namespace Shader::Backend::GLASM {
+void EmitLoadSharedU8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitLoadSharedS8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitLoadSharedU16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitLoadSharedS16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitLoadSharedU32([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+// Emits LDS.U64 from shared_mem[offset] via LongAdd; inst is the first format argument.
+void EmitLoadSharedU64(EmitContext& ctx, IR::Inst& inst, ScalarU32 offset) {
+    ctx.LongAdd("LDS.U64 {},shared_mem[{}];", inst, offset);
+}
+
+void EmitLoadSharedU128([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitWriteSharedU8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset,
+                       [[maybe_unused]] ScalarU32 value) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+void EmitWriteSharedU16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset,
+                        [[maybe_unused]] ScalarU32 value) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+
+// Emits STS.U32, passing value then offset to format the store into shared_mem[offset].
+void EmitWriteSharedU32(EmitContext& ctx, ScalarU32 offset, ScalarU32 value) {
+    ctx.Add("STS.U32 {},shared_mem[{}];", value, offset);
+}
+
+// Emits STS.U64, passing value then offset to format the store into shared_mem[offset].
+void EmitWriteSharedU64(EmitContext& ctx, ScalarU32 offset, Register value) {
+    ctx.Add("STS.U64 {},shared_mem[{}];", value, offset);
+}
+
+void EmitWriteSharedU128([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] ScalarU32 offset,
+                         [[maybe_unused]] Register value) {
+    throw NotImplementedException("GLASM instruction"); // Unimplemented: always throws.
+}
+} // namespace Shader::Backend::GLASM