From de0ab806df4575df93068e128d911dabbf396d2c Mon Sep 17 00:00:00 2001
From: bunnei <bunneidev@gmail.com>
Date: Mon, 29 Oct 2018 23:36:03 -0400
Subject: [PATCH] maxwell_3d: Restructure macro upload to use a single macro
 code memory.

- Fixes an issue where macros could be skipped.
- Fixes rendering of distant objects in Super Mario Odyssey.
---
 src/video_core/engines/maxwell_3d.cpp | 26 ++++++++++++++++++--------
 src/video_core/engines/maxwell_3d.h   | 25 +++++++++++++++++++++----
 src/video_core/macro_interpreter.cpp  | 19 ++++++++++---------
 src/video_core/macro_interpreter.h    | 12 ++++++------
 4 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 7357d20d16..d79c509193 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -43,15 +43,17 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
     // Reset the current macro.
     executing_macro = 0;
 
-    // The requested macro must have been uploaded already.
-    auto macro_code = uploaded_macros.find(method);
-    if (macro_code == uploaded_macros.end()) {
-        LOG_ERROR(HW_GPU, "Macro {:04X} was not uploaded", method);
+    // Lookup the macro offset
+    const u32 entry{(method - MacroRegistersStart) >> 1};
+    const auto& search{macro_offsets.find(entry)};
+    if (search == macro_offsets.end()) {
+        LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
+        UNREACHABLE();
         return;
     }
 
     // Execute the current macro.
-    macro_interpreter.Execute(macro_code->second, std::move(parameters));
+    macro_interpreter.Execute(search->second, std::move(parameters));
 }
 
 void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
@@ -97,6 +99,10 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
         ProcessMacroUpload(value);
         break;
     }
+    case MAXWELL3D_REG_INDEX(macros.bind): {
+        ProcessMacroBind(value);
+        break;
+    }
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
     case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
@@ -158,9 +164,13 @@ void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
 }
 
 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    // Store the uploaded macro code to interpret them when they're called.
-    auto& macro = uploaded_macros[regs.macros.entry * 2 + MacroRegistersStart];
-    macro.push_back(data);
+    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(), // Bounds-check before writing.
+               "upload_address exceeded macro_memory size!");
+    macro_memory[regs.macros.upload_address++] = data; // Store the word, then advance the cursor.
+}
+
+void Maxwell3D::ProcessMacroBind(u32 data) {
+    macro_offsets[regs.macros.entry] = data; // Record `data` as the start offset of this entry.
 }
 
 void Maxwell3D::ProcessQueryGet() {
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 443affc36d..50873813e8 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -475,12 +475,13 @@ public:
                 INSERT_PADDING_WORDS(0x45);
 
                 struct {
-                    INSERT_PADDING_WORDS(1);
+                    u32 upload_address;
                     u32 data;
                     u32 entry;
+                    u32 bind;
                 } macros;
 
-                INSERT_PADDING_WORDS(0x189);
+                INSERT_PADDING_WORDS(0x188);
 
                 u32 tfb_enabled;
 
@@ -994,12 +995,25 @@ public:
     /// Returns the texture information for a specific texture in a specific shader stage.
     Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const;
 
+    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
+    /// we've seen used.
+    using MacroMemory = std::array<u32, 0x40000>;
+
+    /// Gets a reference to macro memory.
+    const MacroMemory& GetMacroMemory() const {
+        return macro_memory;
+    }
+
 private:
     void InitializeRegisterDefaults();
 
     VideoCore::RasterizerInterface& rasterizer;
 
-    std::unordered_map<u32, std::vector<u32>> uploaded_macros;
+    /// Start offsets of each macro in macro_memory
+    std::unordered_map<u32, u32> macro_offsets;
+
+    /// Memory for macro code
+    MacroMemory macro_memory;
 
     /// Macro method that is currently being executed / being fed parameters.
     u32 executing_macro = 0;
@@ -1022,9 +1036,12 @@ private:
      */
     void CallMacroMethod(u32 method, std::vector<u32> parameters);
 
-    /// Handles writes to the macro uploading registers.
+    /// Handles writes to the macro uploading register.
     void ProcessMacroUpload(u32 data);
 
+    /// Handles writes to the macro bind register.
+    void ProcessMacroBind(u32 data);
+
     /// Handles a write to the CLEAR_BUFFERS register.
     void ProcessClearBuffers();
 
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro_interpreter.cpp
index f6af132fbf..335a8d4079 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -11,7 +11,7 @@ namespace Tegra {
 
 MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 
-void MacroInterpreter::Execute(const std::vector<u32>& code, std::vector<u32> parameters) {
+void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) {
     Reset();
     registers[1] = parameters[0];
     this->parameters = std::move(parameters);
@@ -19,7 +19,7 @@ void MacroInterpreter::Execute(const std::vector<u32>& code, std::vector<u32> pa
     // Execute the code until we hit an exit condition.
     bool keep_executing = true;
     while (keep_executing) {
-        keep_executing = Step(code, false);
+        keep_executing = Step(offset, false);
     }
 
     // Assert the the macro used all the input parameters
@@ -37,10 +37,10 @@ void MacroInterpreter::Reset() {
     next_parameter_index = 1;
 }
 
-bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) {
+bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     u32 base_address = pc;
 
-    Opcode opcode = GetOpcode(code);
+    Opcode opcode = GetOpcode(offset);
     pc += 4;
 
     // Update the program counter if we were delayed
@@ -108,7 +108,7 @@ bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) {
 
             delayed_pc = base_address + opcode.GetBranchTarget();
             // Execute one more instruction due to the delay slot.
-            return Step(code, true);
+            return Step(offset, true);
         }
         break;
     }
@@ -121,17 +121,18 @@ bool MacroInterpreter::Step(const std::vector<u32>& code, bool is_delay_slot) {
         // Exit has a delay slot, execute the next instruction
         // Note: Executing an exit during a branch delay slot will cause the instruction at the
         // branch target to be executed before exiting.
-        Step(code, true);
+        Step(offset, true);
         return false;
     }
 
     return true;
 }
 
-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(const std::vector<u32>& code) const {
+MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
+    const auto& macro_memory{maxwell3d.GetMacroMemory()}; // Macro code store, indexed in u32 words.
     ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT(pc < code.size() * sizeof(u32));
-    return {code[pc / sizeof(u32)]};
+    ASSERT((offset + pc / sizeof(u32)) < macro_memory.size()); // offset is in words, pc in bytes.
+    return {macro_memory[offset + pc / sizeof(u32)]};
 }
 
 u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const {
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro_interpreter.h
index 773684bdeb..62d1ce289b 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro_interpreter.h
@@ -22,10 +22,10 @@ public:
 
     /**
      * Executes the macro code with the specified input parameters.
-     * @param code The macro byte code to execute
-     * @param parameters The parameters of the macro
+     * @param offset Offset to start execution at.
+     * @param parameters The parameters of the macro.
      */
-    void Execute(const std::vector<u32>& code, std::vector<u32> parameters);
+    void Execute(u32 offset, std::vector<u32> parameters);
 
 private:
     enum class Operation : u32 {
@@ -110,11 +110,11 @@ private:
     /**
      * Executes a single macro instruction located at the current program counter. Returns whether
      * the interpreter should keep running.
-     * @param code The macro code to execute.
+     * @param offset Offset to start execution at.
      * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
      * previous instruction.
      */
-    bool Step(const std::vector<u32>& code, bool is_delay_slot);
+    bool Step(u32 offset, bool is_delay_slot);
 
     /// Calculates the result of an ALU operation. src_a OP src_b;
     u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) const;
@@ -127,7 +127,7 @@ private:
     bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
 
     /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(const std::vector<u32>& code) const;
+    Opcode GetOpcode(u32 offset) const;
 
     /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
     u32 GetRegister(u32 register_id) const;