From 3337b846204c3d18fde4e28ad1558f5e73532ccc Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Fri, 2 Jan 2015 18:21:45 -0500
Subject: [PATCH] dyncom: Implement SMLAD/SMUAD/SMLSD/SMUSD

---
 .../arm/dyncom/arm_dyncom_interpreter.cpp     | 117 +++++++++++-------
 src/core/arm/interpreter/armemu.cpp           |   6 +-
 src/core/arm/interpreter/armsupp.cpp          |   8 +-
 src/core/arm/skyeye_common/armdefs.h          |   2 +
 src/core/arm/skyeye_common/armemu.h           |   1 -
 5 files changed, 84 insertions(+), 50 deletions(-)

diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index 7ba82503d..c5e885bcd 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -930,6 +930,8 @@ typedef struct _smlad_inst {
     unsigned int Rd;
     unsigned int Ra;
     unsigned int Rn;
+    unsigned int op1;
+    unsigned int op2;
 } smlad_inst;
 
 typedef struct _smla_inst {
@@ -2313,25 +2315,40 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(smla)(unsigned int inst, int index)
 
     return inst_base;
 }
-ARM_INST_PTR INTERPRETER_TRANSLATE(smlad)(unsigned int inst, int index){
-    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(smlad_inst));
-    smlad_inst *inst_cream = (smlad_inst *)inst_base->component;
 
-    inst_base->cond  = BITS(inst, 28, 31);
-    inst_base->idx     = index;
-    inst_base->br     = NON_BRANCH;
+ARM_INST_PTR INTERPRETER_TRANSLATE(smlad)(unsigned int inst, int index)
+{
+    arm_inst* const inst_base = (arm_inst*)AllocBuffer(sizeof(arm_inst) + sizeof(smlad_inst));
+    smlad_inst* const inst_cream = (smlad_inst*)inst_base->component;
+    // Decodes the fields shared by SMLAD, SMLSD, SMUAD and SMUSD; their translators all call this one.
+    inst_base->cond     = BITS(inst, 28, 31);
+    inst_base->idx      = index;
+    inst_base->br       = NON_BRANCH;
     inst_base->load_r15 = 0;
 
-    inst_cream->m     = BIT(inst, 4);
-    inst_cream->Rn     = BITS(inst, 0, 3);
-    inst_cream->Rm     = BITS(inst, 8, 11);
-    inst_cream->Rd = BITS(inst, 16, 19);
-    inst_cream->Ra = BITS(inst, 12, 15);
+    inst_cream->m   = BIT(inst, 5);         // when set, the executor swaps Rm's halfwords
+    inst_cream->Rn  = BITS(inst, 0, 3);
+    inst_cream->Rm  = BITS(inst, 8, 11);
+    inst_cream->Rd  = BITS(inst, 16, 19);
+    inst_cream->Ra  = BITS(inst, 12, 15);   // the executor treats 15 as "no accumulate"
+    inst_cream->op1 = BITS(inst, 20, 22);
+    inst_cream->op2 = BITS(inst, 5, 7);     // bit 1 selects the subtracting forms (SMLSD/SMUSD)
 
-    if (CHECK_RM ) 
-        inst_base->load_r15 = 1;
     return inst_base;
 }
+ARM_INST_PTR INTERPRETER_TRANSLATE(smuad)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(smlad)(inst, index); // SMUAD reuses the SMLAD decoder and smlad_inst layout
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(smusd)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(smlad)(inst, index); // SMUSD reuses the SMLAD decoder and smlad_inst layout
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(smlsd)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(smlad)(inst, index); // SMLSD reuses the SMLAD decoder and smlad_inst layout
+}
+
 ARM_INST_PTR INTERPRETER_TRANSLATE(smlal)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(umlal_inst));
@@ -2355,12 +2372,10 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(smlal)(unsigned int inst, int index)
 ARM_INST_PTR INTERPRETER_TRANSLATE(smlalxy)(unsigned int inst, int index) { UNIMPLEMENTED_INSTRUCTION("SMLALXY"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smlald)(unsigned int inst, int index)  { UNIMPLEMENTED_INSTRUCTION("SMLALD"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smlaw)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMLAW"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(smlsd)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMLSD"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smlsld)(unsigned int inst, int index)  { UNIMPLEMENTED_INSTRUCTION("SMLSLD"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smmla)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMMLA"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smmls)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMMLS"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smmul)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMMUL"); }
-ARM_INST_PTR INTERPRETER_TRANSLATE(smuad)(unsigned int inst, int index)   { UNIMPLEMENTED_INSTRUCTION("SMUAD"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(smul)(unsigned int inst, int index)
 {
     arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(smul_inst));
@@ -2423,7 +2438,6 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(smulw)(unsigned int inst, int index)
         inst_base->load_r15 = 1;
     return inst_base;
 }
-ARM_INST_PTR INTERPRETER_TRANSLATE(smusd)(unsigned int inst, int index)    { UNIMPLEMENTED_INSTRUCTION("SMUSD"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(srs)(unsigned int inst, int index)      { UNIMPLEMENTED_INSTRUCTION("SRS"); }
 ARM_INST_PTR INTERPRETER_TRANSLATE(ssat)(unsigned int inst, int index)
 {
@@ -5382,44 +5396,59 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
         FETCH_INST;
         GOTO_NEXT_INST;
     }
+
     SMLAD_INST:
+    SMLSD_INST:
+    SMUAD_INST:
+    SMUSD_INST:
     {
-        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
-            smlad_inst *inst_cream = (smlad_inst *)inst_base->component;
-            long long int rm = cpu->Reg[inst_cream->Rm];
-            long long int rn = cpu->Reg[inst_cream->Rn];
-            long long int ra = cpu->Reg[inst_cream->Ra];
+        if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
+            smlad_inst* const inst_cream = (smlad_inst*)inst_base->component;
+            const u8 op2 = inst_cream->op2;
 
-            // See SMUAD
-            if(inst_cream->Ra == 15)
-                CITRA_IGNORE_EXIT(-1);
-            int operand2 = (inst_cream->m)? ROTATE_RIGHT_32(rm, 16):rm;
-            int half_rn, half_operand2;
+            u32 rm_val = cpu->Reg[inst_cream->Rm];
+            const u32 rn_val = cpu->Reg[inst_cream->Rn];
 
-            half_rn = rn & 0xFFFF;
-            half_rn = (half_rn & 0x8000)? (0xFFFF0000|half_rn) : half_rn;
+            if (inst_cream->m)
+                rm_val = (((rm_val & 0xFFFF) << 16) | (rm_val >> 16));
 
-            half_operand2 = operand2 & 0xFFFF;
-            half_operand2 = (half_operand2 & 0x8000)? (0xFFFF0000|half_operand2) : half_operand2;
+            const s16 rm_lo = (rm_val & 0xFFFF);
+            const s16 rm_hi = ((rm_val >> 16) & 0xFFFF);
+            const s16 rn_lo = (rn_val & 0xFFFF);
+            const s16 rn_hi = ((rn_val >> 16) & 0xFFFF);
 
-            long long int product1 = half_rn * half_operand2;
+            const u32 product1 = (rn_lo * rm_lo);
+            const u32 product2 = (rn_hi * rm_hi);
 
-            half_rn = (rn & 0xFFFF0000) >> 16;
-            half_rn = (half_rn & 0x8000)? (0xFFFF0000|half_rn) : half_rn;
+            // Read the accumulator before RD is written: Rd and Ra may name the same
+            // register. Ra == 15 encodes the non-accumulating forms (SMUAD/SMUSD), and
+            // adding zero can never report an overflow.
+            const u32 ra_val = (inst_cream->Ra != 15) ? cpu->Reg[inst_cream->Ra] : 0;
 
-            half_operand2 = (operand2 & 0xFFFF0000) >> 16;
-            half_operand2 = (half_operand2 & 0x8000)? (0xFFFF0000|half_operand2) : half_operand2;
+            // SMUAD and SMLAD
+            if (BIT(op2, 1) == 0) {
+                RD = (product1 + product2 + ra_val);
 
-            long long int product2 = half_rn * half_operand2;
+                // Bit 27 of the CPSR is the sticky Q flag.
+                if (ARMul_AddOverflowQ(product1 + product2, ra_val))
+                    cpu->Cpsr |= (1 << 27);
 
-            long long int signed_ra = (ra & 0x80000000)? (0xFFFFFFFF00000000LL) | ra : ra;
-            long long int result = product1 + product2 + signed_ra;
-            cpu->Reg[inst_cream->Rd] = result & 0xFFFFFFFF;
+                if (ARMul_AddOverflowQ(product1, product2))
+                    cpu->Cpsr |= (1 << 27);
+            }
+            // SMUSD and SMLSD
+            else {
+                RD = (product1 - product2 + ra_val);
 
-            // TODO: FIXME should check Signed overflow
+                // product1 - product2 always fits in 32 bits, so only the
+                // accumulate step can overflow.
+                if (ARMul_AddOverflowQ(product1 - product2, ra_val))
+                    cpu->Cpsr |= (1 << 27);
+            }
         }
+
         cpu->Reg[15] += GET_INST_SIZE(cpu);
-        INC_PC(sizeof(umlal_inst));
+        INC_PC(sizeof(smlad_inst));
         FETCH_INST;
         GOTO_NEXT_INST;
     }
@@ -5452,15 +5481,15 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
         FETCH_INST;
         GOTO_NEXT_INST;
     }
+
     SMLALXY_INST:
     SMLALD_INST:
     SMLAW_INST:
-    SMLSD_INST:
     SMLSLD_INST:
     SMMLA_INST:
     SMMLS_INST:
     SMMUL_INST:
-    SMUAD_INST:
+
     SMUL_INST:
     {
         if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
@@ -5528,8 +5557,8 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
         GOTO_NEXT_INST;
     }
 
-    SMUSD_INST:
     SRS_INST:
+
     SSAT_INST:
     {
         if (inst_base->cond == 0xE || CondPassed(cpu, inst_base->cond)) {
diff --git a/src/core/arm/interpreter/armemu.cpp b/src/core/arm/interpreter/armemu.cpp
index 43b1ba40e..40e4837d8 100644
--- a/src/core/arm/interpreter/armemu.cpp
+++ b/src/core/arm/interpreter/armemu.cpp
@@ -6470,10 +6470,12 @@ L_stm_s_takeabort:
 
                     if (BITS(12, 15) != 15) {
                         state->Reg[rd_idx] += state->Reg[ra_idx];
-                        ARMul_AddOverflowQ(state, product1 + product2, state->Reg[ra_idx]);
+                        if (ARMul_AddOverflowQ(product1 + product2, state->Reg[ra_idx]))
+                            SETQ;
                     }
 
-                    ARMul_AddOverflowQ(state, product1, product2);
+                    if (ARMul_AddOverflowQ(product1, product2))
+                        SETQ;
                 }
                 // SMUSD and SMLSD
                 else {
diff --git a/src/core/arm/interpreter/armsupp.cpp b/src/core/arm/interpreter/armsupp.cpp
index 426b67831..eec34143e 100644
--- a/src/core/arm/interpreter/armsupp.cpp
+++ b/src/core/arm/interpreter/armsupp.cpp
@@ -453,12 +453,14 @@ ARMul_AddOverflow (ARMul_State * state, ARMword a, ARMword b, ARMword result)
     ASSIGNV (AddOverflow (a, b, result));
 }
 
-/* Assigns the Q flag if the given result is considered an overflow from the addition of a and b  */
-void ARMul_AddOverflowQ(ARMul_State* state, ARMword a, ARMword b)
+// Returns true if a + b overflows as a signed 32-bit addition, i.e. the caller should set the Q flag.
+bool ARMul_AddOverflowQ(ARMword a, ARMword b)
 {
     u32 result = a + b;
     if (((result ^ a) & (u32)0x80000000) && ((a ^ b) & (u32)0x80000000) == 0)
-        SETQ;
+        return true;
+
+    return false;
 }
 
 /* Assigns the C flag after an subtraction of a and b to give result.  */
diff --git a/src/core/arm/skyeye_common/armdefs.h b/src/core/arm/skyeye_common/armdefs.h
index 8611d7392..c2c78cd5a 100644
--- a/src/core/arm/skyeye_common/armdefs.h
+++ b/src/core/arm/skyeye_common/armdefs.h
@@ -790,6 +790,8 @@ extern void ARMul_FixSPSR(ARMul_State*, ARMword, ARMword);
 extern void ARMul_ConsolePrint(ARMul_State*, const char*, ...);
 extern void ARMul_SelectProcessor(ARMul_State*, unsigned);
 
+extern bool ARMul_AddOverflowQ(ARMword, ARMword);
+
 extern u8 ARMul_SignedSaturatedAdd8(u8, u8);
 extern u8 ARMul_SignedSaturatedSub8(u8, u8);
 extern u16 ARMul_SignedSaturatedAdd16(u16, u16);
diff --git a/src/core/arm/skyeye_common/armemu.h b/src/core/arm/skyeye_common/armemu.h
index 3ea14b5a3..e1b286f0f 100644
--- a/src/core/arm/skyeye_common/armemu.h
+++ b/src/core/arm/skyeye_common/armemu.h
@@ -602,7 +602,6 @@ extern ARMword ARMul_SwitchMode (ARMul_State *, ARMword, ARMword);
 extern void ARMul_MSRCpsr (ARMul_State *, ARMword, ARMword);
 extern void ARMul_SubOverflow (ARMul_State *, ARMword, ARMword, ARMword);
 extern void ARMul_AddOverflow (ARMul_State *, ARMword, ARMword, ARMword);
-extern void ARMul_AddOverflowQ(ARMul_State*, ARMword, ARMword);
 extern void ARMul_SubCarry (ARMul_State *, ARMword, ARMword, ARMword);
 extern void ARMul_AddCarry (ARMul_State *, ARMword, ARMword, ARMword);
 extern tdstate ARMul_ThumbDecode (ARMul_State *, ARMword, ARMword, ARMword *);