1
0
mirror of https://git.suyu.dev/suyu/suyu synced 2025-01-16 04:40:12 -06:00

shader: Split SSY and PBK stack

Hardware testing revealed that SSY and PBK push to a different stack,
allowing code like this:

        SSY label1;
        PBK label2;
        SYNC;
label1: PBK;
label2: EXIT;
This commit is contained in:
ReinUsesLisp 2019-06-02 18:52:07 -03:00
parent cd2d9628c9
commit fe8e6618f2
4 changed files with 78 additions and 27 deletions

View File

@ -143,6 +143,24 @@ u32 GetGenericAttributeIndex(Attribute::Index index) {
return static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
}
constexpr const char* GetFlowStackPrefix(MetaStackClass stack) {
switch (stack) {
case MetaStackClass::Ssy:
return "ssy";
case MetaStackClass::Pbk:
return "pbk";
}
return {};
}
std::string FlowStackName(MetaStackClass stack) {
return fmt::format("{}_flow_stack", GetFlowStackPrefix(stack));
}
std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}
class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
@ -173,8 +191,10 @@ public:
// TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
// unlikely that shaders will use 20 nested SSYs and PBKs.
constexpr u32 FLOW_STACK_SIZE = 20;
code.AddLine("uint flow_stack[{}];", FLOW_STACK_SIZE);
code.AddLine("uint flow_stack_top = 0u;");
for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
}
code.AddLine("while (true) {{");
++code.scope;
@ -1438,15 +1458,18 @@ private:
}
std::string PushFlowStack(Operation operation) {
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
UNIMPLEMENTED_IF(!target);
code.AddLine("flow_stack[flow_stack_top++] = 0x{:x}u;", target->GetValue());
code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
target->GetValue());
return {};
}
std::string PopFlowStack(Operation operation) {
code.AddLine("jmp_to = flow_stack[--flow_stack_top];");
const auto stack = std::get<MetaStackClass>(operation.GetMeta());
code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
code.AddLine("break;");
return {};
}

View File

@ -132,20 +132,16 @@ public:
branch_labels.push_back(label);
}
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
// that shaders will use 20 nested SSYs and PBKs.
constexpr u32 FLOW_STACK_SIZE = 20;
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
jmp_to = Emit(OpVariable(TypePointer(spv::StorageClass::Function, t_uint),
spv::StorageClass::Function, Constant(t_uint, first_address)));
flow_stack = Emit(OpVariable(TypePointer(spv::StorageClass::Function, flow_stack_type),
spv::StorageClass::Function, ConstantNull(flow_stack_type)));
flow_stack_top =
Emit(OpVariable(t_func_uint, spv::StorageClass::Function, Constant(t_uint, 0)));
std::tie(ssy_flow_stack, ssy_flow_stack_top) = CreateFlowStack();
std::tie(pbk_flow_stack, pbk_flow_stack_top) = CreateFlowStack();
Name(jmp_to, "jmp_to");
Name(flow_stack, "flow_stack");
Name(flow_stack_top, "flow_stack_top");
Name(ssy_flow_stack, "ssy_flow_stack");
Name(ssy_flow_stack_top, "ssy_flow_stack_top");
Name(pbk_flow_stack, "pbk_flow_stack");
Name(pbk_flow_stack_top, "pbk_flow_stack_top");
Emit(OpBranch(loop_label));
Emit(loop_label);
@ -952,6 +948,7 @@ private:
const auto target = std::get_if<ImmediateNode>(&*operation[0]);
ASSERT(target);
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
const Id next = Emit(OpIAdd(t_uint, current, Constant(t_uint, 1)));
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, current));
@ -962,6 +959,7 @@ private:
}
Id PopFlowStack(Operation operation) {
const auto [flow_stack, flow_stack_top] = GetFlowStack(operation);
const Id current = Emit(OpLoad(t_uint, flow_stack_top));
const Id previous = Emit(OpISub(t_uint, current, Constant(t_uint, 1)));
const Id access = Emit(OpAccessChain(t_func_uint, flow_stack, previous));
@ -1172,6 +1170,31 @@ private:
Emit(skip_label);
}
std::tuple<Id, Id> CreateFlowStack() {
// TODO(Rodrigo): Figure out the actual depth of the flow stack, for now it seems unlikely
// that shaders will use 20 nested SSYs and PBKs.
constexpr u32 FLOW_STACK_SIZE = 20;
constexpr auto storage_class = spv::StorageClass::Function;
const Id flow_stack_type = TypeArray(t_uint, Constant(t_uint, FLOW_STACK_SIZE));
const Id stack = Emit(OpVariable(TypePointer(storage_class, flow_stack_type), storage_class,
ConstantNull(flow_stack_type)));
const Id top = Emit(OpVariable(t_func_uint, storage_class, Constant(t_uint, 0)));
return std::tie(stack, top);
}
std::pair<Id, Id> GetFlowStack(Operation operation) {
const auto stack_class = std::get<MetaStackClass>(operation.GetMeta());
switch (stack_class) {
case MetaStackClass::Ssy:
return {ssy_flow_stack, ssy_flow_stack_top};
case MetaStackClass::Pbk:
return {pbk_flow_stack, pbk_flow_stack_top};
}
UNREACHABLE();
return {};
}
static constexpr OperationDecompilersArray operation_decompilers = {
&SPIRVDecompiler::Assign,
@ -1414,8 +1437,10 @@ private:
Id execute_function{};
Id jmp_to{};
Id flow_stack_top{};
Id flow_stack{};
Id ssy_flow_stack_top{};
Id pbk_flow_stack_top{};
Id ssy_flow_stack{};
Id pbk_flow_stack{};
Id continue_label{};
std::map<u32, Id> labels;
};

View File

@ -109,22 +109,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
"Constant buffer flow is not supported");
// The SSY opcode tells the GPU where to re-converge divergent execution paths, it sets the
// target of the jump that the SYNC instruction will make. The SSY opcode has a similar
// structure to the BRA opcode.
// The SSY opcode tells the GPU where to re-converge divergent execution paths with SYNC.
const u32 target = pc + instr.bra.GetBranchTarget();
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
bb.push_back(
Operation(OperationCode::PushFlowStack, MetaStackClass::Ssy, Immediate(target)));
break;
}
case OpCode::Id::PBK: {
UNIMPLEMENTED_IF_MSG(instr.bra.constant_buffer != 0,
"Constant buffer PBK is not supported");
// PBK pushes to a stack the address where BRK will jump to. This shares stack with SSY but
// using SYNC on a PBK address will kill the shader execution. We don't emulate this because
// it's very unlikely a driver will emit such invalid shader.
// PBK pushes to a stack the address where BRK will jump to.
const u32 target = pc + instr.bra.GetBranchTarget();
bb.push_back(Operation(OperationCode::PushFlowStack, Immediate(target)));
bb.push_back(
Operation(OperationCode::PushFlowStack, MetaStackClass::Pbk, Immediate(target)));
break;
}
case OpCode::Id::SYNC: {
@ -133,7 +131,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
static_cast<u32>(cc));
// The SYNC opcode jumps to the address previously set by the SSY opcode
bb.push_back(Operation(OperationCode::PopFlowStack));
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Ssy));
break;
}
case OpCode::Id::BRK: {
@ -142,7 +140,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
static_cast<u32>(cc));
// The BRK opcode jumps to the address previously set by the PBK opcode
bb.push_back(Operation(OperationCode::PopFlowStack));
bb.push_back(Operation(OperationCode::PopFlowStack, MetaStackClass::Pbk));
break;
}
case OpCode::Id::IPA: {

View File

@ -174,6 +174,11 @@ enum class InternalFlag {
Amount = 4,
};
enum class MetaStackClass {
Ssy,
Pbk,
};
class OperationNode;
class ConditionalNode;
class GprNode;
@ -285,7 +290,7 @@ struct MetaTexture {
};
/// Parameters that modify an operation but are not part of any particular operand
using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
using Meta = std::variant<MetaArithmetic, MetaTexture, MetaStackClass, Tegra::Shader::HalfType>;
/// Holds any kind of operation that can be done in the IR
class OperationNode final {