shader_recompiler: Improvements to buffer addressing implementation. #2123

Merged
merged 1 commit on Jan 17, 2025
96 changes: 55 additions & 41 deletions src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -164,8 +164,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
}

void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
const auto& mtbuf = inst.control.mtbuf;
const bool is_ring = mtbuf.glc && mtbuf.slc;
const auto& mubuf = inst.control.mubuf;
const bool is_ring = mubuf.glc && mubuf.slc;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
const IR::Value soffset{GetSrc(inst.src[3])};
@@ -178,22 +178,23 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
if (is_ring) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
}
if (mtbuf.idxen && mtbuf.offen) {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mtbuf.idxen || mtbuf.offen) {
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();

IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mtbuf.idxen);
buffer_info.offset_enable.Assign(mtbuf.offen);
buffer_info.inst_offset.Assign(mtbuf.offset);
buffer_info.globally_coherent.Assign(mtbuf.glc);
buffer_info.system_coherent.Assign(mtbuf.slc);
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -220,32 +221,38 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
const auto& mubuf = inst.control.mubuf;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
const IR::Value address = [&] -> IR::Value {
if (mubuf.idxen) {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");

IR::BufferInstInfo info{};
info.index_enable.Assign(mubuf.idxen);
IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);

const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
const IR::Value value = ir.LoadBufferFormat(handle, address, info);
const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
const IR::VectorReg dst_reg{inst.src[1].code};
for (u32 i = 0; i < num_dwords; i++) {
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
}
}

void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
const auto& mtbuf = inst.control.mtbuf;
const bool is_ring = mtbuf.glc && mtbuf.slc;
const auto& mubuf = inst.control.mubuf;
const bool is_ring = mubuf.glc && mubuf.slc;
const IR::VectorReg vaddr{inst.src[0].code};
const IR::ScalarReg sharp{inst.src[2].code * 4};
const IR::Value soffset{GetSrc(inst.src[3])};
@@ -259,22 +266,23 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
if (is_ring) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
}
if (mtbuf.idxen && mtbuf.offen) {
if (mubuf.idxen && mubuf.offen) {
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
}
if (mtbuf.idxen || mtbuf.offen) {
if (mubuf.idxen || mubuf.offen) {
return ir.GetVectorReg(vaddr);
}
return {};
}();

IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mtbuf.idxen);
buffer_info.offset_enable.Assign(mtbuf.offen);
buffer_info.inst_offset.Assign(mtbuf.offset);
buffer_info.globally_coherent.Assign(mtbuf.glc);
buffer_info.system_coherent.Assign(mtbuf.slc);
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);
if (is_typed) {
const auto& mtbuf = inst.control.mtbuf;
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -321,8 +329,12 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
const IR::Value soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");

IR::BufferInstInfo info{};
info.index_enable.Assign(mubuf.idxen);
IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);

const IR::VectorReg src_reg{inst.src[1].code};

@@ -338,7 +350,7 @@
const IR::Value handle =
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
ir.StoreBufferFormat(handle, address, value, info);
ir.StoreBufferFormat(handle, address, value, buffer_info);
}

void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
@@ -358,10 +370,12 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
const IR::U32 soffset{GetSrc(inst.src[3])};
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");

IR::BufferInstInfo info{};
info.index_enable.Assign(mubuf.idxen);
info.inst_offset.Assign(mubuf.offset);
info.offset_enable.Assign(mubuf.offen);
IR::BufferInstInfo buffer_info{};
buffer_info.index_enable.Assign(mubuf.idxen);
buffer_info.offset_enable.Assign(mubuf.offen);
buffer_info.inst_offset.Assign(mubuf.offset);
buffer_info.globally_coherent.Assign(mubuf.glc);
buffer_info.system_coherent.Assign(mubuf.slc);

IR::Value vdata_val = ir.GetVectorReg<Shader::IR::U32>(vdata);
const IR::Value handle =
@@ -371,27 +385,27 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
const IR::Value original_val = [&] {
switch (op) {
case AtomicOp::Swap:
return ir.BufferAtomicSwap(handle, address, vdata_val, info);
return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info);
case AtomicOp::Add:
return ir.BufferAtomicIAdd(handle, address, vdata_val, info);
return ir.BufferAtomicIAdd(handle, address, vdata_val, buffer_info);
case AtomicOp::Smin:
return ir.BufferAtomicIMin(handle, address, vdata_val, true, info);
return ir.BufferAtomicIMin(handle, address, vdata_val, true, buffer_info);
case AtomicOp::Umin:
return ir.BufferAtomicIMin(handle, address, vdata_val, false, info);
return ir.BufferAtomicIMin(handle, address, vdata_val, false, buffer_info);
case AtomicOp::Smax:
return ir.BufferAtomicIMax(handle, address, vdata_val, true, info);
return ir.BufferAtomicIMax(handle, address, vdata_val, true, buffer_info);
case AtomicOp::Umax:
return ir.BufferAtomicIMax(handle, address, vdata_val, false, info);
return ir.BufferAtomicIMax(handle, address, vdata_val, false, buffer_info);
case AtomicOp::And:
return ir.BufferAtomicAnd(handle, address, vdata_val, info);
return ir.BufferAtomicAnd(handle, address, vdata_val, buffer_info);
case AtomicOp::Or:
return ir.BufferAtomicOr(handle, address, vdata_val, info);
return ir.BufferAtomicOr(handle, address, vdata_val, buffer_info);
case AtomicOp::Xor:
return ir.BufferAtomicXor(handle, address, vdata_val, info);
return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info);
case AtomicOp::Inc:
return ir.BufferAtomicInc(handle, address, vdata_val, info);
return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info);
case AtomicOp::Dec:
return ir.BufferAtomicDec(handle, address, vdata_val, info);
return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info);
default:
UNREACHABLE();
}
12 changes: 9 additions & 3 deletions src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
@@ -222,9 +222,15 @@ void FoldMul(IR::Block& block, IR::Inst& inst) {
return;
}
const IR::Value rhs{inst.Arg(1)};
if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
inst.ReplaceUsesWithAndRemove(IR::Value(0u));
return;
if (rhs.IsImmediate()) {
if (Arg<T>(rhs) == 0) {
inst.ReplaceUsesWithAndRemove(IR::Value(0u));
return;
}
if (Arg<T>(rhs) == 1) {
inst.ReplaceUsesWithAndRemove(inst.Arg(0));
return;
}
}
}

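For context, a minimal standalone sketch of the identities FoldMul now applies (plain integers, not the pass's actual IR types): a multiply by an immediate 0 collapses to 0, and a multiply by an immediate 1 collapses to its other operand. The x * 1 case presumably matters for the addressing changes below, where CalculateBufferAddress is called with a stride of 1 for texture buffers and would otherwise leave a redundant multiply in the emitted IR.

#include <cstdint>
#include <optional>

// Illustration only (not the project's API): the two multiply-by-immediate
// identities applied by the pass.
std::optional<uint32_t> FoldMulImm(uint32_t lhs, std::optional<uint32_t> rhs_imm) {
    if (!rhs_imm) {
        return std::nullopt; // rhs is not an immediate, nothing to fold
    }
    if (*rhs_imm == 0) {
        return 0u;           // x * 0 -> 0
    }
    if (*rhs_imm == 1) {
        return lhs;          // x * 1 -> x (the case added in this change)
    }
    return std::nullopt;     // other constants are left to normal evaluation
}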
103 changes: 64 additions & 39 deletions src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -483,64 +483,89 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
inst.SetArg(1, ir.Imm32(binding));
}

IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
const AmdGpu::Buffer& buffer, u32 stride) {
const auto inst_info = inst.Flags<IR::BufferInstInfo>();

// index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
IR::U32 index = ir.Imm32(0U);
if (inst_info.index_enable) {
const IR::U32 vgpr_index{inst_info.offset_enable
? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
: IR::U32{inst.Arg(1)}};
index = ir.IAdd(index, vgpr_index);
}
if (buffer.add_tid_enable) {
ASSERT_MSG(info.l_stage == LogicalStage::Compute,
"Thread ID buffer addressing is not supported outside of compute.");
const IR::U32 thread_id{ir.LaneId()};
index = ir.IAdd(index, thread_id);
}
// offset = (inst_offen ? vgpr_offset : 0) + inst_offset
IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
if (inst_info.offset_enable) {
const IR::U32 vgpr_offset = inst_info.index_enable
? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
: IR::U32{inst.Arg(1)};
offset = ir.IAdd(offset, vgpr_offset);
}
const IR::U32 const_stride = ir.Imm32(stride);
IR::U32 buffer_offset;
if (buffer.swizzle_enable) {
const IR::U32 const_index_stride = ir.Imm32(buffer.GetIndexStride());
const IR::U32 const_element_size = ir.Imm32(buffer.GetElementSize());
// index_msb = index / const_index_stride
const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
// index_lsb = index % const_index_stride
const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
// offset_msb = offset / const_element_size
const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
// offset_lsb = offset % const_element_size
const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
// buffer_offset =
// (index_msb * const_stride + offset_msb * const_element_size) * const_index_stride
// + index_lsb * const_element_size + offset_lsb
const IR::U32 buffer_offset_msb = ir.IMul(
ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
const_index_stride);
const IR::U32 buffer_offset_lsb =
ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
} else {
// buffer_offset = index * const_stride + offset
buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
}
return buffer_offset;
}

void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
const auto handle = inst.Arg(0);
const auto buffer_res = info.buffers[handle.U32()];
const auto buffer = buffer_res.GetSharp(info);

ASSERT(!buffer.add_tid_enable);

// Address of constant buffer reads can be calculated at IR emission time.
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
return;
}

IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
const auto inst_info = inst.Flags<IR::BufferInstInfo>();

const IR::U32 index_stride = ir.Imm32(buffer.index_stride);
const IR::U32 element_size = ir.Imm32(buffer.element_size);

// Compute address of the buffer using the stride.
IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
if (inst_info.index_enable) {
const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
: IR::U32{inst.Arg(1)};
if (buffer.swizzle_enable) {
const IR::U32 stride_index_stride =
ir.Imm32(static_cast<u32>(buffer.stride * buffer.index_stride));
const IR::U32 index_msb = ir.IDiv(index, index_stride);
const IR::U32 index_lsb = ir.IMod(index, index_stride);
address = ir.IAdd(address, ir.IAdd(ir.IMul(index_msb, stride_index_stride),
ir.IMul(index_lsb, element_size)));
} else {
address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
}
}
if (inst_info.offset_enable) {
const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
: IR::U32{inst.Arg(1)};
if (buffer.swizzle_enable) {
const IR::U32 element_size_index_stride =
ir.Imm32(buffer.element_size * buffer.index_stride);
const IR::U32 offset_msb = ir.IDiv(offset, element_size);
const IR::U32 offset_lsb = ir.IMod(offset, element_size);
address = ir.IAdd(address,
ir.IAdd(ir.IMul(offset_msb, element_size_index_stride), offset_lsb));
} else {
address = ir.IAdd(address, offset);
}
}
inst.SetArg(1, address);
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
}

void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
const auto handle = inst.Arg(0);
const auto buffer_res = info.texture_buffers[handle.U32()];
const auto buffer = buffer_res.GetSharp(info);

ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
// Only linear addressing with index is supported currently, since we cannot yet
// address with sub-texel granularity.
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0,
"Unsupported texture buffer address mode.");

IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
// Stride of 1 to get an index into formatted data. See above addressing limitations.
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U));

if (inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32) {
const auto swizzled = ApplySwizzle(ir, inst.Arg(2), buffer.DstSelect());
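To make the swizzle_enable branch of CalculateBufferAddress above easier to follow, here is a self-contained sketch of the same arithmetic on plain integers, with one worked example. The concrete values are illustrative only and are not taken from the change.

#include <cstdint>
#include <cstdio>

// Same formula as the swizzled path above, expressed on plain integers.
uint32_t SwizzledBufferOffset(uint32_t index, uint32_t offset, uint32_t stride,
                              uint32_t index_stride, uint32_t element_size) {
    const uint32_t index_msb = index / index_stride;   // which swizzle "row"
    const uint32_t index_lsb = index % index_stride;   // position within the row
    const uint32_t offset_msb = offset / element_size;
    const uint32_t offset_lsb = offset % element_size;
    return (index_msb * stride + offset_msb * element_size) * index_stride +
           index_lsb * element_size + offset_lsb;
}

int main() {
    // Example: stride = 16 bytes, index_stride = 16, element_size = 4,
    // index = 35, offset = 6:
    //   index_msb = 2, index_lsb = 3, offset_msb = 1, offset_lsb = 2
    //   -> (2 * 16 + 1 * 4) * 16 + 3 * 4 + 2 = 590
    // The linear (non-swizzled) path would instead give 35 * 16 + 6 = 566.
    std::printf("%u\n", SwizzledBufferOffset(35, 6, 16, 16, 4));
    return 0;
}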
11 changes: 11 additions & 0 deletions src/shader_recompiler/specialization.h
@@ -21,10 +21,16 @@ struct VsAttribSpecialization {
struct BufferSpecialization {
u16 stride : 14;
u16 is_storage : 1;
u16 swizzle_enable : 1;
u8 index_stride : 2 = 0;
u8 element_size : 2 = 0;
u32 size = 0;

bool operator==(const BufferSpecialization& other) const {
return stride == other.stride && is_storage == other.is_storage &&
swizzle_enable == other.swizzle_enable &&
(!swizzle_enable ||
(index_stride == other.index_stride && element_size == other.element_size)) &&
(size >= other.size || is_storage);
}
};
@@ -101,6 +107,11 @@ struct StageSpecialization {
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.stride = sharp.GetStride();
spec.is_storage = desc.IsStorage(sharp);
spec.swizzle_enable = sharp.swizzle_enable;
if (spec.swizzle_enable) {
spec.index_stride = sharp.index_stride;
spec.element_size = sharp.element_size;
}
if (!spec.is_storage) {
spec.size = sharp.GetSize();
}
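As a rough sketch of what the new comparison buys (standalone code mirroring the intent rather than the exact struct above): when a sharp has swizzling disabled, its index_stride and element_size fields no longer force a distinct shader specialization.

#include <cstdint>

// Simplified stand-in for BufferSpecialization, illustration only.
struct SpecKey {
    uint32_t stride;
    bool swizzle_enable;
    uint32_t index_stride;
    uint32_t element_size;
};

// Swizzle parameters only participate in the match when swizzling is enabled.
bool SameSpecialization(const SpecKey& a, const SpecKey& b) {
    return a.stride == b.stride && a.swizzle_enable == b.swizzle_enable &&
           (!a.swizzle_enable || (a.index_stride == b.index_stride &&
                                  a.element_size == b.element_size));
}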
10 changes: 10 additions & 0 deletions src/video_core/amdgpu/resource.h
@@ -76,6 +76,16 @@ struct Buffer {
u32 GetSize() const noexcept {
return stride == 0 ? num_records : (stride * num_records);
}

u32 GetIndexStride() const noexcept {
// Index stride is 2 bits, meaning 8, 16, 32, or 64.
return 8 << index_stride;
}

u32 GetElementSize() const noexcept {
// Element size is 2 bits, meaning 2, 4, 8, or 16.
return 2 << element_size;
}
};
static_assert(sizeof(Buffer) == 16); // 128bits

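For reference, a quick sanity check of the two 2-bit decodings added above, written as a standalone sketch rather than project code:

#include <cstdint>

// index_stride bits: 0 -> 8, 1 -> 16, 2 -> 32, 3 -> 64
constexpr uint32_t DecodeIndexStride(uint32_t bits) {
    return 8u << bits;
}

// element_size bits: 0 -> 2, 1 -> 4, 2 -> 8, 3 -> 16
constexpr uint32_t DecodeElementSize(uint32_t bits) {
    return 2u << bits;
}

static_assert(DecodeIndexStride(0) == 8 && DecodeIndexStride(3) == 64);
static_assert(DecodeElementSize(0) == 2 && DecodeElementSize(3) == 16);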