Skip to content

Commit d10250b

Browse files
committed
shader_recompiler: Improvements to buffer addressing implementation.
1 parent 4719d32 commit d10250b

File tree

5 files changed

+150
-84
lines changed

5 files changed

+150
-84
lines changed

src/shader_recompiler/frontend/translate/vector_memory.cpp

Lines changed: 55 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,8 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
164164
}
165165

166166
void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) {
167-
const auto& mtbuf = inst.control.mtbuf;
168-
const bool is_ring = mtbuf.glc && mtbuf.slc;
167+
const auto& mubuf = inst.control.mubuf;
168+
const bool is_ring = mubuf.glc && mubuf.slc;
169169
const IR::VectorReg vaddr{inst.src[0].code};
170170
const IR::ScalarReg sharp{inst.src[2].code * 4};
171171
const IR::Value soffset{GetSrc(inst.src[3])};
@@ -178,22 +178,23 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst)
178178
if (is_ring) {
179179
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
180180
}
181-
if (mtbuf.idxen && mtbuf.offen) {
181+
if (mubuf.idxen && mubuf.offen) {
182182
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
183183
}
184-
if (mtbuf.idxen || mtbuf.offen) {
184+
if (mubuf.idxen || mubuf.offen) {
185185
return ir.GetVectorReg(vaddr);
186186
}
187187
return {};
188188
}();
189189

190190
IR::BufferInstInfo buffer_info{};
191-
buffer_info.index_enable.Assign(mtbuf.idxen);
192-
buffer_info.offset_enable.Assign(mtbuf.offen);
193-
buffer_info.inst_offset.Assign(mtbuf.offset);
194-
buffer_info.globally_coherent.Assign(mtbuf.glc);
195-
buffer_info.system_coherent.Assign(mtbuf.slc);
191+
buffer_info.index_enable.Assign(mubuf.idxen);
192+
buffer_info.offset_enable.Assign(mubuf.offen);
193+
buffer_info.inst_offset.Assign(mubuf.offset);
194+
buffer_info.globally_coherent.Assign(mubuf.glc);
195+
buffer_info.system_coherent.Assign(mubuf.slc);
196196
if (is_typed) {
197+
const auto& mtbuf = inst.control.mtbuf;
197198
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
198199
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
199200
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -220,32 +221,38 @@ void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, const GcnInst& inst) {
220221
const auto& mubuf = inst.control.mubuf;
221222
const IR::VectorReg vaddr{inst.src[0].code};
222223
const IR::ScalarReg sharp{inst.src[2].code * 4};
223-
ASSERT_MSG(!mubuf.offen && mubuf.offset == 0, "Offsets for image buffers are not supported");
224224
const IR::Value address = [&] -> IR::Value {
225-
if (mubuf.idxen) {
225+
if (mubuf.idxen && mubuf.offen) {
226+
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
227+
}
228+
if (mubuf.idxen || mubuf.offen) {
226229
return ir.GetVectorReg(vaddr);
227230
}
228231
return {};
229232
}();
230233
const IR::Value soffset{GetSrc(inst.src[3])};
231234
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
232235

233-
IR::BufferInstInfo info{};
234-
info.index_enable.Assign(mubuf.idxen);
236+
IR::BufferInstInfo buffer_info{};
237+
buffer_info.index_enable.Assign(mubuf.idxen);
238+
buffer_info.offset_enable.Assign(mubuf.offen);
239+
buffer_info.inst_offset.Assign(mubuf.offset);
240+
buffer_info.globally_coherent.Assign(mubuf.glc);
241+
buffer_info.system_coherent.Assign(mubuf.slc);
235242

236243
const IR::Value handle =
237244
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
238245
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
239-
const IR::Value value = ir.LoadBufferFormat(handle, address, info);
246+
const IR::Value value = ir.LoadBufferFormat(handle, address, buffer_info);
240247
const IR::VectorReg dst_reg{inst.src[1].code};
241248
for (u32 i = 0; i < num_dwords; i++) {
242249
ir.SetVectorReg(dst_reg + i, IR::F32{ir.CompositeExtract(value, i)});
243250
}
244251
}
245252

246253
void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst) {
247-
const auto& mtbuf = inst.control.mtbuf;
248-
const bool is_ring = mtbuf.glc && mtbuf.slc;
254+
const auto& mubuf = inst.control.mubuf;
255+
const bool is_ring = mubuf.glc && mubuf.slc;
249256
const IR::VectorReg vaddr{inst.src[0].code};
250257
const IR::ScalarReg sharp{inst.src[2].code * 4};
251258
const IR::Value soffset{GetSrc(inst.src[3])};
@@ -259,22 +266,23 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst
259266
if (is_ring) {
260267
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset);
261268
}
262-
if (mtbuf.idxen && mtbuf.offen) {
269+
if (mubuf.idxen && mubuf.offen) {
263270
return ir.CompositeConstruct(ir.GetVectorReg(vaddr), ir.GetVectorReg(vaddr + 1));
264271
}
265-
if (mtbuf.idxen || mtbuf.offen) {
272+
if (mubuf.idxen || mubuf.offen) {
266273
return ir.GetVectorReg(vaddr);
267274
}
268275
return {};
269276
}();
270277

271278
IR::BufferInstInfo buffer_info{};
272-
buffer_info.index_enable.Assign(mtbuf.idxen);
273-
buffer_info.offset_enable.Assign(mtbuf.offen);
274-
buffer_info.inst_offset.Assign(mtbuf.offset);
275-
buffer_info.globally_coherent.Assign(mtbuf.glc);
276-
buffer_info.system_coherent.Assign(mtbuf.slc);
279+
buffer_info.index_enable.Assign(mubuf.idxen);
280+
buffer_info.offset_enable.Assign(mubuf.offen);
281+
buffer_info.inst_offset.Assign(mubuf.offset);
282+
buffer_info.globally_coherent.Assign(mubuf.glc);
283+
buffer_info.system_coherent.Assign(mubuf.slc);
277284
if (is_typed) {
285+
const auto& mtbuf = inst.control.mtbuf;
278286
const auto dmft = static_cast<AmdGpu::DataFormat>(mtbuf.dfmt);
279287
const auto nfmt = static_cast<AmdGpu::NumberFormat>(mtbuf.nfmt);
280288
ASSERT(nfmt == AmdGpu::NumberFormat::Float &&
@@ -321,8 +329,12 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
321329
const IR::Value soffset{GetSrc(inst.src[3])};
322330
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
323331

324-
IR::BufferInstInfo info{};
325-
info.index_enable.Assign(mubuf.idxen);
332+
IR::BufferInstInfo buffer_info{};
333+
buffer_info.index_enable.Assign(mubuf.idxen);
334+
buffer_info.offset_enable.Assign(mubuf.offen);
335+
buffer_info.inst_offset.Assign(mubuf.offset);
336+
buffer_info.globally_coherent.Assign(mubuf.glc);
337+
buffer_info.system_coherent.Assign(mubuf.slc);
326338

327339
const IR::VectorReg src_reg{inst.src[1].code};
328340

@@ -338,7 +350,7 @@ void Translator::BUFFER_STORE_FORMAT(u32 num_dwords, const GcnInst& inst) {
338350
const IR::Value handle =
339351
ir.CompositeConstruct(ir.GetScalarReg(sharp), ir.GetScalarReg(sharp + 1),
340352
ir.GetScalarReg(sharp + 2), ir.GetScalarReg(sharp + 3));
341-
ir.StoreBufferFormat(handle, address, value, info);
353+
ir.StoreBufferFormat(handle, address, value, buffer_info);
342354
}
343355

344356
void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
@@ -358,10 +370,12 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
358370
const IR::U32 soffset{GetSrc(inst.src[3])};
359371
ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported");
360372

361-
IR::BufferInstInfo info{};
362-
info.index_enable.Assign(mubuf.idxen);
363-
info.inst_offset.Assign(mubuf.offset);
364-
info.offset_enable.Assign(mubuf.offen);
373+
IR::BufferInstInfo buffer_info{};
374+
buffer_info.index_enable.Assign(mubuf.idxen);
375+
buffer_info.offset_enable.Assign(mubuf.offen);
376+
buffer_info.inst_offset.Assign(mubuf.offset);
377+
buffer_info.globally_coherent.Assign(mubuf.glc);
378+
buffer_info.system_coherent.Assign(mubuf.slc);
365379

366380
IR::Value vdata_val = ir.GetVectorReg<Shader::IR::U32>(vdata);
367381
const IR::Value handle =
@@ -371,27 +385,27 @@ void Translator::BUFFER_ATOMIC(AtomicOp op, const GcnInst& inst) {
371385
const IR::Value original_val = [&] {
372386
switch (op) {
373387
case AtomicOp::Swap:
374-
return ir.BufferAtomicSwap(handle, address, vdata_val, info);
388+
return ir.BufferAtomicSwap(handle, address, vdata_val, buffer_info);
375389
case AtomicOp::Add:
376-
return ir.BufferAtomicIAdd(handle, address, vdata_val, info);
390+
return ir.BufferAtomicIAdd(handle, address, vdata_val, buffer_info);
377391
case AtomicOp::Smin:
378-
return ir.BufferAtomicIMin(handle, address, vdata_val, true, info);
392+
return ir.BufferAtomicIMin(handle, address, vdata_val, true, buffer_info);
379393
case AtomicOp::Umin:
380-
return ir.BufferAtomicIMin(handle, address, vdata_val, false, info);
394+
return ir.BufferAtomicIMin(handle, address, vdata_val, false, buffer_info);
381395
case AtomicOp::Smax:
382-
return ir.BufferAtomicIMax(handle, address, vdata_val, true, info);
396+
return ir.BufferAtomicIMax(handle, address, vdata_val, true, buffer_info);
383397
case AtomicOp::Umax:
384-
return ir.BufferAtomicIMax(handle, address, vdata_val, false, info);
398+
return ir.BufferAtomicIMax(handle, address, vdata_val, false, buffer_info);
385399
case AtomicOp::And:
386-
return ir.BufferAtomicAnd(handle, address, vdata_val, info);
400+
return ir.BufferAtomicAnd(handle, address, vdata_val, buffer_info);
387401
case AtomicOp::Or:
388-
return ir.BufferAtomicOr(handle, address, vdata_val, info);
402+
return ir.BufferAtomicOr(handle, address, vdata_val, buffer_info);
389403
case AtomicOp::Xor:
390-
return ir.BufferAtomicXor(handle, address, vdata_val, info);
404+
return ir.BufferAtomicXor(handle, address, vdata_val, buffer_info);
391405
case AtomicOp::Inc:
392-
return ir.BufferAtomicInc(handle, address, vdata_val, info);
406+
return ir.BufferAtomicInc(handle, address, vdata_val, buffer_info);
393407
case AtomicOp::Dec:
394-
return ir.BufferAtomicDec(handle, address, vdata_val, info);
408+
return ir.BufferAtomicDec(handle, address, vdata_val, buffer_info);
395409
default:
396410
UNREACHABLE();
397411
}

src/shader_recompiler/ir/passes/constant_propagation_pass.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,9 +222,15 @@ void FoldMul(IR::Block& block, IR::Inst& inst) {
222222
return;
223223
}
224224
const IR::Value rhs{inst.Arg(1)};
225-
if (rhs.IsImmediate() && Arg<T>(rhs) == 0) {
226-
inst.ReplaceUsesWithAndRemove(IR::Value(0u));
227-
return;
225+
if (rhs.IsImmediate()) {
226+
if (Arg<T>(rhs) == 0) {
227+
inst.ReplaceUsesWithAndRemove(IR::Value(0u));
228+
return;
229+
}
230+
if (Arg<T>(rhs) == 1) {
231+
inst.ReplaceUsesWithAndRemove(inst.Arg(0));
232+
return;
233+
}
228234
}
229235
}
230236

src/shader_recompiler/ir/passes/resource_tracking_pass.cpp

Lines changed: 64 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -483,64 +483,89 @@ void PatchDataRingAccess(IR::Block& block, IR::Inst& inst, Info& info, Descripto
483483
inst.SetArg(1, ir.Imm32(binding));
484484
}
485485

486+
// Emits IR that combines a buffer instruction's index and offset operands into a
// single U32 buffer offset and returns that value.
//
// ir     - emitter used to create the address arithmetic instructions.
// inst   - buffer instruction; its BufferInstInfo flags select the addressing mode and
//          its Arg(1) supplies the VGPR index and/or offset.
// info   - shader info; only its logical stage is read, to validate thread-ID addressing.
// buffer - buffer sharp providing add_tid_enable, swizzle_enable and the swizzle constants.
// stride - value multiplied with the index; supplied by the caller rather than read
//          from `buffer` inside this function.
//
// The result is linear (index * stride + offset) unless buffer.swizzle_enable is set,
// in which case the swizzled formula documented inside the body is used.
IR::U32 CalculateBufferAddress(IR::IREmitter& ir, const IR::Inst& inst, const Info& info,
487+
const AmdGpu::Buffer& buffer, u32 stride) {
488+
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
489+
490+
// index = (inst_idxen ? vgpr_index : 0) + (const_add_tid_enable ? thread_id[5:0] : 0)
491+
IR::U32 index = ir.Imm32(0U);
492+
if (inst_info.index_enable) {
493+
// When offset_enable is also set, Arg(1) is a composite and the index is its
// component 0; otherwise Arg(1) is the index itself.
const IR::U32 vgpr_index{inst_info.offset_enable
494+
? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
495+
: IR::U32{inst.Arg(1)}};
496+
index = ir.IAdd(index, vgpr_index);
497+
}
498+
if (buffer.add_tid_enable) {
499+
ASSERT_MSG(info.l_stage == LogicalStage::Compute,
500+
"Thread ID buffer addressing is not supported outside of compute.");
501+
// The lane ID is used as the thread ID term of the index.
const IR::U32 thread_id{ir.LaneId()};
502+
index = ir.IAdd(index, thread_id);
503+
}
504+
// offset = (inst_offen ? vgpr_offset : 0) + inst_offset
505+
IR::U32 offset = ir.Imm32(inst_info.inst_offset.Value());
506+
if (inst_info.offset_enable) {
507+
// When index_enable is also set, the offset is component 1 of the composite in
// Arg(1); otherwise Arg(1) is the offset itself.
const IR::U32 vgpr_offset = inst_info.index_enable
508+
? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
509+
: IR::U32{inst.Arg(1)};
510+
offset = ir.IAdd(offset, vgpr_offset);
511+
}
512+
const IR::U32 const_stride = ir.Imm32(stride);
513+
IR::U32 buffer_offset;
514+
if (buffer.swizzle_enable) {
515+
// Swizzled addressing: both index and offset are split into a quotient (msb) and
// remainder (lsb) part before being recombined.
const IR::U32 const_index_stride = ir.Imm32(buffer.GetIndexStride());
516+
const IR::U32 const_element_size = ir.Imm32(buffer.GetElementSize());
517+
// index_msb = index / const_index_stride
518+
const IR::U32 index_msb{ir.IDiv(index, const_index_stride)};
519+
// index_lsb = index % const_index_stride
520+
const IR::U32 index_lsb{ir.IMod(index, const_index_stride)};
521+
// offset_msb = offset / const_element_size
522+
const IR::U32 offset_msb{ir.IDiv(offset, const_element_size)};
523+
// offset_lsb = offset % const_element_size
524+
const IR::U32 offset_lsb{ir.IMod(offset, const_element_size)};
525+
// buffer_offset =
526+
// (index_msb * const_stride + offset_msb * const_element_size) * const_index_stride
527+
// + index_lsb * const_element_size + offset_lsb
528+
const IR::U32 buffer_offset_msb = ir.IMul(
529+
ir.IAdd(ir.IMul(index_msb, const_stride), ir.IMul(offset_msb, const_element_size)),
530+
const_index_stride);
531+
const IR::U32 buffer_offset_lsb =
532+
ir.IAdd(ir.IMul(index_lsb, const_element_size), offset_lsb);
533+
buffer_offset = ir.IAdd(buffer_offset_msb, buffer_offset_lsb);
534+
} else {
535+
// Linear addressing.
// buffer_offset = index * const_stride + offset
536+
buffer_offset = ir.IAdd(ir.IMul(index, const_stride), offset);
537+
}
538+
return buffer_offset;
539+
}
540+
486541
void PatchBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
487542
const auto handle = inst.Arg(0);
488543
const auto buffer_res = info.buffers[handle.U32()];
489544
const auto buffer = buffer_res.GetSharp(info);
490545

491-
ASSERT(!buffer.add_tid_enable);
492-
493546
// Address of constant buffer reads can be calculated at IR emission time.
494547
if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) {
495548
return;
496549
}
497550

498551
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
499-
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
500-
501-
const IR::U32 index_stride = ir.Imm32(buffer.index_stride);
502-
const IR::U32 element_size = ir.Imm32(buffer.element_size);
503-
504-
// Compute address of the buffer using the stride.
505-
IR::U32 address = ir.Imm32(inst_info.inst_offset.Value());
506-
if (inst_info.index_enable) {
507-
const IR::U32 index = inst_info.offset_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 0)}
508-
: IR::U32{inst.Arg(1)};
509-
if (buffer.swizzle_enable) {
510-
const IR::U32 stride_index_stride =
511-
ir.Imm32(static_cast<u32>(buffer.stride * buffer.index_stride));
512-
const IR::U32 index_msb = ir.IDiv(index, index_stride);
513-
const IR::U32 index_lsb = ir.IMod(index, index_stride);
514-
address = ir.IAdd(address, ir.IAdd(ir.IMul(index_msb, stride_index_stride),
515-
ir.IMul(index_lsb, element_size)));
516-
} else {
517-
address = ir.IAdd(address, ir.IMul(index, ir.Imm32(buffer.GetStride())));
518-
}
519-
}
520-
if (inst_info.offset_enable) {
521-
const IR::U32 offset = inst_info.index_enable ? IR::U32{ir.CompositeExtract(inst.Arg(1), 1)}
522-
: IR::U32{inst.Arg(1)};
523-
if (buffer.swizzle_enable) {
524-
const IR::U32 element_size_index_stride =
525-
ir.Imm32(buffer.element_size * buffer.index_stride);
526-
const IR::U32 offset_msb = ir.IDiv(offset, element_size);
527-
const IR::U32 offset_lsb = ir.IMod(offset, element_size);
528-
address = ir.IAdd(address,
529-
ir.IAdd(ir.IMul(offset_msb, element_size_index_stride), offset_lsb));
530-
} else {
531-
address = ir.IAdd(address, offset);
532-
}
533-
}
534-
inst.SetArg(1, address);
552+
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, buffer.stride));
535553
}
536554

537555
void PatchTextureBufferArgs(IR::Block& block, IR::Inst& inst, Info& info) {
538556
const auto handle = inst.Arg(0);
539557
const auto buffer_res = info.texture_buffers[handle.U32()];
540558
const auto buffer = buffer_res.GetSharp(info);
541559

542-
ASSERT(!buffer.swizzle_enable && !buffer.add_tid_enable);
560+
// Only linear addressing with index is supported currently, since we cannot yet
561+
// address with sub-texel granularity.
562+
const auto inst_info = inst.Flags<IR::BufferInstInfo>();
563+
ASSERT_MSG(!buffer.swizzle_enable && !inst_info.offset_enable && inst_info.inst_offset == 0,
564+
"Unsupported texture buffer address mode.");
565+
543566
IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
567+
// Stride of 1 to get an index into formatted data. See above addressing limitations.
568+
inst.SetArg(1, CalculateBufferAddress(ir, inst, info, buffer, 1U));
544569

545570
if (inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32) {
546571
const auto swizzled = ApplySwizzle(ir, inst.Arg(2), buffer.DstSelect());

src/shader_recompiler/specialization.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,17 @@ struct VsAttribSpecialization {
2121
struct BufferSpecialization {
2222
u16 stride : 14;
2323
u16 is_storage : 1;
24+
u16 swizzle_enable : 1;
25+
u8 index_stride : 2 = 0;
26+
u8 element_size : 2 = 0;
2427
u32 size = 0;
2528

2629
bool operator==(const BufferSpecialization& other) const {
2730
return stride == other.stride && is_storage == other.is_storage &&
28-
(size >= other.is_storage || is_storage);
31+
swizzle_enable == other.swizzle_enable &&
32+
(!swizzle_enable ||
33+
(index_stride == other.index_stride && element_size == other.element_size)) &&
34+
(is_storage || size >= other.size);
2935
}
3036
};
3137

@@ -101,6 +107,11 @@ struct StageSpecialization {
101107
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
102108
spec.stride = sharp.GetStride();
103109
spec.is_storage = desc.IsStorage(sharp);
110+
spec.swizzle_enable = sharp.swizzle_enable;
111+
if (spec.swizzle_enable) {
112+
spec.index_stride = sharp.index_stride;
113+
spec.element_size = sharp.element_size;
114+
}
104115
if (!spec.is_storage) {
105116
spec.size = sharp.GetSize();
106117
}

src/video_core/amdgpu/resource.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ struct Buffer {
7676
u32 GetSize() const noexcept {
7777
return stride == 0 ? num_records : (stride * num_records);
7878
}
79+
80+
// Returns the decoded index stride of this buffer, computed from the encoded
// index_stride field as 8 << index_stride.
u32 GetIndexStride() const noexcept {
81+
// Index stride is 2 bits, so encodings 0, 1, 2, 3 decode to 8, 16, 32, or 64.
82+
return 8 << index_stride;
83+
}
84+
85+
// Returns the decoded element size of this buffer, computed from the encoded
// element_size field as 2 << element_size.
u32 GetElementSize() const noexcept {
86+
// Element size is 2 bits, so encodings 0, 1, 2, 3 decode to 2, 4, 8, or 16.
87+
return 2 << element_size;
88+
}
7989
};
8090
static_assert(sizeof(Buffer) == 16); // 128bits
8191

0 commit comments

Comments
 (0)