@@ -7,8 +7,10 @@
 #include "common/debug.h"
 #include "common/scope_exit.h"
 #include "common/types.h"
+#include "core/memory.h"
 #include "video_core/amdgpu/liverpool.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/buffer_cache/memory_tracker.h"
 #include "video_core/host_shaders/fault_buffer_process_comp.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
@@ -167,7 +169,9 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
             .dstOffset = total_size_bytes,
             .size = new_size,
         });
-        total_size_bytes += new_size;
+        // Align up to avoid cache conflicts
+        constexpr u64 align = 64ULL;
+        total_size_bytes += Common::AlignUp(new_size, align);
     };
     gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
     gpu_modified_ranges.Subtract(device_addr_out, range_size);
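
The 64-byte alignment matches a common CPU cache-line size, so each downloaded range starts on its own line in the staging buffer. A minimal sketch of a power-of-two align-up helper in the spirit of Common::AlignUp (the repository's own helper may be templated differently):

    #include <cstdint>

    // Round value up to the next multiple of align; align must be a power of two.
    constexpr std::uint64_t AlignUp(std::uint64_t value, std::uint64_t align) {
        return (value + align - 1) & ~(align - 1);
    }

    static_assert(AlignUp(65, 64) == 128);
    static_assert(AlignUp(64, 64) == 64);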
@@ -185,10 +189,12 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
     const auto cmdbuf = scheduler.CommandBuffer();
     cmdbuf.copyBuffer(buffer.buffer, download_buffer.Handle(), copies);
     scheduler.Finish();
+    auto* memory = Core::Memory::Instance();
     for (const auto& copy : copies) {
         const VAddr copy_device_addr = buffer.CpuAddr() + copy.srcOffset;
         const u64 dst_offset = copy.dstOffset - offset;
-        std::memcpy(std::bit_cast<u8*>(copy_device_addr), download + dst_offset, copy.size);
+        memory->TryWriteBacking(std::bit_cast<u8*>(copy_device_addr), download + dst_offset,
+                                copy.size);
     }
 }
 
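TryWriteBacking replaces the raw memcpy so the readback is written through the guest's backing memory rather than the tracked virtual view; presumably this keeps the write from faulting on pages the memory tracker has write-protected. One common way such a dual mapping works, as a Linux-only illustration under those assumptions (not the emulator's actual code, and error handling omitted):

    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstring>

    int main() {
        const char data[] = "readback";
        // The same shared pages mapped twice: a read-only "view" (as a tracker
        // might protect it) and a writable "backing" alias.
        int fd = memfd_create("backing", 0);
        ftruncate(fd, 4096);
        void* view = mmap(nullptr, 4096, PROT_READ, MAP_SHARED, fd, 0);
        void* backing = mmap(nullptr, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        // Writing through 'backing' does not fault, yet the data is visible
        // through the protected 'view'.
        std::memcpy(backing, data, sizeof(data));
        return std::strcmp(static_cast<const char*>(view), "readback") == 0 ? 0 : 1;
    }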
@@ -308,18 +314,50 @@ void BufferCache::BindIndexBuffer(u32 index_offset) {
 
 void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
     ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
-    if (!is_gds && !IsRegionRegistered(address, num_bytes)) {
+    if (!is_gds) {
         memcpy(std::bit_cast<void*>(address), value, num_bytes);
-        return;
+        if (!IsRegionRegistered(address, num_bytes)) {
+            return;
+        }
     }
-    Buffer* buffer = [&] {
+    scheduler.EndRendering();
+    const Buffer* buffer = [&] {
         if (is_gds) {
             return &gds_buffer;
         }
         const BufferId buffer_id = FindBuffer(address, num_bytes);
         return &slot_buffers[buffer_id];
     }();
-    InlineDataBuffer(*buffer, address, value, num_bytes);
+    const auto cmdbuf = scheduler.CommandBuffer();
+    const vk::BufferMemoryBarrier2 pre_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+        .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .buffer = buffer->Handle(),
+        .offset = buffer->Offset(address),
+        .size = num_bytes,
+    };
+    const vk::BufferMemoryBarrier2 post_barrier = {
+        .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+        .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+        .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
+        .buffer = buffer->Handle(),
+        .offset = buffer->Offset(address),
+        .size = num_bytes,
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &pre_barrier,
+    });
+    cmdbuf.updateBuffer(buffer->Handle(), buffer->Offset(address), num_bytes, value);
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 1,
+        .pBufferMemoryBarriers = &post_barrier,
+    });
 }
 
 void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
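
The open-coded path relies on updateBuffer, which Vulkan treats as a transfer operation; that is why both barriers synchronize the write against eTransfer. The spec also caps vkCmdUpdateBuffer at 65536 bytes and requires the offset and size to be dword-aligned, consistent with the ASSERT_MSG above. A hedged guard a caller could apply before taking this path (hypothetical helper, not part of the patch):

    #include <cstdint>

    // vkCmdUpdateBuffer limits per the Vulkan spec: dstOffset and dataSize must
    // be multiples of 4, and dataSize must be in (0, 65536]. Larger or
    // unaligned writes need a staging-buffer copy instead.
    constexpr bool CanUseUpdateBuffer(std::uint64_t offset, std::uint64_t size) {
        return size > 0 && size <= 65536 && offset % 4 == 0 && size % 4 == 0;
    }

    static_assert(CanUseUpdateBuffer(0, 64));
    static_assert(!CanUseUpdateBuffer(0, 65540)); // over the 64 KB cap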
@@ -431,9 +469,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     // For small uniform buffers that have not been modified by gpu
     // use device local stream buffer to reduce renderpass breaks.
     // Maybe we want to modify the threshold now that the page size is 16KB?
-    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    const bool is_gpu_dirty = memory_tracker->IsRegionGpuModified(device_addr, size);
-    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
+    static constexpr u64 StreamThreshold = CACHING_PAGESIZE * 2;
+    if (!is_written && size <= StreamThreshold && !IsRegionGpuModified(device_addr, size)) {
         const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
         return {&stream_buffer, offset};
     }
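
With the 16KB page size mentioned in the comment, the doubled threshold works out to 32 KB: uniform-style bindings up to that size take the stream-buffer path as long as the draw does not write them and the GPU has not dirtied the range. The arithmetic, assuming CACHING_PAGEBITS is 14 (which a 16 KB page implies):

    #include <cstdint>

    // Assumed from the comment above: 16 KB caching pages.
    constexpr std::uint64_t CACHING_PAGEBITS = 14;
    constexpr std::uint64_t CACHING_PAGESIZE = std::uint64_t{1} << CACHING_PAGEBITS;
    constexpr std::uint64_t StreamThreshold = CACHING_PAGESIZE * 2;
    static_assert(StreamThreshold == 32768); // 32 KB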
@@ -443,7 +480,11 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     }
     Buffer& buffer = slot_buffers[buffer_id];
     SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
-    if (is_written) {
+
+    // Mark region as GPU modified to get additional tracking needed for readbacks.
+    // Sometimes huge buffers may be bound, so set a threshold here as well.
+    static constexpr u64 GpuMarkThreshold = 512_MB;
+    if (is_written && size <= GpuMarkThreshold) {
         memory_tracker->MarkRegionAsGpuModified(device_addr, size);
         gpu_modified_ranges.Add(device_addr, size);
     }
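
The 512_MB cap reads via a user-defined literal; a minimal sketch of how such a literal can be defined (the repository presumably provides its own in a common header):

    #include <cstdint>

    // Hypothetical definition of the _MB literal used above.
    constexpr std::uint64_t operator""_MB(unsigned long long v) {
        return v * 1024ULL * 1024ULL;
    }

    static_assert(512_MB == 536'870'912);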
@@ -452,8 +493,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
 
 std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
     // Check if any buffer contains the full requested range.
-    const u64 page = gpu_addr >> CACHING_PAGEBITS;
-    const BufferId buffer_id = page_table[page].buffer_id;
+    const BufferId buffer_id = page_table[gpu_addr >> CACHING_PAGEBITS].buffer_id;
     if (buffer_id) {
         Buffer& buffer = slot_buffers[buffer_id];
         if (buffer.IsInBounds(gpu_addr, size)) {
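
Collapsing the lookup into one expression does not change the addressing: the page table is indexed by the address's upper bits. For example, with the 16 KB pages assumed above:

    #include <cstdint>

    constexpr std::uint64_t CACHING_PAGEBITS = 14; // assumed: 16 KB pages
    // 0x10004000 sits exactly one page past 0x10000000, so its page index is
    // 0x4000 + 1.
    static_assert((std::uint64_t{0x10004000} >> CACHING_PAGEBITS) == 0x4001);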
@@ -1131,4 +1171,4 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
     buffer.is_deleted = true;
 }
 
-} // namespace VideoCore
+} // namespace VideoCore