// SPDX-License-Identifier: GPL-2.0-or-later

#include <algorithm>
+#include <semaphore>
#include "common/alignment.h"
#include "common/debug.h"
#include "common/scope_exit.h"
@@ -35,8 +36,8 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
      gds_buffer{instance, scheduler, MemoryUsage::Stream, 0, AllFlags, DataShareBufferSize},
      bda_pagetable_buffer{instance, scheduler, MemoryUsage::DeviceLocal,
                           0, AllFlags, BDA_PAGETABLE_SIZE},
-      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE),
-      memory_tracker{tracker} {
+      fault_buffer(instance, scheduler, MemoryUsage::DeviceLocal, 0, AllFlags, FAULT_BUFFER_SIZE) {
+    memory_tracker = std::make_unique<MemoryTracker>(tracker);
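+    // memory_tracker is now held behind a std::unique_ptr and constructed in the body
+    // rather than in the initializer list; presumably this lets the header
+    // forward-declare MemoryTracker (an assumption, the commit does not state why).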
    Vulkan::SetObjectName(instance.GetDevice(), gds_buffer.Handle(), "GDS Buffer");
    Vulkan::SetObjectName(instance.GetDevice(), bda_pagetable_buffer.Handle(),
                          "BDA Page Table Buffer");
@@ -127,21 +128,35 @@ BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& s
BufferCache::~BufferCache() = default;

void BufferCache::InvalidateMemory(VAddr device_addr, u64 size, bool unmap) {
-    const bool is_tracked = IsRegionRegistered(device_addr, size);
-    if (is_tracked) {
-        // Mark the page as CPU modified to stop tracking writes.
-        memory_tracker.MarkRegionAsCpuModified(device_addr, size);
+    if (!IsRegionRegistered(device_addr, size)) {
+        return;
+    }
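+    // If the GPU has written to this range, read its contents back to guest memory
+    // before tracking stops, so the GPU-side writes are not lost.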
+    if (memory_tracker->IsRegionGpuModified(device_addr, size)) {
+        ReadMemory(device_addr, size);
+    }
+    memory_tracker->MarkRegionAsCpuModified(device_addr, size);
+}

-        if (unmap) {
-            return;
-        }
+void BufferCache::ReadMemory(VAddr device_addr, u64 size) {
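+    // The download must be recorded on the GPU command-processing thread. From any
+    // other thread, enqueue it as a command and block on a semaphore until it has run.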
+    if (std::this_thread::get_id() != liverpool->gpu_id) {
+        std::binary_semaphore command_wait{0};
+        liverpool->SendCommand([this, &command_wait, device_addr, size] {
+            Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
+            DownloadBufferMemory(buffer, device_addr, size);
+            command_wait.release();
+        });
+        command_wait.acquire();
+    } else {
+        Buffer& buffer = slot_buffers[FindBuffer(device_addr, size)];
+        DownloadBufferMemory(buffer, device_addr, size);
    }
+    memory_tracker->UnmarkRegionAsGpuModified(device_addr, size);
}

void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) {
    boost::container::small_vector<vk::BufferCopy, 1> copies;
    u64 total_size_bytes = 0;
-    memory_tracker.ForEachDownloadRange<true>(
+    memory_tracker->ForEachDownloadRange<true>(
        device_addr, size, [&](u64 device_addr_out, u64 range_size) {
            const VAddr buffer_addr = buffer.CpuAddr();
            const auto add_download = [&](VAddr start, VAddr end) {
@@ -307,6 +322,94 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
    InlineDataBuffer(*buffer, address, value, num_bytes);
}

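+// Copies num_bytes between two ranges, each of which is either guest memory or GDS.
+// Depending on GPU residency this becomes a host memcpy, an inline upload, or a
+// GPU-side transfer between the backing Vulkan buffers.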
+void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
+    if (!dst_gds && !IsRegionRegistered(dst, num_bytes)) {
+        if (!src_gds && !IsRegionRegistered(src, num_bytes)) {
+            // Neither buffer has been transferred to the GPU yet; copy in host memory.
+            memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes);
+            return;
+        }
+        // Without a readback there is nothing else we can do with this copy, so fall
+        // back to creating the dst buffer on the GPU to at least have the data there.
+    }
+    if (!src_gds && !IsRegionRegistered(src, num_bytes)) {
+        InlineData(dst, std::bit_cast<void*>(src), num_bytes, dst_gds);
+        return;
+    }
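+    // Resolve each side to its backing Vulkan buffer: the GDS buffer, or the cached
+    // buffer covering the range (created by FindBuffer if it does not exist yet).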
+    auto& src_buffer = [&] -> const Buffer& {
+        if (src_gds) {
+            return gds_buffer;
+        }
+        const BufferId buffer_id = FindBuffer(src, num_bytes);
+        return slot_buffers[buffer_id];
+    }();
+    auto& dst_buffer = [&] -> const Buffer& {
+        if (dst_gds) {
+            return gds_buffer;
+        }
+        const BufferId buffer_id = FindBuffer(dst, num_bytes);
+        return slot_buffers[buffer_id];
+    }();
+    vk::BufferCopy region{
+        .srcOffset = src_buffer.Offset(src),
+        .dstOffset = dst_buffer.Offset(dst),
+        .size = num_bytes,
+    };
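+    // Pre-copy barriers: wait for earlier reads of dst before the transfer overwrites
+    // it, and make earlier writes to src visible to the transfer's reads.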
+    const vk::BufferMemoryBarrier2 buf_barriers_before[2] = {
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    scheduler.EndRendering();
+    const auto cmdbuf = scheduler.CommandBuffer();
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_before,
+    });
+    cmdbuf.copyBuffer(src_buffer.Handle(), dst_buffer.Handle(), region);
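+    // Post-copy barriers: make the transfer's writes to dst visible to later reads,
+    // and order the transfer's reads of src before any later writes to it.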
+    const vk::BufferMemoryBarrier2 buf_barriers_after[2] = {
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryRead,
+            .buffer = dst_buffer.Handle(),
+            .offset = dst_buffer.Offset(dst),
+            .size = num_bytes,
+        },
+        {
+            .srcStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .srcAccessMask = vk::AccessFlagBits2::eTransferRead,
+            .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands,
+            .dstAccessMask = vk::AccessFlagBits2::eMemoryWrite,
+            .buffer = src_buffer.Handle(),
+            .offset = src_buffer.Offset(src),
+            .size = num_bytes,
+        },
+    };
+    cmdbuf.pipelineBarrier2(vk::DependencyInfo{
+        .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+        .bufferMemoryBarrierCount = 2,
+        .pBufferMemoryBarriers = buf_barriers_after,
+    });
+}
+
void BufferCache::WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds) {
    ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned");
    if (!is_gds && !IsRegionRegistered(address, num_bytes)) {
@@ -329,7 +432,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
    // use device local stream buffer to reduce renderpass breaks.
    // Maybe we want to modify the threshold now that the page size is 16KB?
    static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
-    const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
+    const bool is_gpu_dirty = memory_tracker->IsRegionGpuModified(device_addr, size);
    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
        const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
        return {&stream_buffer, offset};
@@ -341,7 +444,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
    Buffer& buffer = slot_buffers[buffer_id];
    SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
    if (is_written) {
-        memory_tracker.MarkRegionAsGpuModified(device_addr, size);
+        memory_tracker->MarkRegionAsGpuModified(device_addr, size);
        gpu_modified_ranges.Add(device_addr, size);
    }
    return {&buffer, buffer.Offset(device_addr)};
@@ -361,7 +464,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size,
    // If no buffer contains the full requested range but some buffer within was GPU-modified,
    // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
-    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+    if (prefer_gpu && memory_tracker->IsRegionGpuModified(gpu_addr, size)) {
        return ObtainBuffer(gpu_addr, size, false, false);
    }
    // In all other cases, just do a CPU copy to the staging buffer.
@@ -375,11 +478,11 @@ bool BufferCache::IsRegionRegistered(VAddr addr, size_t size) {
}

bool BufferCache::IsRegionCpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionCpuModified(addr, size);
+    return memory_tracker->IsRegionCpuModified(addr, size);
}

bool BufferCache::IsRegionGpuModified(VAddr addr, size_t size) {
-    return memory_tracker.IsRegionGpuModified(addr, size);
+    return memory_tracker->IsRegionGpuModified(addr, size);
}

BufferId BufferCache::FindBuffer(VAddr device_addr, u32 size) {
@@ -718,7 +821,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
    boost::container::small_vector<vk::BufferCopy, 4> copies;
    u64 total_size_bytes = 0;
    VAddr buffer_start = buffer.CpuAddr();
-    memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
+    memory_tracker->ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
        copies.push_back(vk::BufferCopy{
            .srcOffset = total_size_bytes,
            .dstOffset = device_addr_out - buffer_start,
@@ -1028,4 +1131,4 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
    buffer.is_deleted = true;
}

-} // namespace VideoCore
+} // namespace VideoCore