Skip to content

Commit becdad9

Browse files
committed
Fast readbacks
1 parent e2a58f8 commit becdad9

File tree

5 files changed

+186
-114
lines changed

5 files changed

+186
-114
lines changed

src/video_core/amdgpu/liverpool.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,9 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
694694
break;
695695
}
696696
case PM4ItOpcode::Rewind: {
697+
if (!rasterizer) {
698+
break;
699+
}
697700
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
698701
while (!rewind->Valid()) {
699702
YIELD_GFX();
@@ -873,6 +876,9 @@ Liverpool::Task Liverpool::ProcessCompute(const u32* acb, u32 acb_dwords, u32 vq
873876
break;
874877
}
875878
case PM4ItOpcode::Rewind: {
879+
if (!rasterizer) {
880+
break;
881+
}
876882
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
877883
while (!rewind->Valid()) {
878884
YIELD_ASC(vqid);

src/video_core/buffer_cache/buffer_cache.cpp

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si
167167
.dstOffset = total_size_bytes,
168168
.size = new_size,
169169
});
170-
total_size_bytes += new_size;
170+
// Align up to avoid cache conflicts
171+
constexpr u64 align = 64ULL;
172+
constexpr u64 mask = ~(align - 1ULL);
173+
total_size_bytes += (new_size + align - 1) & mask;
171174
};
172175
gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download);
173176
gpu_modified_ranges.Subtract(device_addr_out, range_size);
@@ -322,16 +325,25 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
322325
InlineDataBuffer(*buffer, address, value, num_bytes);
323326
}
324327

328+
void BufferCache::EnsureRegionRegistered(VAddr address, u32 size) {
329+
if (!IsRegionRegistered(address, size)) {
330+
const BufferId buffer_id = FindBuffer(address, size);
331+
(void)buffer_id;
332+
}
333+
}
334+
325335
void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds) {
326-
if (!dst_gds && !IsRegionRegistered(dst, num_bytes)) {
327-
if (!src_gds && !IsRegionRegistered(src, num_bytes)) {
328-
// Both buffers were not transferred to GPU yet. Can safely copy in host memory.
336+
if (!src_gds && !IsRegionRegistered(src, num_bytes)) {
337+
if (!dst_gds && !IsRegionRegistered(dst, num_bytes)) {
329338
memcpy(std::bit_cast<void*>(dst), std::bit_cast<void*>(src), num_bytes);
330339
return;
331340
}
332-
// Without a readback there's nothing we can do with this
333-
// Fallback to creating dst buffer on GPU to at least have this data there
341+
// Force GPU write to dst even if dst was not previously registered
342+
EnsureRegionRegistered(dst, num_bytes); // You'd implement this
343+
InlineData(dst, std::bit_cast<void*>(src), num_bytes, dst_gds);
344+
return;
334345
}
346+
335347
if (!src_gds && !IsRegionRegistered(src, num_bytes)) {
336348
InlineData(dst, std::bit_cast<void*>(src), num_bytes, dst_gds);
337349
return;
@@ -375,7 +387,7 @@ void BufferCache::CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds,
375387
.size = num_bytes,
376388
},
377389
};
378-
scheduler.EndRendering();
390+
379391
const auto cmdbuf = scheduler.CommandBuffer();
380392
cmdbuf.pipelineBarrier2(vk::DependencyInfo{
381393
.dependencyFlags = vk::DependencyFlagBits::eByRegion,
@@ -931,15 +943,16 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
931943
const u32 height = std::max(image.info.size.height >> m, 1u);
932944
const u32 depth =
933945
image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
934-
const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
935-
offset += mip_ofs;
936-
if (offset + mip_size > max_offset) {
946+
const auto& mip = image.info.mips_layout[m];
947+
offset += mip.offset;
948+
if (offset + mip.size > max_offset) {
937949
break;
938950
}
951+
939952
copies.push_back({
940953
.bufferOffset = offset,
941-
.bufferRowLength = static_cast<u32>(mip_pitch),
942-
.bufferImageHeight = static_cast<u32>(mip_height),
954+
.bufferRowLength = static_cast<u32>(mip.pitch),
955+
.bufferImageHeight = static_cast<u32>(mip.height),
943956
.imageSubresource{
944957
.aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
945958
.mipLevel = m,
@@ -1131,4 +1144,4 @@ void BufferCache::DeleteBuffer(BufferId buffer_id) {
11311144
buffer.is_deleted = true;
11321145
}
11331146

1134-
} // namespace VideoCore
1147+
} // namespace VideoCore

src/video_core/buffer_cache/buffer_cache.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class BufferCache {
6666
VAddr end;
6767
bool has_stream_leap = false;
6868
};
69+
6970
public:
7071
explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
7172
Vulkan::Rasterizer& rasterizer_, AmdGpu::Liverpool* liverpool,
@@ -100,7 +101,7 @@ class BufferCache {
100101
/// Invalidates any buffer in the logical page range.
101102
void InvalidateMemory(VAddr device_addr, u64 size, bool unmap);
102103

103-
void ReadMemory(VAddr device_addr, u64 size);
104+
void ReadMemory(VAddr device_addr, u64 size);
104105

105106
/// Binds host vertex buffers for the current draw.
106107
void BindVertexBuffers(const Vulkan::GraphicsPipeline& pipeline);
@@ -110,8 +111,8 @@ class BufferCache {
110111

111112
/// Writes a value to GPU buffer. (uses command buffer to temporarily store the data)
112113
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
113-
114-
void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
114+
void EnsureRegionRegistered(VAddr address, u32 size);
115+
void CopyBuffer(VAddr dst, VAddr src, u32 num_bytes, bool dst_gds, bool src_gds);
115116

116117
/// Writes a value to GPU buffer. (uses staging buffer to temporarily store the data)
117118
void WriteData(VAddr address, const void* value, u32 num_bytes, bool is_gds);

src/video_core/page_manager.cpp

Lines changed: 97 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
22
// SPDX-License-Identifier: GPL-2.0-or-later
33

4-
#include <thread>
5-
#include <boost/icl/interval_set.hpp>
4+
#include <boost/container/small_vector.hpp>
65
#include "common/assert.h"
7-
#include "common/error.h"
6+
#include "common/debug.h"
87
#include "common/signal_context.h"
98
#include "common/spin_lock.h"
109
#include "core/memory.h"
@@ -16,23 +15,57 @@
1615
#include <sys/mman.h>
1716
#include "common/adaptive_mutex.h"
1817
#ifdef ENABLE_USERFAULTFD
18+
#include <thread>
1919
#include <fcntl.h>
2020
#include <linux/userfaultfd.h>
2121
#include <poll.h>
2222
#include <sys/ioctl.h>
23+
#include "common/error.h"
2324
#endif
2425
#else
2526
#include <windows.h>
27+
#endif
28+
29+
#ifdef __linux__
30+
#include "common/adaptive_mutex.h"
31+
#else
2632
#include "common/spin_lock.h"
2733
#endif
2834

2935
namespace VideoCore {
3036

37+
constexpr size_t PAGE_SIZE = 4_KB;
38+
constexpr size_t PAGE_BITS = 12;
39+
3140
struct PageManager::Impl {
41+
struct PageState {
42+
u8 num_watchers{};
43+
44+
Core::MemoryPermission Perm() const noexcept {
45+
return num_watchers == 0 ? Core::MemoryPermission::ReadWrite
46+
: Core::MemoryPermission::Read;
47+
}
48+
49+
template <s32 delta>
50+
u8 AddDelta() {
51+
if constexpr (delta == 1) {
52+
return ++num_watchers;
53+
} else {
54+
ASSERT_MSG(num_watchers > 0, "Not enough watchers");
55+
return --num_watchers;
56+
}
57+
}
58+
};
59+
60+
struct UpdateProtectRange {
61+
VAddr addr;
62+
u64 size;
63+
Core::MemoryPermission perms;
64+
};
65+
3266
static constexpr size_t ADDRESS_BITS = 40;
3367
static constexpr size_t NUM_ADDRESS_PAGES = 1ULL << (40 - PAGE_BITS);
3468
inline static Vulkan::Rasterizer* rasterizer;
35-
3669
#ifdef ENABLE_USERFAULTFD
3770
Impl(Vulkan::Rasterizer* rasterizer_) {
3871
rasterizer = rasterizer_;
@@ -67,7 +100,8 @@ struct PageManager::Impl {
67100
ASSERT_MSG(ret != -1, "Uffdio unregister failed");
68101
}
69102

70-
void Protect(VAddr address, size_t size, bool allow_write) {
103+
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
104+
bool allow_write = True(perms & Core::MemoryPermission::Write);
71105
uffdio_writeprotect wp;
72106
wp.range.start = address;
73107
wp.range.len = size;
@@ -143,6 +177,7 @@ struct PageManager::Impl {
143177
}
144178

145179
void Protect(VAddr address, size_t size, Core::MemoryPermission perms) {
180+
RENDERER_TRACE;
146181
auto* memory = Core::Memory::Instance();
147182
auto& impl = memory->GetAddressSpace();
148183
impl.Protect(address, size, perms);
@@ -152,96 +187,75 @@ struct PageManager::Impl {
152187
const auto addr = reinterpret_cast<VAddr>(fault_address);
153188
if (Common::IsWriteError(context)) {
154189
return rasterizer->InvalidateMemory(addr, 1);
155-
} else {
156-
return rasterizer->ReadMemory(addr, 1);
157190
}
158191
return false;
159192
}
160-
#endif
161193

162-
template <s32 delta, bool is_read>
194+
#endif
195+
template <s32 delta>
163196
void UpdatePageWatchers(VAddr addr, u64 size) {
164-
std::scoped_lock lk{lock};
165-
std::atomic_thread_fence(std::memory_order_acquire);
166-
167-
size_t page = addr >> PAGE_BITS;
168-
auto perms = cached_pages[page].Perms();
169-
u64 range_begin = 0;
170-
u64 range_bytes = 0;
171-
172-
const auto release_pending = [&] {
173-
if (range_bytes > 0) {
174-
Protect(range_begin << PAGE_BITS, range_bytes, perms);
175-
range_bytes = 0;
176-
}
177-
};
178-
// Iterate requested pages.
179-
const size_t page_end = Common::DivCeil(addr + size, PAGE_SIZE);
180-
for (; page != page_end; ++page) {
181-
PageState& state = cached_pages[page];
182-
183-
// Apply the change to the page state.
184-
const auto new_count = state.AddDelta<is_read, delta>();
185-
186-
// If the protection changed flush pending (un)protect action.
187-
if (auto new_perms = state.Perms(); new_perms != perms) [[unlikely]] {
188-
release_pending();
189-
perms = new_perms;
190-
}
197+
RENDERER_TRACE;
198+
boost::container::small_vector<UpdateProtectRange, 16> update_ranges;
199+
{
200+
std::scoped_lock lk(lock);
201+
202+
size_t page = addr >> PAGE_BITS;
203+
auto perms = cached_pages[page].Perm();
204+
u64 range_begin = 0;
205+
u64 range_bytes = 0;
206+
207+
const auto release_pending = [&] {
208+
if (range_bytes > 0) {
209+
RENDERER_TRACE;
210+
// Add pending (un)protect action
211+
update_ranges.push_back({range_begin << PAGE_BITS, range_bytes, perms});
212+
range_bytes = 0;
213+
}
214+
};
215+
216+
// Iterate requested pages
217+
const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE);
218+
const u64 aligned_addr = page << PAGE_BITS;
219+
const u64 aligned_end = page_end << PAGE_BITS;
220+
// ASSERT_MSG(rasterizer->IsMapped(aligned_addr, aligned_end - aligned_addr),
221+
// "Attempted to track non-GPU memory at address {:#x}, size {:#x}.",
222+
// aligned_addr, aligned_end - aligned_addr);
223+
224+
for (; page != page_end; ++page) {
225+
PageState& state = cached_pages[page];
226+
227+
// Apply the change to the page state
228+
const u8 new_count = state.AddDelta<delta>();
229+
230+
// If the protection changed add pending (un)protect action
231+
if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] {
232+
release_pending();
233+
perms = new_perms;
234+
}
191235

192-
// If the page must be (un)protected add it to pending range.
193-
if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) {
194-
if (range_bytes == 0) {
195-
range_begin = page;
236+
// If the page must be (un)protected, add it to the pending range
237+
if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) {
238+
if (range_bytes == 0) {
239+
range_begin = page;
240+
}
241+
range_bytes += PAGE_SIZE;
242+
} else {
243+
release_pending();
196244
}
197-
range_bytes += PAGE_SIZE;
198-
} else {
199-
release_pending();
200245
}
201-
}
202-
release_pending();
203-
}
204-
205-
struct PageState {
206-
u8 num_write_watchers : 7;
207-
// At the moment only buffer cache can request read watchers.
208-
// And buffers cannot overlap, thus only 1 can exist per page.
209-
u8 num_read_watchers : 1;
210-
211-
Core::MemoryPermission WritePerm() const noexcept {
212-
return num_write_watchers == 0 ? Core::MemoryPermission::Write
213-
: Core::MemoryPermission::None;
214-
}
215246

216-
Core::MemoryPermission ReadPerm() const noexcept {
217-
return num_read_watchers == 0 ? Core::MemoryPermission::Read
218-
: Core::MemoryPermission::None;
247+
// Add pending (un)protect action
248+
release_pending();
219249
}
220250

221-
Core::MemoryPermission Perms() const noexcept {
222-
return ReadPerm() | WritePerm();
251+
// Flush deferred protects
252+
for (const auto& range : update_ranges) {
253+
Protect(range.addr, range.size, range.perms);
223254
}
224-
225-
template <bool is_read, s32 delta>
226-
u8 AddDelta() {
227-
if constexpr (is_read) {
228-
if constexpr (delta == 1) {
229-
return ++num_read_watchers;
230-
} else {
231-
return --num_read_watchers;
232-
}
233-
} else {
234-
if constexpr (delta == 1) {
235-
return ++num_write_watchers;
236-
} else {
237-
return --num_write_watchers;
238-
}
239-
}
240-
}
241-
};
255+
}
242256

243257
std::array<PageState, NUM_ADDRESS_PAGES> cached_pages{};
244-
#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP
258+
#ifdef __linux__
245259
Common::AdaptiveMutex lock;
246260
#else
247261
Common::SpinLock lock;
@@ -263,12 +277,12 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) {
263277

264278
template <s32 delta, bool is_read>
265279
void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const {
266-
impl->UpdatePageWatchers<delta, is_read>(addr, size);
280+
impl->UpdatePageWatchers<delta>(addr, size);
267281
}
268282

269283
template void PageManager::UpdatePageWatchers<1, true>(VAddr addr, u64 size) const;
270284
template void PageManager::UpdatePageWatchers<1, false>(VAddr addr, u64 size) const;
271285
template void PageManager::UpdatePageWatchers<-1, true>(VAddr addr, u64 size) const;
272286
template void PageManager::UpdatePageWatchers<-1, false>(VAddr addr, u64 size) const;
273287

274-
} // namespace VideoCore
288+
} // namespace VideoCore

0 commit comments

Comments
 (0)