renderer_vulkan: Introduce shader HLE system with copy shader implementation. #1683

Merged 3 commits on Dec 10, 2024
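
This change adds a shader HLE (high-level emulation) hook to the Vulkan backend: before executing a compute dispatch, the rasterizer checks the program hash of the bound compute shader against a table of known shaders, and a match is carried out with native Vulkan commands instead of running the shader. The first implementation targets a common copy shader (hash 0xfefebf9f), translating its control buffer into vk::BufferCopy regions and batching nearby copies into single copyBuffer calls.

Adding further HLE shaders means extending the hash switch in vk_shader_hle.cpp. A minimal sketch of what a second entry could look like, assuming the same pattern (FILL_SHADER_HASH and ExecuteFillShaderHLE are illustrative names, not part of this change):

bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
                      Rasterizer& rasterizer) {
    switch (info.pgm_hash) {
    case COPY_SHADER_HASH:
        return ExecuteCopyShaderHLE(info, regs, rasterizer);
    case FILL_SHADER_HASH: // Hypothetical: a fill/clear shader lowered to vkCmdFillBuffer.
        return ExecuteFillShaderHLE(info, regs, rasterizer);
    default:
        // Unknown hash: report failure so the caller falls back to a real dispatch.
        return false;
    }
}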
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -738,6 +738,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
src/video_core/renderer_vulkan/vk_resource_pool.h
src/video_core/renderer_vulkan/vk_scheduler.cpp
src/video_core/renderer_vulkan/vk_scheduler.h
src/video_core/renderer_vulkan/vk_shader_hle.cpp
src/video_core/renderer_vulkan/vk_shader_hle.h
src/video_core/renderer_vulkan/vk_shader_util.cpp
src/video_core/renderer_vulkan/vk_shader_util.h
src/video_core/renderer_vulkan/vk_swapchain.cpp
10 changes: 9 additions & 1 deletion src/video_core/buffer_cache/buffer_cache.cpp
@@ -360,7 +360,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
return {&buffer, buffer.Offset(device_addr)};
}

std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size) {
std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
// Check if any buffer contains the full requested range.
const u64 page = gpu_addr >> CACHING_PAGEBITS;
const BufferId buffer_id = page_table[page];
if (buffer_id) {
@@ -370,6 +371,13 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size)
return {&buffer, buffer.Offset(gpu_addr)};
}
}
// If no buffer fully contains the requested range but part of it was modified by the GPU,
// fall back to ObtainBuffer to create a full buffer and avoid losing those modifications.
// This is only done when the caller prefers GPU memory; otherwise we can skip it.
if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
return ObtainBuffer(gpu_addr, size, false, false);
}
// In all other cases, just do a CPU copy to the staging buffer.
const u32 offset = staging_buffer.Copy(gpu_addr, size, 16);
return {&staging_buffer, offset};
}
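
With the new flag, ObtainViewBuffer has three possible outcomes: a view into a cached buffer that already contains the full range, a full ObtainBuffer when the range was GPU-modified and the caller prefers GPU memory, or a CPU copy into the staging buffer. A condensed sketch of the caller side, matching the texture_cache.cpp hunk later in this diff:

// The texture cache passes its GpuDirty flag as prefer_gpu: when guest memory may be
// stale relative to GPU writes, the image refresh reads from a cached GPU buffer
// instead of going through the CPU staging path.
const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
const auto [vk_buffer, buf_offset] =
    buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);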
3 changes: 2 additions & 1 deletion src/video_core/buffer_cache/buffer_cache.h
@@ -96,7 +96,8 @@ class BufferCache {
BufferId buffer_id = {});

/// Attempts to obtain a buffer without modifying the cache contents.
[[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size);
[[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size,
bool prefer_gpu);

/// Return true when a region is registered on the cache
[[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
6 changes: 6 additions & 0 deletions src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -8,6 +8,7 @@
#include "video_core/renderer_vulkan/vk_instance.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_hle.h"
#include "video_core/texture_cache/image_view.h"
#include "video_core/texture_cache/texture_cache.h"
#include "vk_rasterizer.h"
@@ -318,6 +319,11 @@ void Rasterizer::DispatchDirect() {
return;
}

const auto& cs = pipeline->GetStage(Shader::Stage::Compute);
if (ExecuteShaderHLE(cs, liverpool->regs, *this)) {
return;
}

if (!BindResources(pipeline)) {
return;
}
8 changes: 8 additions & 0 deletions src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -28,6 +28,14 @@ class Rasterizer {
AmdGpu::Liverpool* liverpool);
~Rasterizer();

[[nodiscard]] Scheduler& GetScheduler() noexcept {
return scheduler;
}

[[nodiscard]] VideoCore::BufferCache& GetBufferCache() noexcept {
return buffer_cache;
}

[[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept {
return texture_cache;
}
4 changes: 4 additions & 0 deletions src/video_core/renderer_vulkan/vk_scheduler.h
@@ -10,6 +10,10 @@
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"

namespace tracy {
class VkCtxScope;
}

namespace Vulkan {

class Instance;
139 changes: 139 additions & 0 deletions src/video_core/renderer_vulkan/vk_shader_hle.cpp
@@ -0,0 +1,139 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "shader_recompiler/info.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_hle.h"

#include "vk_rasterizer.h"

namespace Vulkan {

static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f;

bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer) {
auto& scheduler = rasterizer.GetScheduler();
auto& buffer_cache = rasterizer.GetBufferCache();

// Copy shader defines three formatted buffers as inputs: control, source, and destination.
const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info);
const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info);
const auto buf_stride = src_buf_sharp.GetStride();
ASSERT(buf_stride == dst_buf_sharp.GetStride());

struct CopyShaderControl {
u32 dst_idx;
u32 src_idx;
u32 end;
};
static_assert(sizeof(CopyShaderControl) == 12);
ASSERT(ctl_buf_sharp.GetStride() == sizeof(CopyShaderControl));
const auto ctl_buf = reinterpret_cast<const CopyShaderControl*>(ctl_buf_sharp.base_address);

// Reuse a function-local scratch vector to avoid reallocating it on every dispatch.
static std::vector<vk::BufferCopy> copies;
copies.clear();
copies.reserve(regs.cs_program.dim_x);

// Translate each control record into a copy region; indices are in elements of the
// shared stride, and `end` is inclusive (end + 1 elements are copied).
for (u32 i = 0; i < regs.cs_program.dim_x; i++) {
const auto& [dst_idx, src_idx, end] = ctl_buf[i];
const u32 local_dst_offset = dst_idx * buf_stride;
const u32 local_src_offset = src_idx * buf_stride;
const u32 local_size = (end + 1) * buf_stride;
copies.emplace_back(local_src_offset, local_dst_offset, local_size);
}

scheduler.EndRendering();

// Make all prior GPU writes visible to the transfer stage before copying.
static constexpr vk::MemoryBarrier READ_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
.dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
};
static constexpr vk::MemoryBarrier WRITE_BARRIER{
.srcAccessMask = vk::AccessFlagBits::eTransferWrite,
.dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
};
scheduler.CommandBuffer().pipelineBarrier(
vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer,
vk::DependencyFlagBits::eByRegion, READ_BARRIER, {}, {});

static constexpr vk::DeviceSize MaxDistanceForMerge = 64_MB;
u32 batch_start = 0;
u32 batch_end = 1;

while (batch_start < copies.size()) {
// Place first copy into the current batch
const auto& copy = copies[batch_start];
auto src_offset_min = copy.srcOffset;
auto src_offset_max = copy.srcOffset + copy.size;
auto dst_offset_min = copy.dstOffset;
auto dst_offset_max = copy.dstOffset + copy.size;

for (u32 i = batch_start + 1; i < copies.size(); i++) {
// Compute new src and dst bounds if we were to batch this copy
const auto [src_offset, dst_offset, size] = copies[i];
auto new_src_offset_min = std::min(src_offset_min, src_offset);
auto new_src_offset_max = std::max(src_offset_max, src_offset + size);
if (new_src_offset_max - new_src_offset_min > MaxDistanceForMerge) {
continue;
}

auto new_dst_offset_min = std::min(dst_offset_min, dst_offset);
auto new_dst_offset_max = std::max(dst_offset_max, dst_offset + size);
if (new_dst_offset_max - new_dst_offset_min > MaxDistanceForMerge) {
continue;
}

// We can batch this copy
src_offset_min = new_src_offset_min;
src_offset_max = new_src_offset_max;
dst_offset_min = new_dst_offset_min;
dst_offset_max = new_dst_offset_max;
if (i != batch_end) {
std::swap(copies[i], copies[batch_end]);
}
++batch_end;
}

// Obtain buffers for the total source and destination ranges.
const auto [src_buf, src_buf_offset] =
buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min,
src_offset_max - src_offset_min, false, false);
const auto [dst_buf, dst_buf_offset] =
buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min,
dst_offset_max - dst_offset_min, true, false);

// Rebase the batched copy offsets onto the obtained buffers.
const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
for (auto& copy : vk_copies) {
copy.srcOffset = copy.srcOffset - src_offset_min + src_buf_offset;
copy.dstOffset = copy.dstOffset - dst_offset_min + dst_buf_offset;
}

// Execute buffer copies.
LOG_TRACE(Render_Vulkan, "HLE buffer copy: src_size = {}, dst_size = {}",
src_offset_max - src_offset_min, dst_offset_max - dst_offset_min);
scheduler.CommandBuffer().copyBuffer(src_buf->Handle(), dst_buf->Handle(), vk_copies);
batch_start = batch_end;
++batch_end;
}

scheduler.CommandBuffer().pipelineBarrier(
vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands,
vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});

return true;
}

bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer) {
switch (info.pgm_hash) {
case COPY_SHADER_HASH:
return ExecuteCopyShaderHLE(info, regs, rasterizer);
default:
return false;
}
}

} // namespace Vulkan
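
To make the control-buffer translation concrete: with a buffer stride of 16 bytes, a control record {dst_idx = 8, src_idx = 2, end = 3} becomes vk::BufferCopy{srcOffset = 32, dstOffset = 128, size = 64}, since end is inclusive and end + 1 elements are copied. A self-contained sketch of that arithmetic (the values are illustrative, not taken from a real title):

#include <cstdint>
#include <cstdio>

// Mirrors the CopyShaderControl layout asserted in vk_shader_hle.cpp.
struct CopyShaderControl {
    std::uint32_t dst_idx;
    std::uint32_t src_idx;
    std::uint32_t end; // Inclusive: end + 1 elements are copied.
};

int main() {
    constexpr std::uint32_t stride = 16; // GetStride() of the src/dst buffer sharps.
    constexpr CopyShaderControl ctl{8, 2, 3};

    // Same arithmetic as the translation loop in ExecuteCopyShaderHLE.
    const std::uint32_t src_offset = ctl.src_idx * stride; // 32
    const std::uint32_t dst_offset = ctl.dst_idx * stride; // 128
    const std::uint32_t size = (ctl.end + 1) * stride;     // 64

    std::printf("copy %u bytes: src+%u -> dst+%u\n", size, src_offset, dst_offset);
    return 0;
}

The batching pass that follows exists so that each batch needs only one ObtainBuffer call per side; copies are merged as long as the combined source span and the combined destination span each stay under 64 MB.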
20 changes: 20 additions & 0 deletions src/video_core/renderer_vulkan/vk_shader_hle.h
@@ -0,0 +1,20 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include "video_core/amdgpu/liverpool.h"

namespace Shader {
struct Info;
}

namespace Vulkan {

class Rasterizer;

/// Attempts to execute a shader using HLE if possible.
bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer);

} // namespace Vulkan
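
For reference, the only call site added by this change is in Rasterizer::DispatchDirect (see the vk_rasterizer.cpp hunk above); annotated, it reduces to:

const auto& cs = pipeline->GetStage(Shader::Stage::Compute);
if (ExecuteShaderHLE(cs, liverpool->regs, *this)) {
    return; // Dispatch fully handled by HLE; skip resource binding and vkCmdDispatch.
}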
8 changes: 5 additions & 3 deletions src/video_core/texture_cache/texture_cache.cpp
@@ -466,6 +466,9 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
const auto& num_mips = image.info.resources.levels;
ASSERT(num_mips == image.info.mips_layout.size());

const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);

boost::container::small_vector<vk::BufferImageCopy, 14> image_copy{};
for (u32 m = 0; m < num_mips; m++) {
const u32 width = std::max(image.info.size.width >> m, 1u);
@@ -475,8 +478,6 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
const auto& mip = image.info.mips_layout[m];

// Protect GPU modified resources from accidental CPU reuploads.
const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
if (is_gpu_modified && !is_gpu_dirty) {
const u8* addr = std::bit_cast<u8*>(image.info.guest_address);
const u64 hash = XXH3_64bits(addr + mip.offset, mip.size);
@@ -515,7 +516,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule

const VAddr image_addr = image.info.guest_address;
const size_t image_size = image.info.guest_size_bytes;
const auto [vk_buffer, buf_offset] = buffer_cache.ObtainViewBuffer(image_addr, image_size);
const auto [vk_buffer, buf_offset] =
buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);
// The obtained buffer may be written by a shader, so emit a barrier to prevent a RAW
// hazard.
if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,