Skip to content

Commit d97035f

Browse files
authored
Merge pull request #16434 from unknownbrackets/stencil-opt
Vulkan: Use stencil export when available
2 parents 8f141d1 + 9fcccd7 commit d97035f

12 files changed

+121
-58
lines changed

Common/GPU/D3D11/thin3d_d3d11.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
269269
caps_.anisoSupported = true;
270270
caps_.textureNPOTFullySupported = true;
271271
caps_.fragmentShaderDepthWriteSupported = true;
272+
caps_.fragmentShaderStencilWriteSupported = false;
272273
caps_.blendMinMaxSupported = true;
273274

274275
D3D11_FEATURE_DATA_D3D11_OPTIONS options{};

Common/GPU/D3D9/thin3d_d3d9.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,7 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
759759
caps_.framebufferSeparateDepthCopySupported = false;
760760
caps_.texture3DSupported = true;
761761
caps_.fragmentShaderDepthWriteSupported = true;
762+
caps_.fragmentShaderStencilWriteSupported = false;
762763
caps_.blendMinMaxSupported = true;
763764

764765
if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) {

Common/GPU/DataFormat.h

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ enum class DataFormat : uint8_t {
6565

6666
S8,
6767
D16,
68+
D16_S8,
6869
D24_S8,
6970
D32F,
7071
D32F_S8,

Common/GPU/OpenGL/GLFeatures.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,7 @@ void CheckGLExtensions() {
371371
gl_extensions.ARB_uniform_buffer_object = g_set_gl_extensions.count("GL_ARB_uniform_buffer_object") != 0;
372372
gl_extensions.ARB_explicit_attrib_location = g_set_gl_extensions.count("GL_ARB_explicit_attrib_location") != 0;
373373
gl_extensions.ARB_texture_non_power_of_two = g_set_gl_extensions.count("GL_ARB_texture_non_power_of_two") != 0;
374+
gl_extensions.ARB_shader_stencil_export = g_set_gl_extensions.count("GL_ARB_shader_stencil_export") != 0;
374375
if (gl_extensions.IsGLES) {
375376
gl_extensions.EXT_blend_func_extended = g_set_gl_extensions.count("GL_EXT_blend_func_extended") != 0;
376377
gl_extensions.OES_texture_npot = g_set_gl_extensions.count("GL_OES_texture_npot") != 0;

Common/GPU/OpenGL/GLFeatures.h

+1
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ struct GLExtensions {
7272
bool ARB_uniform_buffer_object;
7373
bool ARB_texture_non_power_of_two;
7474
bool ARB_stencil_texturing;
75+
bool ARB_shader_stencil_export;
7576

7677
// EXT
7778
bool EXT_swap_control_tear;

Common/GPU/OpenGL/thin3d_gl.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,7 @@ OpenGLContext::OpenGLContext() {
575575
} else {
576576
caps_.fragmentShaderDepthWriteSupported = true;
577577
}
578+
caps_.fragmentShaderStencilWriteSupported = gl_extensions.ARB_shader_stencil_export;
578579

579580
// GLES has no support for logic framebuffer operations. There doesn't even seem to exist any such extensions.
580581
caps_.logicOpSupported = !gl_extensions.IsGLES;

Common/GPU/Vulkan/thin3d_vulkan.cpp

+24-1
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,8 @@ static int GetBpp(VkFormat format) {
592592
static VkFormat DataFormatToVulkan(DataFormat format) {
593593
switch (format) {
594594
case DataFormat::D16: return VK_FORMAT_D16_UNORM;
595+
case DataFormat::D16_S8: return VK_FORMAT_D16_UNORM_S8_UINT;
596+
case DataFormat::D24_S8: return VK_FORMAT_D24_UNORM_S8_UINT;
595597
case DataFormat::D32F: return VK_FORMAT_D32_SFLOAT;
596598
case DataFormat::D32F_S8: return VK_FORMAT_D32_SFLOAT_S8_UINT;
597599
case DataFormat::S8: return VK_FORMAT_S8_UINT;
@@ -784,6 +786,25 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushBuffer *push, const Textur
784786
return true;
785787
}
786788

789+
// Translates a Vulkan depth/stencil VkFormat into the matching thin3d DataFormat.
// Handles D16, D16_S8, D24_S8, D32F and D32F_S8; any other VkFormat yields
// DataFormat::UNDEFINED.
static DataFormat DataFormatFromVulkanDepth(VkFormat fmt) {
790+
switch (fmt) {
791+
case VK_FORMAT_D24_UNORM_S8_UINT:
792+
return DataFormat::D24_S8;
793+
case VK_FORMAT_D16_UNORM:
794+
return DataFormat::D16;
795+
case VK_FORMAT_D32_SFLOAT:
796+
return DataFormat::D32F;
797+
case VK_FORMAT_D32_SFLOAT_S8_UINT:
798+
return DataFormat::D32F_S8;
799+
case VK_FORMAT_D16_UNORM_S8_UINT:
800+
return DataFormat::D16_S8;
801+
default:
802+
break;
803+
}
804+
805+
// Reached only via the default case: the format has no mapping here.
return DataFormat::UNDEFINED;
806+
}
807+
787808
VKContext::VKContext(VulkanContext *vulkan)
788809
: vulkan_(vulkan), renderManager_(vulkan) {
789810
shaderLanguageDesc_.Init(GLSL_VULKAN);
@@ -803,12 +824,14 @@ VKContext::VKContext(VulkanContext *vulkan)
803824
caps_.framebufferStencilBlitSupported = caps_.framebufferDepthBlitSupported;
804825
caps_.framebufferDepthCopySupported = true; // Will pretty much always be the case.
805826
caps_.framebufferSeparateDepthCopySupported = true; // Will pretty much always be the case.
806-
caps_.preferredDepthBufferFormat = DataFormat::D24_S8; // TODO: Ask vulkan.
827+
// This doesn't affect what depth/stencil format is actually used, see VulkanQueueRunner.
828+
caps_.preferredDepthBufferFormat = DataFormatFromVulkanDepth(vulkan->GetDeviceInfo().preferredDepthStencilFormat);
807829
caps_.texture3DSupported = true;
808830
caps_.textureDepthSupported = true;
809831
caps_.fragmentShaderInt32Supported = true;
810832
caps_.textureNPOTFullySupported = true;
811833
caps_.fragmentShaderDepthWriteSupported = true;
834+
caps_.fragmentShaderStencilWriteSupported = vulkan->Extensions().EXT_shader_stencil_export;
812835
caps_.blendMinMaxSupported = true;
813836
caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.standard.logicOp != 0;
814837
caps_.multiViewSupported = vulkan->GetDeviceFeatures().enabled.multiview.multiview != 0;

Common/GPU/thin3d.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ size_t DataFormatSizeInBytes(DataFormat fmt) {
4646

4747
case DataFormat::S8: return 1;
4848
case DataFormat::D16: return 2;
49+
case DataFormat::D16_S8: return 3;
4950
case DataFormat::D24_S8: return 4;
5051
case DataFormat::D32F: return 4;
5152
// Or maybe 8...
@@ -68,6 +69,7 @@ const char *DataFormatToString(DataFormat fmt) {
6869

6970
case DataFormat::S8: return "S8";
7071
case DataFormat::D16: return "D16";
72+
case DataFormat::D16_S8: return "D16_S8";
7173
case DataFormat::D24_S8: return "D24_S8";
7274
case DataFormat::D32F: return "D32F";
7375
case DataFormat::D32F_S8: return "D32F_S8";
@@ -80,6 +82,7 @@ const char *DataFormatToString(DataFormat fmt) {
8082
bool DataFormatIsDepthStencil(DataFormat fmt) {
8183
switch (fmt) {
8284
case DataFormat::D16:
85+
case DataFormat::D16_S8:
8386
case DataFormat::D24_S8:
8487
case DataFormat::S8:
8588
case DataFormat::D32F:

Common/GPU/thin3d.h

+1
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,7 @@ struct DeviceCaps {
572572
bool fragmentShaderInt32Supported;
573573
bool textureNPOTFullySupported;
574574
bool fragmentShaderDepthWriteSupported;
575+
bool fragmentShaderStencilWriteSupported;
575576
bool textureDepthSupported;
576577
bool blendMinMaxSupported;
577578
bool multiViewSupported;

GPU/Common/StencilCommon.cpp

+71-45
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,34 @@ static u8 StencilBits8888(const u8 *ptr8, u32 numPixels) {
5858
return bits >> 24;
5959
}
6060

61+
// Scans the source pixels for stencil data, choosing the scanner by the
// destination framebuffer's color format.
//
// src       - source pixel data, handed unchanged to the StencilBits* helper.
// dstBuffer - framebuffer whose fb_format selects the scanner and whose
//             fb_stride * bufferHeight gives the pixel count to scan.
// values    - out: set to 2 (5551), 16 (4444) or 256 (8888); presumably the
//             number of representable stencil values - confirm against caller.
// usedBits  - out: result of the StencilBits* helper; presumably a mask of
//             stencil bits set anywhere in src - confirm against the helpers.
//
// Returns false (outputs untouched) for GE_FORMAT_565 and for the
// INVALID/DEPTH16/CLUT8 formats, true otherwise.
static bool CheckStencilBits(const u8 *src, const VirtualFramebuffer *dstBuffer, int &values, u8 &usedBits) {
62+
switch (dstBuffer->fb_format) {
63+
case GE_FORMAT_565:
64+
// Well, this doesn't make much sense.
65+
return false;
66+
case GE_FORMAT_5551:
67+
usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
68+
values = 2;
69+
break;
70+
case GE_FORMAT_4444:
71+
usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
72+
values = 16;
73+
break;
74+
case GE_FORMAT_8888:
75+
usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
76+
values = 256;
77+
break;
78+
case GE_FORMAT_INVALID:
79+
case GE_FORMAT_DEPTH16:
80+
case GE_FORMAT_CLUT8:
81+
// Inconceivable.
82+
// Asserts, then still returns false so the caller bails out if execution continues.
_assert_(false);
83+
return false;
84+
}
85+
86+
// Only the 5551/4444/8888 cases (or an fb_format not listed above) get here.
return true;
87+
}
88+
6189
struct StencilUB {
6290
float stencilValue;
6391
};
@@ -83,8 +111,12 @@ static const SamplerDef samplers[1] = {
83111
{ 0, "tex" },
84112
};
85113

86-
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs) {
87-
ShaderWriter writer(buffer, lang, ShaderStage::Fragment);
114+
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport) {
115+
std::vector<const char *> extensions;
116+
if (useExport)
117+
extensions.push_back("#extension GL_ARB_shader_stencil_export : require");
118+
119+
ShaderWriter writer(buffer, lang, ShaderStage::Fragment, extensions);
88120
writer.HighPrecisionFloat();
89121
writer.DeclareSamplers(samplers);
90122

@@ -98,9 +130,13 @@ void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw:
98130

99131
writer.C(" vec4 index = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n");
100132
writer.C(" vec4 outColor = index.aaaa;\n"); // Only care about a.
101-
writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n");
102-
// Bitwise operations on floats, ugh.
103-
writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n");
133+
if (useExport) {
134+
writer.C(" gl_FragStencilRefARB = int(roundAndScaleTo255f(index.a));\n");
135+
} else {
136+
writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n");
137+
// Bitwise operations on floats, ugh.
138+
writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n");
139+
}
104140

105141
if (bugs.Has(Draw::Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL)) {
106142
writer.C(" gl_FragDepth = gl_FragCoord.z;\n");
@@ -135,10 +171,11 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
135171
return false;
136172
}
137173

138-
VirtualFramebuffer *dstBuffer = 0;
174+
VirtualFramebuffer *dstBuffer = nullptr;
139175
for (size_t i = 0; i < vfbs_.size(); ++i) {
140176
VirtualFramebuffer *vfb = vfbs_[i];
141-
if (vfb->fb_address == addr) {
177+
// TODO: Maybe we should broadcast to all? Most of the time, there's only one.
178+
if (vfb->fb_address == addr && (!dstBuffer || dstBuffer->colorBindSeq < vfb->colorBindSeq)) {
142179
dstBuffer = vfb;
143180
}
144181
}
@@ -148,34 +185,15 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
148185

149186
int values = 0;
150187
u8 usedBits = 0;
188+
bool useExportShader = draw_->GetDeviceCaps().fragmentShaderStencilWriteSupported;
151189

152190
const u8 *src = Memory::GetPointer(addr);
153191
if (!src)
154192
return false;
155193

156-
switch (dstBuffer->fb_format) {
157-
case GE_FORMAT_565:
158-
// Well, this doesn't make much sense.
194+
// Could skip this when doing useExportShader, but then we couldn't optimize usedBits == 0.
195+
if (!CheckStencilBits(src, dstBuffer, values, usedBits))
159196
return false;
160-
case GE_FORMAT_5551:
161-
usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
162-
values = 2;
163-
break;
164-
case GE_FORMAT_4444:
165-
usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
166-
values = 16;
167-
break;
168-
case GE_FORMAT_8888:
169-
usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
170-
values = 256;
171-
break;
172-
case GE_FORMAT_INVALID:
173-
case GE_FORMAT_DEPTH16:
174-
case GE_FORMAT_CLUT8:
175-
// Inconceivable.
176-
_assert_(false);
177-
break;
178-
}
179197

180198
if (usedBits == 0) {
181199
if (flags & WriteStencil::STENCIL_IS_ZERO) {
@@ -201,7 +219,7 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
201219

202220
char *fsCode = new char[8192];
203221
char *vsCode = new char[8192];
204-
GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs());
222+
GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs(), useExportShader);
205223
GenerateStencilVs(vsCode, shaderLanguageDesc);
206224

207225
_assert_msg_(strlen(fsCode) < 8192, "StenFS length error: %d", (int)strlen(fsCode));
@@ -303,24 +321,32 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
303321
draw_->SetScissorRect(0, 0, w, h);
304322
draw_->BindPipeline(stencilWritePipeline_);
305323

306-
for (int i = 1; i < values; i += i) {
307-
if (!(usedBits & i)) {
308-
// It's already zero, let's skip it.
309-
continue;
310-
}
324+
if (useExportShader) {
325+
// We only need to do one pass if using an export shader.
311326
StencilUB ub{};
312-
if (dstBuffer->fb_format == GE_FORMAT_4444) {
313-
draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF);
314-
ub.stencilValue = i * (16.0f / 255.0f);
315-
} else if (dstBuffer->fb_format == GE_FORMAT_5551) {
316-
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
317-
ub.stencilValue = i * (128.0f / 255.0f);
318-
} else {
319-
draw_->SetStencilParams(0xFF, i, 0xFF);
320-
ub.stencilValue = i * (1.0f / 255.0f);
321-
}
327+
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
322328
draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub));
323329
draw_->DrawUP(positions, 3);
330+
} else {
331+
for (int i = 1; i < values; i += i) {
332+
if (!(usedBits & i)) {
333+
// It's already zero, let's skip it.
334+
continue;
335+
}
336+
StencilUB ub{};
337+
if (dstBuffer->fb_format == GE_FORMAT_4444) {
338+
draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF);
339+
ub.stencilValue = i * (16.0f / 255.0f);
340+
} else if (dstBuffer->fb_format == GE_FORMAT_5551) {
341+
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
342+
ub.stencilValue = i * (128.0f / 255.0f);
343+
} else {
344+
draw_->SetStencilParams(0xFF, i, 0xFF);
345+
ub.stencilValue = i * (1.0f / 255.0f);
346+
}
347+
draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub));
348+
draw_->DrawUP(positions, 3);
349+
}
324350
}
325351

326352
if (useBlit) {

GPU/Common/StencilCommon.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@
55
#include "Common/GPU/thin3d.h"
66

77
// Exposed for automated tests
8-
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs);
8+
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport);
99
void GenerateStencilVs(char *buffer, const ShaderLanguageDesc &lang);

unittest/TestShaderGenerators.cpp

+15-11
Original file line numberDiff line numberDiff line change
@@ -291,17 +291,21 @@ bool TestStencilShaders() {
291291
ShaderLanguageDesc desc(languages[k]);
292292
std::string errorMessage;
293293

294-
// Generate all despite failures - it's only 6.
295-
GenerateStencilFs(buffer, desc, bugs);
296-
if (strlen(buffer) >= 8192) {
297-
printf("Stencil fragment shader exceeded buffer:\n\n%s\n", LineNumberString(buffer).c_str());
298-
failed = true;
299-
}
300-
if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) {
301-
printf("Error compiling stencil shader:\n\n%s\n\n%s\n", LineNumberString(buffer).c_str(), errorMessage.c_str());
302-
failed = true;
303-
} else {
304-
printf("===\n%s\n===\n", buffer);
294+
// Generate all despite failures - it's only a few.
295+
// Only use export on Vulkan, because GLSL_3xx is ES which doesn't support stencil export.
296+
bool allowUseExport = languages[k] == ShaderLanguage::GLSL_VULKAN;
297+
for (int useExport = 0; useExport <= (allowUseExport ? 1 : 0); ++useExport) {
298+
GenerateStencilFs(buffer, desc, bugs, useExport == 1);
299+
if (strlen(buffer) >= 8192) {
300+
printf("Stencil fragment shader (useExport=%d) exceeded buffer:\n\n%s\n", useExport, LineNumberString(buffer).c_str());
301+
failed = true;
302+
}
303+
if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) {
304+
printf("Error compiling stencil shader (useExport=%d):\n\n%s\n\n%s\n", useExport, LineNumberString(buffer).c_str(), errorMessage.c_str());
305+
failed = true;
306+
} else {
307+
printf("===\n%s\n===\n", buffer);
308+
}
305309
}
306310

307311
GenerateStencilVs(buffer, desc);

0 commit comments

Comments
 (0)