Skip to content

Commit 409f9ae

Browse files
authored
work around vulkan padding stall with new nvidia driver (#5953)
1 parent cf6c700 commit 409f9ae

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

src/layer/vulkan/shader/padding_pack1to4.comp

+8
Original file line numberDiff line numberDiff line change
@@ -173,12 +173,20 @@ void main()
173173
ivec4 v_offset = y4 * psc(w) + x;
174174

175175
afpvec4 v;
176+
#if ncnn_vendorID == 4318
177+
// out-of-bounds loads cause a stall on nvidia, so only load when the mask is set
178+
v.r = mask.r ? buffer_ld1(bottom_blob_data, v_offset.r) : value;
179+
v.g = mask.g ? buffer_ld1(bottom_blob_data, v_offset.g) : value;
180+
v.b = mask.b ? buffer_ld1(bottom_blob_data, v_offset.b) : value;
181+
v.a = mask.a ? buffer_ld1(bottom_blob_data, v_offset.a) : value;
182+
#else
176183
v.r = buffer_ld1(bottom_blob_data, v_offset.r);
177184
v.g = buffer_ld1(bottom_blob_data, v_offset.g);
178185
v.b = buffer_ld1(bottom_blob_data, v_offset.b);
179186
v.a = buffer_ld1(bottom_blob_data, v_offset.a);
180187

181188
v = mix(afpvec4(value), v, mask);
189+
#endif
182190

183191
buffer_st4(top_blob_data, gi, v);
184192
#endif

src/layer/vulkan/shader/padding_pack1to8.comp

+12
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,17 @@ void main()
209209
ivec4 v_offset2 = yy4 * psc(w) + x;
210210

211211
afpvec8 v;
212+
#if ncnn_vendorID == 4318
213+
// out-of-bounds loads cause a stall on nvidia, so only load when the mask is set
214+
v[0].r = mask.r ? buffer_ld1(bottom_blob_data, v_offset.r) : value;
215+
v[0].g = mask.g ? buffer_ld1(bottom_blob_data, v_offset.g) : value;
216+
v[0].b = mask.b ? buffer_ld1(bottom_blob_data, v_offset.b) : value;
217+
v[0].a = mask.a ? buffer_ld1(bottom_blob_data, v_offset.a) : value;
218+
v[1].r = mask2.r ? buffer_ld1(bottom_blob_data, v_offset2.r) : value;
219+
v[1].g = mask2.g ? buffer_ld1(bottom_blob_data, v_offset2.g) : value;
220+
v[1].b = mask2.b ? buffer_ld1(bottom_blob_data, v_offset2.b) : value;
221+
v[1].a = mask2.a ? buffer_ld1(bottom_blob_data, v_offset2.a) : value;
222+
#else
212223
v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
213224
v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
214225
v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
@@ -220,6 +231,7 @@ void main()
220231

221232
v[0] = mix(afpvec4(value), v[0], mask);
222233
v[1] = mix(afpvec4(value), v[1], mask2);
234+
#endif
223235

224236
buffer_st8(top_blob_data, gi, v);
225237
#endif

0 commit comments

Comments
 (0)