@@ -209,6 +209,17 @@ void main()
209
209
ivec4 v_offset2 = yy4 * psc(w) + x;
210
210
211
211
afpvec8 v;
212
+ #if ncnn_vendorID == 4318
213
+ // out-of-range loads cause stalls on NVIDIA, so only load where the mask is set
214
+ v[0].r = mask.r ? buffer_ld1(bottom_blob_data, v_offset.r) : value;
215
+ v[0].g = mask.g ? buffer_ld1(bottom_blob_data, v_offset.g) : value;
216
+ v[0].b = mask.b ? buffer_ld1(bottom_blob_data, v_offset.b) : value;
217
+ v[0].a = mask.a ? buffer_ld1(bottom_blob_data, v_offset.a) : value;
218
+ v[1].r = mask2.r ? buffer_ld1(bottom_blob_data, v_offset2.r) : value;
219
+ v[1].g = mask2.g ? buffer_ld1(bottom_blob_data, v_offset2.g) : value;
220
+ v[1].b = mask2.b ? buffer_ld1(bottom_blob_data, v_offset2.b) : value;
221
+ v[1].a = mask2.a ? buffer_ld1(bottom_blob_data, v_offset2.a) : value;
222
+ #else
212
223
v[0].r = buffer_ld1(bottom_blob_data, v_offset.r);
213
224
v[0].g = buffer_ld1(bottom_blob_data, v_offset.g);
214
225
v[0].b = buffer_ld1(bottom_blob_data, v_offset.b);
@@ -220,6 +231,7 @@ void main()
220
231
221
232
v[0] = mix(afpvec4(value), v[0], mask);
222
233
v[1] = mix(afpvec4(value), v[1], mask2);
234
+ #endif
223
235
224
236
buffer_st8(top_blob_data, gi, v);
225
237
#endif
0 commit comments