@@ -255,23 +255,13 @@ static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
255
255
#endif
256
256
}
257
257
258
- #if defined(_M_SSE)
259
- #if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
260
- [[gnu::target(" sse4.1" )]]
261
- #endif
262
- static inline __m128 Dot33SSE4 (__m128 a, __m128 b) {
263
- __m128 multiplied = _mm_insert_ps (_mm_mul_ps (a, b), _mm_setzero_ps (), 0x30 );
264
- __m128 lanes3311 = _mm_movehdup_ps (multiplied);
265
- __m128 partial = _mm_add_ps (multiplied, lanes3311);
266
- return _mm_add_ss (partial, _mm_movehl_ps (lanes3311, partial));
267
- }
268
- #endif
269
-
270
- template <bool useSSE4>
271
258
static inline float Dot33 (const Vec3f &a, const Vec3f &b) {
272
- #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
273
- if (useSSE4)
274
- return _mm_cvtss_f32 (Dot33SSE4 (a.vec , b.vec ));
259
+ #if defined(_M_SSE)
260
+ __m128 v = _mm_mul_ps (a.vec , b.vec ); // [X, Y, Z, W]
261
+ __m128 shuf = _mm_shuffle_ps (v, v, _MM_SHUFFLE (3 , 2 , 0 , 1 )); // [Y, X, Z, W]
262
+ __m128 sums = _mm_add_ps (v, shuf); // [X + Y, X + Y, Z + Z, W + W]
263
+ shuf = _mm_movehl_ps (shuf, shuf); // [Z, W, Z, W]
264
+ return _mm_cvtss_f32 (_mm_add_ss (sums, shuf)); // X + Y + Z
275
265
#elif PPSSPP_ARCH(ARM64_NEON)
276
266
float32x4_t multipled = vsetq_lane_f32 (0 .0f , vmulq_f32 (a.vec , b.vec ), 3 );
277
267
float32x2_t add1 = vget_low_f32 (vpaddq_f32 (multipled, multipled));
@@ -311,7 +301,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
311
301
// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
312
302
float d = L.NormalizeOr001 ();
313
303
314
- att = 1 .0f / Dot33<useSSE4> (lstate.att , Vec3f (1 .0f , d, d * d));
304
+ att = 1 .0f / Dot33 (lstate.att , Vec3f (1 .0f , d, d * d));
315
305
if (!(att > 0 .0f ))
316
306
att = 0 .0f ;
317
307
else if (att > 1 .0f )
@@ -320,7 +310,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
320
310
321
311
float spot = 1 .0f ;
322
312
if (lstate.spot ) {
323
- float rawSpot = Dot33<useSSE4> (lstate.spotDir , L);
313
+ float rawSpot = Dot33 (lstate.spotDir , L);
324
314
if (std::isnan (rawSpot))
325
315
rawSpot = std::signbit (rawSpot) ? 0 .0f : 1 .0f ;
326
316
@@ -345,7 +335,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
345
335
// diffuse lighting
346
336
float diffuse_factor;
347
337
if (lstate.diffuse || lstate.specular ) {
348
- diffuse_factor = Dot33<useSSE4> (L, worldnormal);
338
+ diffuse_factor = Dot33 (L, worldnormal);
349
339
if (lstate.poweredDiffuse ) {
350
340
diffuse_factor = pspLightPow (diffuse_factor, state.specularExp );
351
341
}
@@ -363,7 +353,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
363
353
if (lstate.specular && diffuse_factor >= 0 .0f ) {
364
354
Vec3<float > H = L + Vec3<float >(0 .f , 0 .f , 1 .f );
365
355
366
- float specular_factor = Dot33<useSSE4> (H.NormalizedOr001 (useSSE4), worldnormal);
356
+ float specular_factor = Dot33 (H.NormalizedOr001 (useSSE4), worldnormal);
367
357
specular_factor = pspLightPow (specular_factor, state.specularExp );
368
358
369
359
if (specular_factor > 0 .0f ) {
0 commit comments