Skip to content

Commit f0d844a

Browse files
committed
Convert Dot33 to SSE2
Simpler, lower requirements, and doesn't seem to hurt speed. See hrydgard#17571.
1 parent 4a4cd3d commit f0d844a

File tree

1 file changed

+10
-20
lines changed

1 file changed

+10
-20
lines changed

GPU/Software/Lighting.cpp

+10 -20
Original file line number | Diff line number | Diff line change
@@ -255,23 +255,13 @@ static inline void LightColorSum(Vec4<int> &sum, const Vec4<int> &src) {
255255
#endif
256256
}
257257

258-
#if defined(_M_SSE)
259-
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
260-
[[gnu::target("sse4.1")]]
261-
#endif
262-
static inline __m128 Dot33SSE4(__m128 a, __m128 b) {
263-
__m128 multiplied = _mm_insert_ps(_mm_mul_ps(a, b), _mm_setzero_ps(), 0x30);
264-
__m128 lanes3311 = _mm_movehdup_ps(multiplied);
265-
__m128 partial = _mm_add_ps(multiplied, lanes3311);
266-
return _mm_add_ss(partial, _mm_movehl_ps(lanes3311, partial));
267-
}
268-
#endif
269-
270-
template <bool useSSE4>
271258
static inline float Dot33(const Vec3f &a, const Vec3f &b) {
272-
#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
273-
if (useSSE4)
274-
return _mm_cvtss_f32(Dot33SSE4(a.vec, b.vec));
259+
#if defined(_M_SSE)
260+
__m128 v = _mm_mul_ps(a.vec, b.vec); // [X, Y, Z, W]
261+
__m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1)); // [Y, X, Z, W]
262+
__m128 sums = _mm_add_ps(v, shuf); // [X + Y, X + Y, Z + Z, W + W]
263+
shuf = _mm_movehl_ps(shuf, shuf); // [Z, W, Z, W]
264+
return _mm_cvtss_f32(_mm_add_ss(sums, shuf)); // X + Y + Z
275265
#elif PPSSPP_ARCH(ARM64_NEON)
276266
float32x4_t multipled = vsetq_lane_f32(0.0f, vmulq_f32(a.vec, b.vec), 3);
277267
float32x2_t add1 = vget_low_f32(vpaddq_f32(multipled, multipled));
@@ -311,7 +301,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
311301
// TODO: Should this normalize (0, 0, 0) to (0, 0, 1)?
312302
float d = L.NormalizeOr001();
313303

314-
att = 1.0f / Dot33<useSSE4>(lstate.att, Vec3f(1.0f, d, d * d));
304+
att = 1.0f / Dot33(lstate.att, Vec3f(1.0f, d, d * d));
315305
if (!(att > 0.0f))
316306
att = 0.0f;
317307
else if (att > 1.0f)
@@ -320,7 +310,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
320310

321311
float spot = 1.0f;
322312
if (lstate.spot) {
323-
float rawSpot = Dot33<useSSE4>(lstate.spotDir, L);
313+
float rawSpot = Dot33(lstate.spotDir, L);
324314
if (std::isnan(rawSpot))
325315
rawSpot = std::signbit(rawSpot) ? 0.0f : 1.0f;
326316

@@ -345,7 +335,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
345335
// diffuse lighting
346336
float diffuse_factor;
347337
if (lstate.diffuse || lstate.specular) {
348-
diffuse_factor = Dot33<useSSE4>(L, worldnormal);
338+
diffuse_factor = Dot33(L, worldnormal);
349339
if (lstate.poweredDiffuse) {
350340
diffuse_factor = pspLightPow(diffuse_factor, state.specularExp);
351341
}
@@ -363,7 +353,7 @@ static void ProcessSIMD(VertexData &vertex, const WorldCoords &worldpos, const W
363353
if (lstate.specular && diffuse_factor >= 0.0f) {
364354
Vec3<float> H = L + Vec3<float>(0.f, 0.f, 1.f);
365355

366-
float specular_factor = Dot33<useSSE4>(H.NormalizedOr001(useSSE4), worldnormal);
356+
float specular_factor = Dot33(H.NormalizedOr001(useSSE4), worldnormal);
367357
specular_factor = pspLightPow(specular_factor, state.specularExp);
368358

369359
if (specular_factor > 0.0f) {

0 commit comments

Comments
 (0)