Skip to content

Fix some NEON code that had bad compile-time checks #15481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 11 additions & 12 deletions Common/Data/Convert/SmallDataConvert.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#ifdef _M_SSE
#include <emmintrin.h>
#endif
#if PPSSPP_PLATFORM(ARM_NEON)
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
Expand All @@ -31,13 +31,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
_mm_storeu_ps(f, fvalues);
#elif PPSSPP_PLATFORM(ARM_NEON)
const float32x4_t one_over = vdupq_n_f32(1.0f/255.0f);
const uint8x8_t value = vld1_lane_u32(u);
const uint16x8_t value16 = vmovl_s8(value);
const uint32x4_t value32 = vmovl_s16(vget_low_s16(value16));
const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), one_over);
vst1q_u32((uint32_t *)dest, valueFloat);
#elif PPSSPP_ARCH(ARM_NEON)
const uint8x8_t value = (uint8x8_t)vdup_n_u32(u);
const uint16x8_t value16 = vmovl_u8(value);
const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));
const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));
vst1q_f32(f, valueFloat);
#else
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
Expand All @@ -62,7 +61,7 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
}

inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON)
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
Uint8x4ToFloat4(f, (u & 0xFFFFFF) | (alpha << 24));
#else
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
Expand All @@ -73,7 +72,7 @@ inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
}

inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_PLATFORM(ARM_NEON)
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
Uint8x4ToFloat4(f, u & 0xFFFFFF);
#else
f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
Expand Down Expand Up @@ -168,8 +167,8 @@ inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {
#ifdef _M_SSE
__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
_mm_storeu_si128((__m128i *)dest, values);
#elif PPSSPP_PLATFORM(ARM_NEON)
const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8);
#elif PPSSPP_ARCH(ARM_NEON)
const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);
vst1q_u32((uint32_t *)dest, values);
#else
uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
Expand Down
2 changes: 1 addition & 1 deletion Core/HLE/sceNetAdhoc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3318,7 +3318,7 @@ int RecreatePtpSocket(int ptpId) {
WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Wrapped Port Detected: Original(%d) -> Requested(%d), Bound(%d) -> BoundOriginal(%d)", ptpId, sock->data.ptp.lport, requestedport, boundport, boundport - portOffset);
u16 newlport = boundport - portOffset;
if (newlport != sock->data.ptp.lport) {
WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Old and New LPort is different! The port may need to be reforwarded");
WARN_LOG(SCENET, "RecreatePtpSocket(%id) - Old and New LPort is different! The port may need to be reforwarded", ptpId);
if (!sock->isClient)
UPnP_Add(IP_PROTOCOL_TCP, isOriPort ? newlport : newlport + portOffset, newlport + portOffset);
}
Expand Down
2 changes: 1 addition & 1 deletion Core/MIPS/ARM64/Arm64CompVFPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1792,7 +1792,7 @@ namespace MIPSComp {
fpr.MapRegsAndSpillLockV(sregs, sz, 0);
gpr.MapReg(MIPS_REG_VFPUCC);
for (int i = 0; i < n; i++) {
TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1 << i);
TSTI2R(gpr.R(MIPS_REG_VFPUCC), 1ULL << i);
FixupBranch b = B(tf ? CC_NEQ : CC_EQ);
fp.FMOV(fpr.V(dregs[i]), fpr.V(sregs[i]));
SetJumpTarget(b);
Expand Down
27 changes: 27 additions & 0 deletions GPU/GPUCommon.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,16 @@
#include "ppsspp_config.h"

#if defined(_M_SSE)
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#include <algorithm>
#include <type_traits>
#include <mutex>
Expand Down Expand Up @@ -2959,6 +2971,21 @@ bool GPUCommon::FramebufferReallyDirty() {
return true;
}

void GPUCommon::UpdateUVScaleOffset() {
#ifdef _M_SSE
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *) & gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *) & gstate_c.uv, values);
#elif PPSSPP_ARCH(ARM_NEON)
const uint32x4_t values = vshlq_n_u32(vld1q_u32((const u32 *)&gstate.texscaleu), 8);
vst1q_u32((u32 *)&gstate_c.uv, values);
#else
gstate_c.uv.uScale = getFloat24(gstate.texscaleu);
gstate_c.uv.vScale = getFloat24(gstate.texscalev);
gstate_c.uv.uOff = getFloat24(gstate.texoffsetu);
gstate_c.uv.vOff = getFloat24(gstate.texoffsetv);
#endif
}

size_t GPUCommon::FormatGPUStatsCommon(char *buffer, size_t size) {
float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
return snprintf(buffer, size,
Expand Down
19 changes: 1 addition & 18 deletions GPU/GPUCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@
#include <atomic>
#endif

#if defined(_M_SSE)
#include <emmintrin.h>
#endif

class FramebufferManagerCommon;
class TextureCacheCommon;
class DrawEngineCommon;
Expand Down Expand Up @@ -218,20 +214,7 @@ class GPUCommon : public GPUInterface, public GPUDebugInterface {
GPUgstate GetGState() override;
void SetCmdValue(u32 op) override;

void UpdateUVScaleOffset() {
#ifdef _M_SSE
__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
#elif PPSSPP_PLATFORM(ARM_NEON)
const uint32x4_t values = vshlq_n_u32(vld1q_u32(&gstate.texscaleu), 8);
vst1q_u32(&gstate_c.uv, values);
#else
gstate_c.uv.uScale = getFloat24(gstate.texscaleu);
gstate_c.uv.vScale = getFloat24(gstate.texscalev);
gstate_c.uv.uOff = getFloat24(gstate.texoffsetu);
gstate_c.uv.vOff = getFloat24(gstate.texoffsetv);
#endif
}
void UpdateUVScaleOffset();

DisplayList* getList(int listid) override {
return &dls[listid];
Expand Down