-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsse.cpp
147 lines (147 loc) · 4.41 KB
/
sse.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#ifndef JUST32BIT
#include "lifealgo.h"
#include "util.h"
#include <algorithm>
#include <cstdlib>
#include <x86intrin.h>
using namespace std ;
typedef __m128i lifeword ;
const int lifewordwidth = 128 ;
const int shift = 7 ;
class ssealgo : public lifealgo {
public:
virtual void init(int w, int h) ;
virtual void setcell(int x, int y) ;
virtual int getpopulation() ;
virtual int nextstep(int, int, int) ;
virtual void swap() ;
int w, h, wordwidth ;
long long wh ;
lifeword *u0, *u1 ;
} ;
static class ssealgofactory : public lifealgofactory {
public:
ssealgofactory() ;
virtual lifealgo *createInstance() {
return new ssealgo() ;
}
} factory ;
ssealgofactory::ssealgofactory() {
registerAlgo("sse", &factory) ;
}
void ssealgo::init(int w_, int h_) {
w = w_ ;
h = h_ ;
wordwidth = (w + lifewordwidth-1) >> shift ;
wh = wordwidth * h ;
u0 = (lifeword *)calloc(wordwidth*sizeof(lifeword), h+1) ;
u1 = (lifeword *)calloc(wordwidth*sizeof(lifeword), h+1) ;
}
void ssealgo::setcell(int x, int y) {
static lifeword bigbig = _mm_set_epi64x(1, 0) ;
static lifeword bigone = _mm_cvtsi64_si128(1) ;
int xx = x & (lifewordwidth - 1) ;
if (xx < 64) {
u0[(x>>shift)*h+y] |= bigone << xx ;
} else {
u0[(x>>shift)*h+y] |= bigbig << (xx - 64) ;
}
}
static inline int popcount128(__m128i n) {
const __m128i n_hi = _mm_unpackhi_epi64(n, n);
return __builtin_popcountll(_mm_cvtsi128_si64(n)) +
__builtin_popcountll(_mm_cvtsi128_si64(n_hi));
}
int ssealgo::getpopulation() {
int r = 0 ;
for (int i=0; i<wh; i++)
r += popcount128(u0[i]) ;
return r ;
}
static inline void add2(lifeword a, lifeword b,
lifeword &c0, lifeword &c1) {
c0 = a ^ b ;
c1 = a & b ;
}
static inline void add3(lifeword a, lifeword b, lifeword c,
lifeword &c0, lifeword &c1) {
lifeword t0, t1, t2 ;
add2(a, b, t0, t1) ;
add2(t0, c, c0, t2) ;
c1 = t2 | t1 ;
}
static lifeword leftshift(lifeword v) {
return _mm_slli_epi64(v, 1) | _mm_slli_si128(_mm_srli_epi64(v, 63), 8) ;
}
static lifeword rightshift(lifeword v) {
return _mm_srli_epi64(v, 1) | _mm_srli_si128(_mm_slli_epi64(v, 63), 8) ;
}
static lifeword bigleftshift(lifeword v) {
return _mm_slli_epi64(_mm_slli_si128(v, 8), 63) ;
}
static lifeword bigrightshift(lifeword v) {
return _mm_srli_epi64(_mm_srli_si128(v, 8), 63) ;
}
void ssealgo::swap() { ::swap(u0, u1) ; }
int ssealgo::nextstep(int id, int n, int needpop) {
int r = 0 ;
int loi = id * wordwidth / n ;
int hii = (id + 1) * wordwidth / n ;
for (int i=loi; i<hii; i++) {
lifeword w00 = _mm_cvtsi64_si128(0) ;
lifeword w01 = _mm_cvtsi64_si128(0) ;
lifeword *col = u0 + i * h + 1 ;
lifeword *pcol = u0 + (i-1) * h + 1 ;
lifeword *ncol = u0 + (i+1) * h + 1 ;
lifeword *wcol = u1 + i * h + 1 ;
lifeword w1 = *col ;
lifeword w1l = leftshift(w1) ;
lifeword w1r = rightshift(w1) ;
if (i > 0)
w1l += bigrightshift(*pcol) ;
if (i+1 < wordwidth)
w1r += bigleftshift(*ncol) ;
lifeword w10, w11 ;
add3(w1, w1l, w1r, w10, w11) ;
for (int j=1; j+1<h; j += 2, col += 2, pcol += 2, ncol += 2, wcol += 2) {
lifeword w2 = col[1] ;
lifeword w3 = col[2] ;
lifeword w2l = leftshift(w2) ;
lifeword w3l = leftshift(w3) ;
lifeword w2r = rightshift(w2) ;
lifeword w3r = rightshift(w3) ;
if (i > 0) {
w2l |= bigrightshift(pcol[1]) ;
w3l |= bigrightshift(pcol[2]) ;
}
if (i+1 < wordwidth) {
w2r |= bigleftshift(ncol[1]) ;
w3r |= bigleftshift(ncol[2]) ;
}
lifeword w20, w21, w30, w31, a0, a1, a2, b0, b1, b2, ng1, ng2 ;
add3(w2, w2l, w2r, w20, w21) ;
add2(w10, w20, b0, a1) ;
add3(w11, w21, a1, b1, b2) ;
add2(b0, w00, a0, a1) ;
add3(b1, w01, a1, a1, a2) ;
a2 ^= b2 ;
ng1 = (a0 ^ a2) & (a1 ^ a2) & (w1 | a1) ;
wcol[0] = ng1 ;
add3(w3, w3l, w3r, w30, w31) ;
add2(b0, w30, a0, a1) ;
add3(b1, w31, a1, a1, a2) ;
a2 ^= b2 ;
ng2 = (a0 ^ a2) & (a1 ^ a2) & (w2 | a1) ;
wcol[1] = ng2 ;
if (needpop)
r += popcount128(ng1) + popcount128(ng2) ;
w00 = w20 ;
w01 = w21 ;
w10 = w30 ;
w11 = w31 ;
w1 = w3 ;
}
}
return r ;
}
#endif