/*
 * matmul-bench-sse.c
 *
 * SSE implementation of the blocked matrix-multiply benchmark kernel.
 */
#include <emmintrin.h>
#include "matmul-bench-common.h"
/*
 * Accumulate one 32-float row strip of a 32x32x32 block product with SSE.
 *
 * For the row i = i0 + bi this adds
 *     sum over k in [k0, k0 + 32) of inL[i*n + k] * inR[k*n + j]
 * to out[i*n + j] for every j in [j0, j0 + 32).
 *
 * The strip is held in eight __m128 accumulators (4 floats each) and is
 * added to out only once, after the k loop.
 *
 * out and inR are accessed with aligned vector loads/stores, so
 * &out[i*n + j0] and &inR[k*n + j0] must be 16-byte aligned.
 */
static NOINLINE W32_ALIGN_ARG_POINTER void
sse_(unsigned long i0,
     unsigned long j0,
     unsigned long k0,
     unsigned long bi,
     float * __restrict out,
     const float* __restrict inL,
     const float* __restrict inR,
     unsigned long n)
{
    const unsigned long block_size = 32;
    /* Keep the row index in unsigned long: narrowing it to int would
     * truncate large i0 + bi before the i*n multiplication. */
    const unsigned long i = i0 + bi;

    float *outp = &out[i*n + j0];
    __m128 *outp4 = (__m128 *)outp;

    __m128 vout0 = _mm_setzero_ps();
    __m128 vout1 = _mm_setzero_ps();
    __m128 vout2 = _mm_setzero_ps();
    __m128 vout3 = _mm_setzero_ps();
    __m128 vout4 = _mm_setzero_ps();
    __m128 vout5 = _mm_setzero_ps();
    __m128 vout6 = _mm_setzero_ps();
    __m128 vout7 = _mm_setzero_ps();

    /* Prefetch the same column strip of the next row of out. */
    _mm_prefetch((const char *)(outp + n), _MM_HINT_T0);

    /*
     * Column group J (floats 4*J .. 4*J+3 of the strip):
     *     voutJ += lik4 * inRp[4*J .. 4*J+3]
     * Written as a single expression whose last line has NO trailing
     * backslash, so the first use below cannot be spliced into the
     * definition.
     */
#define OUTER_SSE_J(J) \
    (vout##J = _mm_add_ps(vout##J, \
                          _mm_mul_ps(lik4, _mm_load_ps(&inRp[(J) * 4]))))

    for (unsigned long bk = 0; bk < block_size; bk++) {
        const unsigned long k = k0 + bk;
        const float *inRp = &inR[k*n + j0];

        /* Prefetch the same column strip of the next row of inR. */
        _mm_prefetch((const char *)(inRp + n), _MM_HINT_T0);

        /* Broadcast inL[i][k] to all four lanes. */
        const __m128 lik4 = _mm_set1_ps(inL[i*n + k]);

        OUTER_SSE_J(0);
        OUTER_SSE_J(1);
        OUTER_SSE_J(2);
        OUTER_SSE_J(3);
        OUTER_SSE_J(4);
        OUTER_SSE_J(5);
        OUTER_SSE_J(6);
        OUTER_SSE_J(7);
    }

#undef OUTER_SSE_J

    /* Fold the accumulated strip into out. */
    outp4[0] = _mm_add_ps(outp4[0], vout0);
    outp4[1] = _mm_add_ps(outp4[1], vout1);
    outp4[2] = _mm_add_ps(outp4[2], vout2);
    outp4[3] = _mm_add_ps(outp4[3], vout3);
    outp4[4] = _mm_add_ps(outp4[4], vout4);
    outp4[5] = _mm_add_ps(outp4[5], vout5);
    outp4[6] = _mm_add_ps(outp4[6], vout6);
    outp4[7] = _mm_add_ps(outp4[7], vout7);
}
/*
 * Per-thread worker: fills rows [i_start, i_end) of p->out, one 32x32
 * tile at a time.
 *
 * For every tile the output is first cleared with aligned 4-float stores,
 * then for each 32-wide k block sse_() is called once per tile row to add
 * that block's contribution.
 *
 * The row and column loops step by 32 and always process a full tile, so
 * this assumes the row range and p->n are multiples of 32 -- TODO confirm
 * against the caller.  thread_id is not used here.
 */
static void
sse_thread_func(struct MatmulBenchParam *p,
                unsigned long i_start,
                unsigned long i_end,
                unsigned int thread_id)
{
    const unsigned int block_size = 32;
    const __m128 zero4 = _mm_setzero_ps();

    unsigned long n = p->n;
    float * __restrict out = p->out;
    const float * __restrict inL = p->inL;
    const float * __restrict inR = p->inR;

    for (unsigned long i0 = i_start; i0 < i_end; i0 += block_size) {
        for (int j0 = 0; j0 < n; j0 += block_size) {
            /* Clear the (i0, j0) output tile before accumulating into it. */
            for (int bi = 0; bi < block_size; bi++) {
                int row = i0 + bi;

                for (int bj = 0; bj < block_size; bj += 4) {
                    int col = j0 + bj;

                    _mm_store_ps(&out[row*n + col], zero4);
                }
            }

            /* Add every k block's contribution, one tile row per call. */
            for (int k0 = 0; k0 < n; k0 += block_size) {
                for (int bi = 0; bi < block_size; bi++) {
                    sse_(i0, j0, k0, bi, out, inL, inR, n);
                }
            }
        }
    }
}
/*
 * Entry point of the "sse" test: hands the work to
 * matmul_bench_thread_call() with sse_thread_func as the worker.
 *
 * NOTE(review): the second argument (p->i_block_size * 32) and the third
 * (p->n) look like the per-call row granularity and the total row count --
 * confirm against matmul_bench_thread_call().
 */
static void
sse_run(struct MatmulBenchParam *p)
{
    const unsigned int tile_rows = 32;

    matmul_bench_thread_call(p,
                             p->i_block_size*tile_rows,
                             p->n,
                             sse_thread_func);
}
/* Descriptor for this kernel: named "sse", with sse_run as its run function.
 * NOTE(review): the trailing 32 presumably matches the block size of 32 used
 * by the kernel in this file -- confirm against MATMULBENCH_TEST_INITIALIZER. */
static const struct MatmulBenchTest sse = MATMULBENCH_TEST_INITIALIZER("sse", sse_run, 32);
/*
 * Register the "sse" test: pushes the file-local sse descriptor onto
 * test_set via VA_PUSH.  b is not used by this function.
 */
void
matmulbench_init_sse(struct MatmulBench *b,
                     struct npr_varray *test_set)
{
    (void)b; /* unused */

    VA_PUSH(struct MatmulBenchTest, test_set, sse);
}