/*
 * Query CPUID leaf 1 (basic feature flags) and decode the SIMD-related bits.
 *   ECX bit 26 = XSAVE, bit 27 = OSXSAVE, bit 28 = AVX
 *   EDX bit 25 = SSE
 */
15 cpuid(1, 0, &eax, &ebx, &ecx, &edx);
17 bool has_xsave = (ecx & (1U << 26)) != 0;
18 bool has_avx = (ecx & (1U << 28)) != 0;
/*
 * SSE is reported in EDX bit 25. ECX bit 25 is AES-NI, so testing ECX here
 * would wrongly report "no SSE" on CPUs that have SSE but lack AES-NI.
 * edx still holds the leaf-1 value at this point (leaf 7 is queried below).
 */
19 bool has_sse = (edx & (1U << 25)) != 0;
/*
 * NOTE(review): CPUID.1:ECX.OSXSAVE mirrors CR4.OSXSAVE. If nothing has set
 * CR4 bit 18 before this point, this flag reads 0 even on XSAVE-capable
 * CPUs -- confirm that is the intended ordering.
 */
20 bool has_oxsave = (ecx & (1U << 27)) != 0;
/*
 * Leaf 7, sub-leaf 0: structured extended features; EBX bit 5 = AVX2.
 * NOTE(review): no check is visible here that the maximum supported basic
 * leaf is >= 7 -- confirm the caller or cpuid() guarantees it.
 */
23 cpuid(7, 0, &eax, &ebx, &ecx, &edx);
24 bool has_avx2 = (ebx & (1U << 5)) != 0;
27 LOG2_WARN(
"SIMD",
"SSE not supported, SIMD disabled");
34 if (has_avx && !has_oxsave)
37 if (has_avx2 && (!has_avx || !has_oxsave))
/* Read the current CR0 value so individual bits can be modified. */
44 __asm__
volatile(
"mov %%cr0, %0" :
"=r"(cr0_val));
/* Clear CR0.EM (bit 2): x87/SSE instructions execute instead of trapping. */
46 cr0_val &= ~(1ULL << 2);
/* Set CR0.MP (bit 1): monitor coprocessor. */
47 cr0_val |= (1ULL << 1);
/* Write the updated value back to CR0. */
49 __asm__
volatile(
"mov %0, %%cr0" : :
"r"(cr0_val));
/* Read the current CR4 value. */
51 __asm__
volatile(
"mov %%cr4, %0" :
"=r"(cr4_val));
/* Set CR4.OSFXSR (bit 9): OS supports FXSAVE/FXRSTOR, enabling SSE. */
53 cr4_val |= (1ULL << 9);
/* Set CR4.OSXMMEXCPT (bit 10): OS handles unmasked SIMD FP exceptions. */
54 cr4_val |= (1ULL << 10);
/*
 * Set CR4.OSXSAVE (bit 18): enables XSAVE/XRSTOR and XGETBV/XSETBV.
 * NOTE(review): setting this bit on a CPU without XSAVE faults; the guard,
 * if any, is not visible here -- confirm it is conditional on XSAVE support.
 */
57 cr4_val |= (1ULL << 18);
/* Write the updated value back to CR4. */
59 __asm__
volatile(
"mov %0, %%cr4" : :
"r"(cr4_val));
/* Reset the x87 FPU to its default state (no pending-exception check). */
62 __asm__
volatile(
"fninit");
80 __asm__
volatile(
"xsetbv"
88 "XSAVE/AVX not supported, fallback to SSE only");
106 asm volatile(
"movapd (%1), %%xmm0\n"
107 "movapd (%2), %%xmm1\n"
108 "addpd %%xmm1, %%xmm0\n"
109 "movapd %%xmm0, (%0)\n"
111 :
"r"(
dst),
"r"(
a),
"r"(b)
116 asm volatile(
"movapd (%1), %%xmm0\n"
117 "movapd (%2), %%xmm1\n"
118 "subpd %%xmm1, %%xmm0\n"
119 "movapd %%xmm0, (%0)\n"
121 :
"r"(
dst),
"r"(
a),
"r"(b)
126 asm volatile(
"movapd (%1), %%xmm0\n"
127 "movapd (%2), %%xmm1\n"
128 "mulpd %%xmm1, %%xmm0\n"
129 "movapd %%xmm0, (%0)\n"
131 :
"r"(
dst),
"r"(
a),
"r"(b)
/*
 * Fused multiply-add on four packed doubles: dst[i] = a[i] * b[i] + c[i].
 * In AT&T operand order, vfmadd132pd %ymm1, %ymm2, %ymm0 computes
 * ymm0 = ymm0 * ymm1 + ymm2.
 * vmovapd requires every pointer to be 32-byte aligned.
 * The asm stores through dst, which is passed only as an address input, so
 * a "memory" clobber is required to stop the compiler from caching or
 * reordering accesses to the arrays around this statement.
 * NOTE(review): the vfmadd instruction needs the FMA CPUID feature
 * (leaf 1, ECX bit 12) -- confirm it is checked before this path is used.
 */
138 asm volatile(
"vmovapd (%1), %%ymm0\n"
139 "vmovapd (%2), %%ymm1\n"
140 "vmovapd (%3), %%ymm2\n"
141 "vfmadd132pd %%ymm1, %%ymm2, %%ymm0\n"
143 "vmovapd %%ymm0, (%0)\n"
145 :
"r"(
dst),
"r"(
a),
"r"(b),
"r"(c)
146 :
"ymm0",
"ymm1",
"ymm2",
"memory");
/*
 * Fused multiply-subtract on four packed doubles: dst[i] = a[i] * b[i] - c[i].
 * In AT&T operand order, vfmsub132pd %ymm1, %ymm2, %ymm0 computes
 * ymm0 = ymm0 * ymm1 - ymm2.
 * vmovapd requires every pointer to be 32-byte aligned.
 * The asm stores through dst, which is passed only as an address input, so
 * a "memory" clobber is required to stop the compiler from caching or
 * reordering accesses to the arrays around this statement.
 * NOTE(review): the vfmsub instruction needs the FMA CPUID feature
 * (leaf 1, ECX bit 12) -- confirm it is checked before this path is used.
 */
151 asm volatile(
"vmovapd (%1), %%ymm0\n"
152 "vmovapd (%2), %%ymm1\n"
153 "vmovapd (%3), %%ymm2\n"
154 "vfmsub132pd %%ymm1, %%ymm2, %%ymm0\n"
156 "vmovapd %%ymm0, (%0)\n"
158 :
"r"(
dst),
"r"(
a),
"r"(b),
"r"(c)
159 :
"ymm0",
"ymm1",
"ymm2",
"memory");
void cpuid(uint32_t leaf, uint32_t subleaf, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
#define LOG2_WARN(mod, fmt,...)
#define LOG2_INFO(mod, fmt,...)
void simd_mul_pd(double *dst, const double *a, const double *b)
void fma_mul_add_pd(double *dst, const double *a, const double *b, const double *c)
void simd_sub_pd(double *dst, const double *a, const double *b)
void fma_mul_sub_pd(double *dst, const double *a, const double *b, const double *c)
void sse_add_pd(double *dst, const double *a, const double *b)