Changes include:
* CPU check has been broken up into a number of small libraries
* BoringSSL option has been removed
* Better abseil integration
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "avx.h"
|
|
|
|
#if defined(__i386__) || defined(__x86_64__)
|
|
#include <immintrin.h>
|
|
#endif
|
|
|
|
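// X86_TARGET_ATTRIBUTE lets each burn routine below be compiled with its
// own instruction set (AVX, FMA, AVX512F) enabled for that one function
// only, so the rest of the translation unit can target baseline x86. On
// other architectures the attribute expands to nothing.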
#if defined(__i386__) || defined(__x86_64__)
#define X86_TARGET_ATTRIBUTE(s) __attribute__((target(s)))
#else
#define X86_TARGET_ATTRIBUTE(s)
#endif

#if defined(__i386__) || defined(__x86_64__)

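// Runtime CPU feature detection via the GCC/Clang x86 builtins; the
// explicit __builtin_cpu_init() ensures the feature data is initialized
// before __builtin_cpu_supports() is queried.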
bool Avx::can_do_avx() {
  __builtin_cpu_init();
  return __builtin_cpu_supports("avx");
}

bool Avx::can_do_avx512f() {
  __builtin_cpu_init();
  return __builtin_cpu_supports("avx512f");
}

bool Avx::can_do_fma() {
  __builtin_cpu_init();
  return __builtin_cpu_supports("fma");
}

#else

bool Avx::can_do_avx() { return false; }
bool Avx::can_do_avx512f() { return false; }
bool Avx::can_do_fma() { return false; }

#endif

std::string Avx::MaybeGoHot() {
  if (std::uniform_int_distribution<int>(0, 1)(rng_)) {
    // Don't provoke.
    level_ = 0;
    return "";
  }
  if (can_do_avx512f()) {
    // Processor supports both AVX and AVX512.
    level_ = std::uniform_int_distribution<int>(0, 1)(rng_) ? 3 : 5;
  } else {
    // Processor supports only AVX.
    level_ = 3;
  }
  return BurnIfAvxHeavy();
}

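// level_ selects the burn: 3 runs the 256-bit path (with FMA when the CPU
// has it), 5 runs the 512-bit path, and anything else is a no-op.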
std::string Avx::BurnIfAvxHeavy() {
  if (level_ == 3) {
    return can_do_fma() ? Avx256FMA(kIterations) : Avx256(kIterations);
  }
  if (level_ == 5) {
    return Avx512(kIterations);
  }
  return "";
}

// See notes for Avx512 below.
X86_TARGET_ATTRIBUTE("avx")
std::string Avx::Avx256(int rounds) {
#if (defined(__i386__) || defined(__x86_64__))
  const __m256d minus_four = _mm256_set1_pd(-4.0);
  __m256d x[4];
  for (int k = 0; k < 4; k++) {
    x[k] =
        _mm256_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
  }
  double *gross_x[4] = {
      reinterpret_cast<double *>(&x[0]),
      reinterpret_cast<double *>(&x[1]),
      reinterpret_cast<double *>(&x[2]),
      reinterpret_cast<double *>(&x[3]),
  };
  for (int i = 0; i < rounds; i++) {
    __m256d a[4];
    a[0] = _mm256_sub_pd(_mm256_mul_pd(x[0], x[0]), x[0]);
    a[1] = _mm256_sub_pd(_mm256_mul_pd(x[1], x[1]), x[1]);
    a[2] = _mm256_sub_pd(_mm256_mul_pd(x[2], x[2]), x[2]);
    a[3] = _mm256_sub_pd(_mm256_mul_pd(x[3], x[3]), x[3]);
    x[0] = _mm256_mul_pd(minus_four, a[0]);
    x[1] = _mm256_mul_pd(minus_four, a[1]);
    x[2] = _mm256_mul_pd(minus_four, a[2]);
    x[3] = _mm256_mul_pd(minus_four, a[3]);
  }
  for (int k = 1; k < 4; k++) {
    for (int i = 0; i < 4; i++) {
      if (gross_x[k][i] != gross_x[k][0]) {
        return "avx256 pd";
      }
    }
  }
#endif
  return "";
}

// See notes for Avx512 below.
X86_TARGET_ATTRIBUTE("avx,fma")
std::string Avx::Avx256FMA(int rounds) {
#if (defined(__i386__) || defined(__x86_64__))
  const __m256d minus_four = _mm256_set1_pd(-4.0);
  __m256d x[4];
  for (int k = 0; k < 4; k++) {
    x[k] =
        _mm256_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
  }
  double *gross_x[4] = {
      reinterpret_cast<double *>(&x[0]),
      reinterpret_cast<double *>(&x[1]),
      reinterpret_cast<double *>(&x[2]),
      reinterpret_cast<double *>(&x[3]),
  };
  for (int i = 0; i < rounds; i++) {
    __m256d a[4];
    a[0] = _mm256_fmsub_pd(x[0], x[0], x[0]);
    a[1] = _mm256_fmsub_pd(x[1], x[1], x[1]);
    a[2] = _mm256_fmsub_pd(x[2], x[2], x[2]);
    a[3] = _mm256_fmsub_pd(x[3], x[3], x[3]);
    x[0] = _mm256_mul_pd(minus_four, a[0]);
    x[1] = _mm256_mul_pd(minus_four, a[1]);
    x[2] = _mm256_mul_pd(minus_four, a[2]);
    x[3] = _mm256_mul_pd(minus_four, a[3]);
  }
  for (int k = 1; k < 4; k++) {
    for (int i = 0; i < 4; i++) {
      if (gross_x[k][i] != gross_x[k][0]) {
        return "avx256 pd";
      }
    }
  }
#endif
  return "";
}

// Interleave AVX512 parallel calculation of iterates of f(x) = 4x(1-x).
// Hope compiler too dumb to see through this.
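// The iterate is computed as f(x) = 4x(1-x) = -4 * (x*x - x): a (fused)
// multiply-subtract yielding x*x - x, then a multiply by -4.0. Every lane of
// each vector is seeded with the same value, so after any number of rounds
// all lanes must still equal lane 0; any mismatch is reported as a
// miscomputation. Avx256 and Avx256FMA above use the same scheme with
// 256-bit vectors.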
X86_TARGET_ATTRIBUTE("avx512f")
std::string Avx::Avx512(int rounds) {
#if (defined(__i386__) || defined(__x86_64__))
  const __m512d minus_four = _mm512_set1_pd(-4.0);
  __m512d x[4];
  for (int k = 0; k < 4; k++) {
    x[k] =
        _mm512_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
  }

  double *gross_x[4] = {
      reinterpret_cast<double *>(&x[0]),
      reinterpret_cast<double *>(&x[1]),
      reinterpret_cast<double *>(&x[2]),
      reinterpret_cast<double *>(&x[3]),
  };

  for (int i = 0; i < rounds; i++) {
    __m512d a[4];
    a[0] = _mm512_fmsub_pd(x[0], x[0], x[0]);
    a[1] = _mm512_fmsub_pd(x[1], x[1], x[1]);
    a[2] = _mm512_fmsub_pd(x[2], x[2], x[2]);
    a[3] = _mm512_fmsub_pd(x[3], x[3], x[3]);
    x[0] = _mm512_mul_pd(minus_four, a[0]);
    x[1] = _mm512_mul_pd(minus_four, a[1]);
    x[2] = _mm512_mul_pd(minus_four, a[2]);
    x[3] = _mm512_mul_pd(minus_four, a[3]);
  }

  // A __m512d holds 8 doubles; check all 8 lanes against lane 0.
  for (int k = 1; k < 4; k++) {
    for (int i = 0; i < 8; i++) {
      if (gross_x[k][i] != gross_x[k][0]) {
        return "avx512 pd";
      }
    }
  }
#endif
  return "";
}