// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "avx.h" #if defined(__i386__) || defined(__x86_64__) #include #endif #if defined(__i386__) || defined(__x86_64__) #define X86_TARGET_ATTRIBUTE(s) __attribute__((target(s))) #else #define X86_TARGET_ATTRIBUTE(s) #endif #if defined(__i386__) || defined(__x86_64__) bool Avx::can_do_avx() { __builtin_cpu_init(); return __builtin_cpu_supports("avx"); } bool Avx::can_do_avx512f() { __builtin_cpu_init(); return __builtin_cpu_supports("avx512f"); } bool Avx::can_do_fma() { __builtin_cpu_init(); return __builtin_cpu_supports("fma"); } #else bool Avx::can_do_avx() { return false; } bool Avx::can_do_avx512f() { return false; } bool Avx::can_do_fma() { return false; } #endif std::string Avx::MaybeGoHot() { if (std::uniform_int_distribution(0, 1)(rng_)) { // Don't provoke. level_ = 0; return ""; } if (can_do_avx512f()) { // Processor supports both AVX and AVX512. level_ = std::uniform_int_distribution(0, 1)(rng_) ? 3 : 5; } else { // Processor supports only AVX. level_ = 3; } return BurnIfAvxHeavy(); } std::string Avx::BurnIfAvxHeavy() { if (level_ == 3) { return can_do_fma() ? Avx256FMA(kIterations) : Avx256(kIterations); } if (level_ == 5) { return Avx512(kIterations); } return ""; } // See notes for Avx512 below X86_TARGET_ATTRIBUTE("avx") std::string Avx::Avx256(int rounds) { #if (defined(__i386__) || defined(__x86_64__)) const __m256d minus_four = _mm256_set1_pd(-4.0); __m256d x[4]; for (int k = 0; k < 4; k++) { x[k] = _mm256_set1_pd(std::uniform_real_distribution(0.0, 1.0)(rng_)); } double *gross_x[4] = { reinterpret_cast(&x[0]), reinterpret_cast(&x[1]), reinterpret_cast(&x[2]), reinterpret_cast(&x[3]), }; for (int i = 0; i < rounds; i++) { __m256d a[4]; a[0] = _mm256_sub_pd(_mm256_mul_pd(x[0], x[0]), x[0]); a[1] = _mm256_sub_pd(_mm256_mul_pd(x[1], x[1]), x[1]); a[2] = _mm256_sub_pd(_mm256_mul_pd(x[2], x[2]), x[2]); a[3] = _mm256_sub_pd(_mm256_mul_pd(x[3], x[3]), x[3]); x[0] = _mm256_mul_pd(minus_four, a[0]); x[1] = _mm256_mul_pd(minus_four, a[1]); x[2] = _mm256_mul_pd(minus_four, a[2]); x[3] = _mm256_mul_pd(minus_four, a[3]); } for (int k = 1; k < 4; k++) { for (int i = 0; i < 4; i++) { if (gross_x[k][i] != gross_x[k][0]) { return "avx256 pd"; } } } #endif return ""; } // See notes for Avx512 below X86_TARGET_ATTRIBUTE("avx,fma") std::string Avx::Avx256FMA(int rounds) { #if (defined(__i386__) || defined(__x86_64__)) const __m256d minus_four = _mm256_set1_pd(-4.0); __m256d x[4]; for (int k = 0; k < 4; k++) { x[k] = _mm256_set1_pd(std::uniform_real_distribution(0.0, 1.0)(rng_)); } double *gross_x[4] = { reinterpret_cast(&x[0]), reinterpret_cast(&x[1]), reinterpret_cast(&x[2]), reinterpret_cast(&x[3]), }; for (int i = 0; i < rounds; i++) { __m256d a[4]; a[0] = _mm256_fmsub_pd(x[0], x[0], x[0]); a[1] = _mm256_fmsub_pd(x[1], x[1], x[1]); a[2] = _mm256_fmsub_pd(x[2], x[2], x[2]); a[3] = _mm256_fmsub_pd(x[3], x[3], x[3]); x[0] = _mm256_mul_pd(minus_four, a[0]); x[1] = _mm256_mul_pd(minus_four, a[1]); x[2] = _mm256_mul_pd(minus_four, a[2]); x[3] = _mm256_mul_pd(minus_four, a[3]); } for (int k = 1; k < 4; k++) { for (int i = 0; i < 4; i++) { if (gross_x[k][i] != gross_x[k][0]) { return "avx256 pd"; } } } #endif return ""; } // Interleave AVX512 parallel calculation of iterates of f(x) = 4x(1-x). // Hope compiler too dumb to see through this. X86_TARGET_ATTRIBUTE("avx512f") std::string Avx::Avx512(int rounds) { #if (defined(__i386__) || defined(__x86_64__)) const __m512d minus_four = _mm512_set1_pd(-4.0); __m512d x[4]; for (int k = 0; k < 4; k++) { x[k] = _mm512_set1_pd(std::uniform_real_distribution(0.0, 1.0)(rng_)); } double *gross_x[4] = { reinterpret_cast(&x[0]), reinterpret_cast(&x[1]), reinterpret_cast(&x[2]), reinterpret_cast(&x[3]), }; for (int i = 0; i < rounds; i++) { __m512d a[4]; a[0] = _mm512_fmsub_pd(x[0], x[0], x[0]); a[1] = _mm512_fmsub_pd(x[1], x[1], x[1]); a[2] = _mm512_fmsub_pd(x[2], x[2], x[2]); a[3] = _mm512_fmsub_pd(x[3], x[3], x[3]); x[0] = _mm512_mul_pd(minus_four, a[0]); x[1] = _mm512_mul_pd(minus_four, a[1]); x[2] = _mm512_mul_pd(minus_four, a[2]); x[3] = _mm512_mul_pd(minus_four, a[3]); } for (int k = 1; k < 4; k++) { for (int i = 0; i < 7; i++) { if (gross_x[k][i] != gross_x[k][0]) { return "avx512 pd"; } } } #endif return ""; }