Sync with upstream repo.

Changes include:
* The CPU check has been broken up into a number of small libraries
* The BoringSSL option has been removed
* Better Abseil integration
2020-11-09 13:03:39 -08:00
parent e71781fd7a
commit 83eed0a886
21 changed files with 2298 additions and 1551 deletions
--- a/avx.cc
+++ b/avx.cc
@@ -0,0 +1,194 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "avx.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <immintrin.h>
+#endif
+
+// X86_TARGET_ATTRIBUTE(s): on x86 builds expands to the function attribute
+// __attribute__((target(s))); on every other architecture it expands to
+// nothing. It is placed in front of the AVX routines below so each one can be
+// annotated with the instruction-set extension it uses.
+#if defined(__i386__) || defined(__x86_64__)
+#define X86_TARGET_ATTRIBUTE(s) __attribute__((target(s)))
+#else
+#define X86_TARGET_ATTRIBUTE(s)
+#endif
+
+// Runtime CPU feature probes.
+//
+// On x86 each probe initialises the compiler's CPU-model data with
+// __builtin_cpu_init() and then queries __builtin_cpu_supports() for the
+// named feature. On any other architecture the probe reports false.
+
+// Returns true when the running CPU reports support for "avx".
+bool Avx::can_do_avx() {
+#if defined(__i386__) || defined(__x86_64__)
+  __builtin_cpu_init();
+  const bool has_avx = __builtin_cpu_supports("avx");
+  return has_avx;
+#else
+  return false;
+#endif
+}
+
+// Returns true when the running CPU reports support for "avx512f".
+bool Avx::can_do_avx512f() {
+#if defined(__i386__) || defined(__x86_64__)
+  __builtin_cpu_init();
+  const bool has_avx512f = __builtin_cpu_supports("avx512f");
+  return has_avx512f;
+#else
+  return false;
+#endif
+}
+
+// Returns true when the running CPU reports support for "fma".
+bool Avx::can_do_fma() {
+#if defined(__i386__) || defined(__x86_64__)
+  __builtin_cpu_init();
+  const bool has_fma = __builtin_cpu_supports("fma");
+  return has_fma;
+#else
+  return false;
+#endif
+}
+
+// Randomly decides whether to run an AVX-heavy burst and, if so, which kind.
+//
+// One draw from rng_ selects, with equal probability, between doing nothing
+// (level_ = 0) and running a burst. When a burst is selected:
+//   * if the CPU has no AVX at all, nothing is run and level_ is set to 0;
+//   * if the CPU has AVX512F, a second draw picks level_ 3 (256-bit) or 5
+//     (512-bit) with equal probability;
+//   * otherwise level_ is 3.
+// The burst itself is executed by BurnIfAvxHeavy().
+//
+// Returns "" when nothing ran or the burst saw no discrepancy, otherwise the
+// tag returned by the burst routine.
+std::string Avx::MaybeGoHot() {
+  if (std::uniform_int_distribution<int>(0, 1)(rng_)) {
+    // Don't provoke.
+    level_ = 0;
+    return "";
+  }
+  if (!can_do_avx()) {
+    // No AVX support: running the 256-bit routines would execute instructions
+    // the CPU does not implement, so treat this like "don't provoke".
+    level_ = 0;
+    return "";
+  }
+  if (can_do_avx512f()) {
+    // Processor supports both AVX and AVX512.
+    level_ = std::uniform_int_distribution<int>(0, 1)(rng_) ? 3 : 5;
+  } else {
+    // Processor supports only AVX.
+    level_ = 3;
+  }
+  return BurnIfAvxHeavy();
+}
+
+// Runs the burst routine selected by level_ for kIterations rounds.
+//
+//   level_ == 3 : 256-bit burst, using the FMA variant when can_do_fma()
+//   level_ == 5 : 512-bit burst
+//   otherwise   : nothing is run
+//
+// Returns whatever the chosen routine returns, or "" when nothing is run.
+std::string Avx::BurnIfAvxHeavy() {
+  switch (level_) {
+    case 3:
+      if (can_do_fma()) {
+        return Avx256FMA(kIterations);
+      }
+      return Avx256(kIterations);
+    case 5:
+      return Avx512(kIterations);
+    default:
+      return "";
+  }
+}
+
+// 256-bit AVX burst without FMA: iterates f(x) = 4x(1-x) `rounds` times on
+// four independent vectors, computed as -4 * (x*x - x) with separate multiply
+// and subtract instructions. The four chains are written out by hand and
+// interleaved, as in Avx512 below.
+//
+// Each vector is seeded by broadcasting one random double in [0, 1) drawn
+// from rng_ to all of its lanes, and every lane then undergoes the same
+// operations, so all lanes of a vector are expected to stay equal.
+//
+// Returns "avx256 pd" if any lane of any vector differs from lane 0 of that
+// vector, "" otherwise. On non-x86 builds the body is compiled out and the
+// result is always "".
+X86_TARGET_ATTRIBUTE("avx")
+std::string Avx::Avx256(int rounds) {
+#if (defined(__i386__) || defined(__x86_64__))
+  // Number of doubles held by one 256-bit vector.
+  constexpr int kNumLanes = sizeof(__m256d) / sizeof(double);
+  const __m256d minus_four = _mm256_set1_pd(-4.0);
+  __m256d x[4];
+  for (int k = 0; k < 4; k++) {
+    x[k] =
+        _mm256_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
+  }
+  // Per-lane view of each vector, used only for the final comparison.
+  double *gross_x[4] = {
+      reinterpret_cast<double *>(&x[0]),
+      reinterpret_cast<double *>(&x[1]),
+      reinterpret_cast<double *>(&x[2]),
+      reinterpret_cast<double *>(&x[3]),
+  };
+  for (int i = 0; i < rounds; i++) {
+    __m256d a[4];
+    a[0] = _mm256_sub_pd(_mm256_mul_pd(x[0], x[0]), x[0]);
+    a[1] = _mm256_sub_pd(_mm256_mul_pd(x[1], x[1]), x[1]);
+    a[2] = _mm256_sub_pd(_mm256_mul_pd(x[2], x[2]), x[2]);
+    a[3] = _mm256_sub_pd(_mm256_mul_pd(x[3], x[3]), x[3]);
+    x[0] = _mm256_mul_pd(minus_four, a[0]);
+    x[1] = _mm256_mul_pd(minus_four, a[1]);
+    x[2] = _mm256_mul_pd(minus_four, a[2]);
+    x[3] = _mm256_mul_pd(minus_four, a[3]);
+  }
+  // Check every vector (including x[0]) and compare lanes 1.. against lane 0.
+  for (int k = 0; k < 4; k++) {
+    for (int i = 1; i < kNumLanes; i++) {
+      if (gross_x[k][i] != gross_x[k][0]) {
+        return "avx256 pd";
+      }
+    }
+  }
+#endif
+  return "";
+}
+
+// 256-bit AVX burst using FMA: iterates f(x) = 4x(1-x) `rounds` times on four
+// independent vectors, computed as -4 * fmsub(x, x, x), i.e. -4 * (x*x - x)
+// with the multiply and subtract fused. The four chains are written out by
+// hand and interleaved, as in Avx512 below.
+//
+// Each vector is seeded by broadcasting one random double in [0, 1) drawn
+// from rng_ to all of its lanes, and every lane then undergoes the same
+// operations, so all lanes of a vector are expected to stay equal.
+//
+// Returns "avx256 pd" if any lane of any vector differs from lane 0 of that
+// vector, "" otherwise. On non-x86 builds the body is compiled out and the
+// result is always "".
+X86_TARGET_ATTRIBUTE("avx,fma")
+std::string Avx::Avx256FMA(int rounds) {
+#if (defined(__i386__) || defined(__x86_64__))
+  // Number of doubles held by one 256-bit vector.
+  constexpr int kNumLanes = sizeof(__m256d) / sizeof(double);
+  const __m256d minus_four = _mm256_set1_pd(-4.0);
+  __m256d x[4];
+  for (int k = 0; k < 4; k++) {
+    x[k] =
+        _mm256_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
+  }
+  // Per-lane view of each vector, used only for the final comparison.
+  double *gross_x[4] = {
+      reinterpret_cast<double *>(&x[0]),
+      reinterpret_cast<double *>(&x[1]),
+      reinterpret_cast<double *>(&x[2]),
+      reinterpret_cast<double *>(&x[3]),
+  };
+  for (int i = 0; i < rounds; i++) {
+    __m256d a[4];
+    a[0] = _mm256_fmsub_pd(x[0], x[0], x[0]);
+    a[1] = _mm256_fmsub_pd(x[1], x[1], x[1]);
+    a[2] = _mm256_fmsub_pd(x[2], x[2], x[2]);
+    a[3] = _mm256_fmsub_pd(x[3], x[3], x[3]);
+    x[0] = _mm256_mul_pd(minus_four, a[0]);
+    x[1] = _mm256_mul_pd(minus_four, a[1]);
+    x[2] = _mm256_mul_pd(minus_four, a[2]);
+    x[3] = _mm256_mul_pd(minus_four, a[3]);
+  }
+  // Check every vector (including x[0]) and compare lanes 1.. against lane 0.
+  for (int k = 0; k < 4; k++) {
+    for (int i = 1; i < kNumLanes; i++) {
+      if (gross_x[k][i] != gross_x[k][0]) {
+        return "avx256 pd";
+      }
+    }
+  }
+#endif
+  return "";
+}
+
+// Interleave AVX512 parallel calculation of iterates of f(x) = 4x(1-x).
+// Hope compiler too dumb to see through this.
+//
+// Four independent 512-bit vectors are iterated `rounds` times, each step
+// computed as -4 * fmsub(x, x, x), i.e. -4 * (x*x - x). Each vector is seeded
+// by broadcasting one random double in [0, 1) drawn from rng_ to all of its
+// lanes, and every lane then undergoes the same operations, so all lanes of a
+// vector are expected to stay equal.
+//
+// Returns "avx512 pd" if any lane of any vector differs from lane 0 of that
+// vector, "" otherwise. On non-x86 builds the body is compiled out and the
+// result is always "".
+X86_TARGET_ATTRIBUTE("avx512f")
+std::string Avx::Avx512(int rounds) {
+#if (defined(__i386__) || defined(__x86_64__))
+  // Number of doubles held by one 512-bit vector (8).
+  constexpr int kNumLanes = sizeof(__m512d) / sizeof(double);
+  const __m512d minus_four = _mm512_set1_pd(-4.0);
+  __m512d x[4];
+  for (int k = 0; k < 4; k++) {
+    x[k] =
+        _mm512_set1_pd(std::uniform_real_distribution<double>(0.0, 1.0)(rng_));
+  }
+
+  // Per-lane view of each vector, used only for the final comparison.
+  double *gross_x[4] = {
+      reinterpret_cast<double *>(&x[0]),
+      reinterpret_cast<double *>(&x[1]),
+      reinterpret_cast<double *>(&x[2]),
+      reinterpret_cast<double *>(&x[3]),
+  };
+
+  for (int i = 0; i < rounds; i++) {
+    __m512d a[4];
+    a[0] = _mm512_fmsub_pd(x[0], x[0], x[0]);
+    a[1] = _mm512_fmsub_pd(x[1], x[1], x[1]);
+    a[2] = _mm512_fmsub_pd(x[2], x[2], x[2]);
+    a[3] = _mm512_fmsub_pd(x[3], x[3], x[3]);
+    x[0] = _mm512_mul_pd(minus_four, a[0]);
+    x[1] = _mm512_mul_pd(minus_four, a[1]);
+    x[2] = _mm512_mul_pd(minus_four, a[2]);
+    x[3] = _mm512_mul_pd(minus_four, a[3]);
+  }
+
+  // Check every vector (including x[0]) and every lane up to and including
+  // the last one; lane 0 is the reference value.
+  for (int k = 0; k < 4; k++) {
+    for (int i = 1; i < kNumLanes; i++) {
+      if (gross_x[k][i] != gross_x[k][0]) {
+        return "avx512 pd";
+      }
+    }
+  }
+#endif
+  return "";
+}