Files
cpu-check/corrupt_cores.cc
2020-05-08 13:33:05 -07:00

201 lines
5.2 KiB
C++

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Analyzes cpu_check failure tids to produce a list of condemned
// cores. Usually there's just one defective core.
//
// One way to extract tids from logs is extract_tids.sh.
// Pipe its output to this program.
//
// By default, this code assumes dual-socket machines with 28 cores per
// socket.
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "log.h"
// Collects accusations against hardware threads and greedily selects the
// cores that best explain them.
//
// Every tid is folded onto a "canonical core" with
// tid % (sockets * cores_per_socket), so tids t and
// t + sockets * cores_per_socket name the same canonical core.
class BadCore {
 public:
  BadCore(int sockets, int cores_per_socket)
      : sockets_(sockets), cores_per_socket_(cores_per_socket) {}

  // Condemns thread 'tid'.
  // A tid that is not Plausible() is ignored: it would map to a canonical
  // core the greedy pass never visits and could never be disposed of.
  void Condemn(int tid) {
    if (!Plausible(tid)) return;
    accused_.push_back(std::vector<int>{TidToCanonicalCore(tid)});
  }

  // Condemns one of 'tid_1' and 'tid_2'.
  // The whole accusation is ignored unless both tids are Plausible().
  void Accuse(int tid_1, int tid_2) {
    if (!Plausible(tid_1) || !Plausible(tid_2)) return;
    accused_.push_back(std::vector<int>{TidToCanonicalCore(tid_1),
                                        TidToCanonicalCore(tid_2)});
  }

  // Greedy condemnation: repeatedly condemns the core named by the most
  // outstanding accusations until no accusations remain.
  void Condemn() {
    while (!accused_.empty()) {
      if (!CondemnWorst()) {
        // No in-range core is named by what is left, so another round
        // could not make progress. Drop the remainder instead of spinning.
        accused_.clear();
      }
    }
  }

  // Returns string naming the condemned cores, in the order they were
  // condemned, each followed by its accusation count in parentheses.
  // Returns "None" when nothing was condemned. The string is prefixed with
  // "AMBIGUOUS " if, in any round, more than one core tied for the highest
  // accusation count.
  std::string Condemnations() const {
    if (condemned_.empty()) {
      return "None";
    }
    std::stringstream s;
    if (ambiguous_) {
      s << "AMBIGUOUS ";
    }
    for (const auto &c : condemned_) {
      s << CanonicalCoreToString(c.first) << " (" << c.second << ") ";
    }
    return s.str();
  }

  // Returns true if tid within legitimate range, i.e.
  // [0, 2 * sockets * cores_per_socket). Always false when the configured
  // geometry is not positive.
  bool Plausible(int tid) const {
    if (sockets_ <= 0 || cores_per_socket_ <= 0) return false;
    return (tid >= 0) && (tid < (2 * sockets_ * cores_per_socket_));
  }

 private:
  // Condemns worst offender and discards every accusation that names it.
  // Ties go to the lowest-numbered core and set the ambiguity flag.
  // Returns false, changing nothing, if no canonical core is accused.
  bool CondemnWorst() {
    int worst = -1;
    int worst_k = -1;
    bool ambiguous = false;
    const int core_count = sockets_ * cores_per_socket_;
    for (int c = 0; c < core_count; c++) {
      const int k = AccusationCount(c);
      if (k == 0) continue;
      if (k > worst_k) {
        worst = c;
        worst_k = k;
        ambiguous = false;
      } else if (k == worst_k) {
        ambiguous = true;
      }
    }
    if (worst < 0) return false;
    ambiguous_ |= ambiguous;
    condemned_.push_back({worst, worst_k});
    Dispose(worst);
    return true;
  }

  // Returns number of accusations against 'canonical_core'. An accusation
  // that names the core more than once still counts once.
  int AccusationCount(int canonical_core) const {
    int k = 0;
    for (const auto &v : accused_) {
      if (std::find(v.begin(), v.end(), canonical_core) != v.end()) {
        k++;
      }
    }
    return k;
  }

  // Delete accusations that include 'canonical_core'.
  void Dispose(int canonical_core) {
    accused_.erase(
        std::remove_if(accused_.begin(), accused_.end(),
                       [canonical_core](const std::vector<int> &v) {
                         return std::find(v.begin(), v.end(),
                                          canonical_core) != v.end();
                       }),
        accused_.end());
  }

  // Maps 'tid' onto its canonical core. Callers must have checked
  // Plausible(tid) first, which also guarantees a non-zero divisor.
  int TidToCanonicalCore(int tid) const {
    return tid % (sockets_ * cores_per_socket_);
  }

  // Formats 'canonical_core' as "CPU<socket> HT<a>-<b>", where a is the
  // canonical core itself and b is a + sockets * cores_per_socket.
  std::string CanonicalCoreToString(int canonical_core) const {
    const int socket = canonical_core / cores_per_socket_;
    const int a = canonical_core;
    const int b = canonical_core + sockets_ * cores_per_socket_;
    std::stringstream s;
    s << "CPU" << socket << " HT" << a << "-" << b;
    return s.str();
  }

  const int sockets_;
  const int cores_per_socket_;
  // Outstanding accusations; each entry lists the canonical cores of which
  // at least one is held to be bad.
  std::vector<std::vector<int>> accused_;
  // (canonical core, accusation count at the time it was condemned).
  std::vector<std::pair<int, int>> condemned_;
  bool ambiguous_ = false;
};
// Logs the command-line synopsis and terminates the process with exit
// status 2 when 'v' is true. Returns normally when 'v' is false.
static void UsageIf(bool v) {
  if (v) {
    LOG(ERROR) << "Usage corrupt_cores [-c cores_per_socket] [-s sockets]";
    exit(2);
  }
}
// Reads failure tids from stdin, one record per line, and prints the
// condemned cores.
//
// Options (the value may be attached, "-c28", or given as the following
// argument, "-c 28"):
//   -c cores_per_socket   default 28
//   -s sockets            default 2
//
// Each input line holds either one tid (that thread is condemned) or two
// tids (one of the two is accused). Lines that do not parse, or that name
// a tid outside the plausible range, are logged and skipped.
int main(int argc, char **argv) {
  int sockets = 2;            // Default: dual socket
  int cores_per_socket = 28;  // Default: C28
  for (int i = 1; i < argc; i++) {
    const char *flag = argv[i];
    UsageIf(flag[0] != '-');
    const char option = flag[1];
    if (option == 0) continue;  // A bare "-" carries no option; skip it.
    int *target = nullptr;
    switch (option) {
      case 'c':
        target = &cores_per_socket;
        break;
      case 's':
        target = &sockets;
        break;
      default:
        UsageIf(true);
        return 2;  // Not reached: UsageIf(true) exits.
    }
    // Everything after the option letter is the value; if nothing is
    // attached, the value is the next command-line argument.
    std::string value(flag + 2);
    if (value.empty()) {
      UsageIf(i + 1 >= argc);
      value = argv[++i];
    }
    std::stringstream s(value);
    UsageIf((s >> *target).fail());
  }
  // A non-positive geometry makes every tid implausible, so reject it here
  // rather than reporting each input line as bad.
  UsageIf(sockets <= 0 || cores_per_socket <= 0);

  std::string line;
  BadCore bad(sockets, cores_per_socket);
  while (std::getline(std::cin, line)) {
    std::istringstream ss(line);
    int a = 9999;
    if ((ss >> a).fail() || !bad.Plausible(a)) {
      LOG(ERROR) << "Bad input: '" << line << "'";
      continue;
    }
    // Skip any trailing whitespace (spaces, tabs, a stray '\r') so a lone
    // tid followed by such characters is not mistaken for a two-tid line.
    ss >> std::ws;
    if (ss.eof()) {
      bad.Condemn(a);
      continue;
    }
    int b = 9999;
    if ((ss >> b).fail() || !bad.Plausible(b)) {
      LOG(ERROR) << "Bad input: '" << line << "'";
      continue;
    }
    bad.Accuse(a, b);
  }
  bad.Condemn();
  printf("Condemned %s\n", bad.Condemnations().c_str());
}