// Plain text  |  318 lines  |  10.17 KB

// Copyright 2008 Google Inc. All Rights Reserved.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//      http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// error_diag.cc: Collects device errors for analysis to more accurately
//                pin-point failed component.

#include <set>
#include <list>
#include <map>

// This file must work with autoconf on its public version,
// so these includes are correct.
#include "error_diag.h"
#include "sattypes.h"


// DeviceTree constructor.
// Creates a node labelled |name| with no parent and initializes the node's
// mutex with default attributes.
DeviceTree::DeviceTree(string name)
  : parent_(NULL),
    name_(name) {
  pthread_mutex_init(&device_tree_mutex_, NULL);
}

// DeviceTree destructor.
// Frees every sub device and every error instance stored in this node,
// then destroys the node's mutex.
DeviceTree::~DeviceTree() {
  typedef std::map<string, DeviceTree*>::iterator ChildIter;
  typedef std::list<ErrorInstance*>::iterator ErrorIter;

  // Deleting a child runs this destructor on it, which releases whatever
  // that child holds in turn.
  for (ChildIter child = subdevices_.begin();
       child != subdevices_.end();
       ++child) {
    delete child->second;
  }

  // Error records attached to this node are released here as well.
  for (ErrorIter err = errors_.begin(); err != errors_.end(); ++err) {
    delete *err;
  }

  pthread_mutex_destroy(&device_tree_mutex_);
}

// Atomically find named device in sub device tree.
// Holds this node's mutex while UnlockedFindInSubTree() runs.
// Returns 0 if not found.
DeviceTree *DeviceTree::FindInSubTree(string name) {
  pthread_mutex_lock(&device_tree_mutex_);
  DeviceTree *const match = UnlockedFindInSubTree(name);
  pthread_mutex_unlock(&device_tree_mutex_);
  return match;
}

// Find named device in sub device tree (Non-atomic).
// Takes no lock itself, including while recursing into sub devices.
// Returns 0 if not found.
DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
  typedef std::map<string, DeviceTree*>::iterator ChildIter;

  // A direct sub device with this name is preferred over any deeper match.
  ChildIter direct = subdevices_.find(name);
  if (direct != subdevices_.end())
    return direct->second;

  // Otherwise search each sub device's own subtree, in map order, and stop
  // at the first hit.
  for (ChildIter child = subdevices_.begin();
       child != subdevices_.end();
       ++child) {
    DeviceTree *nested = child->second->UnlockedFindInSubTree(name);
    if (nested != 0)
      return nested;
  }
  return 0;
}

// Atomically add error instance to device.
// The pointer is appended to errors_ under this node's mutex; entries in
// errors_ are deleted by ~DeviceTree().
// A NULL error_instance is ignored: KnownBad() dereferences every stored
// entry, so storing NULL would crash a later query.
void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
  if (error_instance == NULL)
    return;
  pthread_mutex_lock(&device_tree_mutex_);
  errors_.push_back(error_instance);
  pthread_mutex_unlock(&device_tree_mutex_);
}

// Find or add queried device as necessary.
// If |name| is found anywhere in this node's subtree, that device is
// returned. Otherwise a new device is created as a direct sub device of
// this node and returned.
// The lookup and the insertion run under a single hold of this node's
// mutex, so the tree is searched once and no other caller locking this
// node can slip in between the two steps.
DeviceTree *DeviceTree::FindOrAddDevice(string name) {
  pthread_mutex_lock(&device_tree_mutex_);
  DeviceTree *device = UnlockedFindInSubTree(name);
  if (device == 0) {
    // Finish building the node before it becomes reachable via the map.
    device = new DeviceTree(name);
    device->parent_ = this;
    subdevices_[name] = device;
  }
  pthread_mutex_unlock(&device_tree_mutex_);
  return device;
}

// Pretty prints device tree.
// Writes one line per sub device to stdout: |spacer| followed by the sub
// device's map key. Each sub device then prints its own sub devices with
// the spacer doubled (spacer + spacer). No mutex is taken here.
void DeviceTree::PrettyPrint(string spacer) {
  typedef std::map<string, DeviceTree*>::iterator ChildIter;

  const string child_spacer = spacer + spacer;
  for (ChildIter child = subdevices_.begin();
       child != subdevices_.end();
       ++child) {
    printf("%s%s\n", spacer.c_str(), child->first.c_str());
    child->second->PrettyPrint(child_spacer);
  }
}

// Atomically add sub device.
// No-op if a device called |name| already exists anywhere in this node's
// subtree; otherwise a new device is attached directly under this node
// with its parent_ pointing back here.
void DeviceTree::InsertSubDevice(string name) {
  pthread_mutex_lock(&device_tree_mutex_);
  if (UnlockedFindInSubTree(name) == 0) {
    DeviceTree *child = new DeviceTree(name);
    child->parent_ = this;
    subdevices_[name] = child;
  }
  pthread_mutex_unlock(&device_tree_mutex_);
}


// Returns true of any error associated with this device is fatal.
bool DeviceTree::KnownBad() {
  pthread_mutex_lock(&device_tree_mutex_);
  for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
      itr != errors_.end();
      ++itr) {
    if ((*itr)->severity_ == SAT_ERROR_FATAL) {
      pthread_mutex_unlock(&device_tree_mutex_);
      return true;
    }
  }
  pthread_mutex_unlock(&device_tree_mutex_);
  return false;
}


// ErrorDiag constructor.
// Starts with no device tree and no platform handle; set_os() supplies
// both.
ErrorDiag::ErrorDiag() {
  system_tree_root_ = NULL;
  os_ = NULL;
}

// ErrorDiag destructor.
// Frees the device tree, if one was created. os_ is not deleted here.
ErrorDiag::~ErrorDiag() {
  // delete on a null pointer is a no-op, so no explicit test is needed.
  delete system_tree_root_;
}

// Set platform specific handle and initialize device tree.
// Stores |os| in os_, then creates the device tree.
// Returns false on error. true otherwise.
bool ErrorDiag::set_os(OsLayer *os) {
  os_ = os;
  const bool tree_ready = InitializeDeviceTree();
  return tree_ready;
}

// Create and initialize system device tree.
// Any tree left from an earlier call is freed first, so calling set_os()
// more than once no longer leaks the previous tree. The new tree starts
// with just a root named "system_root".
// Returns false on error. true otherwise.
bool ErrorDiag::InitializeDeviceTree() {
  // Clear the pointer before reallocating so that it never refers to a
  // freed tree if the allocation below does not complete.
  delete system_tree_root_;
  system_tree_root_ = NULL;

  system_tree_root_ = new DeviceTree("system_root");
  return system_tree_root_ != NULL;
}

// Logs info about a CECC.
// Records an ECC error with severity SAT_ERROR_CORRECTABLE on the device
// named |dimm_string|, creating that device if it does not exist yet.
// Returns -1 on error (including when the device tree has not been
// created by set_os()); 0 otherwise. This path does not report the error
// externally, so it never returns 1.
int ErrorDiag::AddCeccError(string dimm_string) {
  // The constructor leaves system_tree_root_ at 0 until set_os() runs.
  if (!system_tree_root_)
    return -1;

  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
  ECCErrorInstance *error = new ECCErrorInstance;
  if (!dimm_device || !error) {
    delete error;
    return -1;
  }
  error->severity_ = SAT_ERROR_CORRECTABLE;
  // From here on the device node holds the error pointer.
  dimm_device->AddErrorInstance(error);
  return 0;
}

// Logs info about a UECC.
// Records an ECC error with severity SAT_ERROR_FATAL on the device named
// |dimm_string|, creating that device if it does not exist yet.
// Returns -1 on error (including when the device tree has not been
// created by set_os()); 0 otherwise. This path does not report the error
// externally, so it never returns 1.
int ErrorDiag::AddUeccError(string dimm_string) {
  // The constructor leaves system_tree_root_ at 0 until set_os() runs.
  if (!system_tree_root_)
    return -1;

  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
  ECCErrorInstance *error = new ECCErrorInstance;
  if (!dimm_device || !error) {
    delete error;
    return -1;
  }
  error->severity_ = SAT_ERROR_FATAL;
  // From here on the device node holds the error pointer.
  dimm_device->AddErrorInstance(error);
  return 0;
}

// Logs info about a miscompare.
// Records a miscompare at |addr| with severity SAT_ERROR_FATAL on the
// device named |dimm_string| (created if missing), then reports it through
// os_->ErrorReport() with |count| as the error count.
// Returns -1 on error (including when set_os() has not supplied the device
// tree or the platform handle); 1 once the error has been recorded and
// reported externally.
int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
  // Both pointers are 0 until set_os() runs, and both are dereferenced
  // below.
  if (!system_tree_root_ || !os_)
    return -1;

  DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
  MiscompareErrorInstance *error = new MiscompareErrorInstance;
  if (!dimm_device || !error) {
    delete error;
    return -1;
  }
  error->severity_ = SAT_ERROR_FATAL;
  error->addr_ = addr;
  // From here on the device node holds the error pointer.
  dimm_device->AddErrorInstance(error);
  os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
  return 1;
}

// Utility function to translate a virtual address to a DIMM name.
// |offset| is a byte offset added to |addr| before translation. The result
// is whatever os->FindDimm() writes into a 256-byte buffer; the buffer
// starts out empty, so an empty string comes back if nothing is written.
string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
  char dimm_name[256] = "";
  char *const byte_ptr = reinterpret_cast<char*>(addr) + offset;
  const uint64 physical = os->VirtualToPhysical(byte_ptr);
  os->FindDimm(physical, dimm_name, sizeof(dimm_name));
  string result(dimm_name);
  return result;
}

// Info about a miscompare from a drive.
// Records an HDD miscompare on the device named |devicename| (created if
// missing). The DIMMs backing |src_addr| + |offset| and |dst_addr| +
// |offset| are added to the error's causes_ when their names resolve; if
// either DIMM is already KnownBad(), the drive is not blamed and no
// external report is made.
// Returns -1 on error (including when set_os() has not supplied the device
// tree or the platform handle), 1 if diagnoser reports error externally;
// 0 otherwise.
int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
                                     void *src_addr, void *dst_addr) {
  // Both pointers are 0 until set_os() runs, and both are dereferenced
  // below.
  if (!system_tree_root_ || !os_)
    return -1;

  bool mask_hdd_error = false;

  HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
  if (!error)
    return -1;

  error->addr_ = reinterpret_cast<uint64>(src_addr);
  error->addr2_ = reinterpret_cast<uint64>(dst_addr);
  error->offset_ = offset;
  error->block_ = block;

  string src_dimm = AddressToDimmString(os_, src_addr, offset);
  string dst_dimm = AddressToDimmString(os_, dst_addr, offset);

  // compare() returns non-zero when the strings differ, so these branches
  // run only when the name is something other than "DIMM Unknown".
  if (src_dimm.compare("DIMM Unknown")) {
    // Add src DIMM as possible miscompare cause.
    DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
    error->causes_.insert(src_dimm_dev);
    if (src_dimm_dev->KnownBad()) {
      mask_hdd_error = true;
      logprintf(5, "Log: supressed %s miscompare report: "
                "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
    }
  }
  if (dst_dimm.compare("DIMM Unknown")) {
    // Add dst DIMM as possible miscompare cause.
    DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
    error->causes_.insert(dst_dimm_dev);
    if (dst_dimm_dev->KnownBad()) {
      mask_hdd_error = true;
      logprintf(5, "Log: supressed %s miscompare report: "
                "known bad destination: %s\n", devicename.c_str(),
                dst_dimm.c_str());
    }
  }

  // Settle the severity before the record is attached to the device:
  // KnownBad() reads severity_ from attached records under the device
  // mutex, so it must not be written after attachment. A masked error
  // keeps whatever severity the error instance was constructed with.
  if (!mask_hdd_error)
    error->severity_ = SAT_ERROR_FATAL;

  DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
  hdd_dev->AddErrorInstance(error);

  // HDD error was not masked by bad DIMMs: report bad HDD.
  if (!mask_hdd_error) {
    os_->ErrorReport(devicename.c_str(), "miscompare", 1);
    return 1;
  }
  return 0;
}

// Info about a sector tag miscompare from a drive.
// Records a sector tag error on the device named |devicename| (created if
// missing). |offset| is used only to locate the DIMMs backing |src_addr|
// and |dst_addr|; it is not stored in the error. Resolved DIMMs are added
// to the error's causes_; if either DIMM is already KnownBad(), the drive
// is not blamed and no external report is made.
// Returns -1 on error (including when set_os() has not supplied the device
// tree or the platform handle), 1 if diagnoser reports error externally;
// 0 otherwise.
int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
                                    int sector, void *src_addr,
                                    void *dst_addr) {
  // Both pointers are 0 until set_os() runs, and both are dereferenced
  // below.
  if (!system_tree_root_ || !os_)
    return -1;

  bool mask_hdd_error = false;

  HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
  if (!error)
    return -1;

  error->addr_ = reinterpret_cast<uint64>(src_addr);
  error->addr2_ = reinterpret_cast<uint64>(dst_addr);
  error->sector_ = sector;
  error->block_ = block;

  string src_dimm = AddressToDimmString(os_, src_addr, offset);
  string dst_dimm = AddressToDimmString(os_, dst_addr, offset);

  // compare() returns non-zero when the strings differ, so these branches
  // run only when the name is something other than "DIMM Unknown".
  if (src_dimm.compare("DIMM Unknown")) {
    // Add src DIMM as possible miscompare cause.
    DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
    error->causes_.insert(src_dimm_dev);
    if (src_dimm_dev->KnownBad()) {
      mask_hdd_error = true;
      logprintf(5, "Log: supressed %s sector tag error report: "
                "known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
    }
  }
  if (dst_dimm.compare("DIMM Unknown")) {
    // Add dst DIMM as possible miscompare cause.
    DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
    error->causes_.insert(dst_dimm_dev);
    if (dst_dimm_dev->KnownBad()) {
      mask_hdd_error = true;
      logprintf(5, "Log: supressed %s sector tag error report: "
                "known bad destination: %s\n", devicename.c_str(),
                dst_dimm.c_str());
    }
  }

  // Settle the severity before the record is attached to the device:
  // KnownBad() reads severity_ from attached records under the device
  // mutex, so it must not be written after attachment. A masked error
  // keeps whatever severity the error instance was constructed with.
  if (!mask_hdd_error)
    error->severity_ = SAT_ERROR_FATAL;

  DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
  hdd_dev->AddErrorInstance(error);

  // HDD error was not masked by bad DIMMs: report bad HDD.
  if (!mask_hdd_error) {
    os_->ErrorReport(devicename.c_str(), "sector", 1);
    return 1;
  }
  return 0;
}