breakpad/src/processor/basic_source_line_resolver.cc

// Copyright (c) 2010 Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// basic_source_line_resolver.cc: BasicSourceLineResolver implementation.
//
// See basic_source_line_resolver.h and basic_source_line_resolver_types.h
// for documentation.

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>

#include <limits>
#include <map>
#include <utility>
#include <vector>

#include "google_breakpad/processor/basic_source_line_resolver.h"
#include "processor/basic_source_line_resolver_types.h"
#include "processor/module_factory.h"

#include "processor/tokenize.h"

using std::map;
using std::vector;
using std::make_pair;

namespace google_breakpad {

// Windows compatibility: when building for _WIN32, redirect the names used
// throughout this file to the spellings chosen below. strtok_r is only
// remapped (to strtok_s) under _MSC_VER; strtoull is remapped to _strtoui64
// for every _WIN32 build.
#ifdef _WIN32
#ifdef _MSC_VER
#define strtok_r strtok_s
#endif
#define strtoull _strtoui64
#endif

namespace {

// Utility function to tokenize given the presence of an optional initial
// field. In this case, optional_field is the expected string for the optional
// field, and max_tokens is the maximum number of tokens including the optional
// field. Refer to the documentation for Tokenize for descriptions of the other
// arguments.
bool TokenizeWithOptionalField(char *line,
                               const char *optional_field,
                               const char *separators,
                               int max_tokens,
                               vector<char*> *tokens) {
  // First tokenize assuming the optional field is not present.  If we then see
  // the optional field, additionally tokenize the last token into two tokens.
  if (!Tokenize(line, separators, max_tokens - 1, tokens)) {
    return false;
  }

  if (strcmp(tokens->front(), optional_field) == 0) {
    // The optional field is present. Split the last token in two to recover the
    // field prior to the last.
    vector<char*> last_tokens;
    if (!Tokenize(tokens->back(), separators, 2, &last_tokens)) {
      return false;
    }
    // Replace the previous last token with the two new tokens.
    tokens->pop_back();
    tokens->push_back(last_tokens[0]);
    tokens->push_back(last_tokens[1]);
  }

  return true;
}

}  // namespace

// Separator characters (space, carriage return, line feed) handed to Tokenize
// and used to recognize what may legally follow a parsed number.
static const char *kWhitespace = " \r\n";
// LogParseError only emits a log message for the first this-many errors.
static const int kMaxErrorsPrinted = 5;
// LoadMapFromMemory stops reading records once the error count exceeds this.
static const int kMaxErrorsBeforeBailing = 100;

// Constructs the resolver, passing a newly allocated BasicModuleFactory to the
// SourceLineResolverBase constructor.
BasicSourceLineResolver::BasicSourceLineResolver()
    : SourceLineResolverBase(new BasicModuleFactory) {
}

// static
// Counts one more parse error in |*num_errors| and, as long as the running
// total has not passed kMaxErrorsPrinted, logs |message|.  The message is
// prefixed with "Line <n>: " when |line_number| is positive.
void BasicSourceLineResolver::Module::LogParseError(
   const string &message,
   int line_number,
   int *num_errors) {
  ++(*num_errors);
  if (*num_errors > kMaxErrorsPrinted) {
    // Already logged enough; keep counting but stay quiet.
    return;
  }

  if (line_number <= 0) {
    BPLOG(ERROR) << message;
    return;
  }
  BPLOG(ERROR) << "Line " << line_number << ": " << message;
}

bool BasicSourceLineResolver::Module::LoadMapFromMemory(
    char *memory_buffer,
    size_t memory_buffer_size) {
  linked_ptr<Function> cur_func;
  int line_number = 0;
  int num_errors = 0;
  char *save_ptr;

  // If the length is 0, we can still pretend we have a symbol file. This is
  // for scenarios that want to test symbol lookup, but don't necessarily care
  // if certain modules do not have any information, like system libraries.
  if (memory_buffer_size == 0) {
    return true;
  }

  // Make sure the last character is null terminator.
  size_t last_null_terminator = memory_buffer_size - 1;
  if (memory_buffer[last_null_terminator] != '\0') {
    memory_buffer[last_null_terminator] = '\0';
  }

  // Skip any null terminators at the end of the memory buffer, and make sure
  // there are no other null terminators in the middle of the memory buffer.
  bool has_null_terminator_in_the_middle = false;
  while (last_null_terminator > 0 &&
         memory_buffer[last_null_terminator - 1] == '\0') {
    last_null_terminator--;
  }
  for (size_t i = 0; i < last_null_terminator; i++) {
    if (memory_buffer[i] == '\0') {
      memory_buffer[i] = '_';
      has_null_terminator_in_the_middle = true;
    }
  }
  if (has_null_terminator_in_the_middle) {
    LogParseError(
       "Null terminator is not expected in the middle of the symbol data",
       line_number,
       &num_errors);
  }

  char *buffer;
  buffer = strtok_r(memory_buffer, "\r\n", &save_ptr);

  while (buffer != NULL) {
    ++line_number;

    if (strncmp(buffer, "FILE ", 5) == 0) {
      if (!ParseFile(buffer)) {
        LogParseError("ParseFile on buffer failed", line_number, &num_errors);
      }
    } else if (strncmp(buffer, "STACK ", 6) == 0) {
      if (!ParseStackInfo(buffer)) {
        LogParseError("ParseStackInfo failed", line_number, &num_errors);
      }
    } else if (strncmp(buffer, "FUNC ", 5) == 0) {
      cur_func.reset(ParseFunction(buffer));
      if (!cur_func.get()) {
        LogParseError("ParseFunction failed", line_number, &num_errors);
      } else {
        // StoreRange will fail if the function has an invalid address or size.
        // We'll silently ignore this, the function and any corresponding lines
        // will be destroyed when cur_func is released.
        functions_.StoreRange(cur_func->address, cur_func->size, cur_func);
      }
    } else if (strncmp(buffer, "PUBLIC ", 7) == 0) {
      // Clear cur_func: public symbols don't contain line number information.
      cur_func.reset();

      if (!ParsePublicSymbol(buffer)) {
        LogParseError("ParsePublicSymbol failed", line_number, &num_errors);
      }
    } else if (strncmp(buffer, "MODULE ", 7) == 0) {
      // Ignore these.  They're not of any use to BasicSourceLineResolver,
      // which is fed modules by a SymbolSupplier.  These lines are present to
      // aid other tools in properly placing symbol files so that they can
      // be accessed by a SymbolSupplier.
      //
      // MODULE <guid> <age> <filename>
    } else if (strncmp(buffer, "INFO ", 5) == 0) {
      // Ignore these as well, they're similarly just for housekeeping.
      //
      // INFO CODE_ID <code id> <filename>
    } else {
      if (!cur_func.get()) {
        LogParseError("Found source line data without a function",
                       line_number, &num_errors);
      } else {
        Line *line = ParseLine(buffer);
        if (!line) {
          LogParseError("ParseLine failed", line_number, &num_errors);
        } else {
          cur_func->lines.StoreRange(line->address, line->size,
                                     linked_ptr<Line>(line));
        }
      }
    }
    if (num_errors > kMaxErrorsBeforeBailing) {
      break;
    }
    buffer = strtok_r(NULL, "\r\n", &save_ptr);
  }
  is_corrupt_ = num_errors > 0;
  return true;
}

// Fills in |frame|'s function and source line fields for the instruction it
// names.  A FUNC record covering the address wins; otherwise a PUBLIC symbol
// is used, provided it is not cut off by the nearest function.  If neither
// applies, |frame| is left untouched.
void BasicSourceLineResolver::Module::LookupAddress(StackFrame *frame) const {
  // Work with the address relative to the start of the module.
  MemAddr address = frame->instruction - frame->module->base_address();

  linked_ptr<Function> nearest_function;
  MemAddr function_base;
  MemAddr function_size;

  // RetrieveNearestRange is used rather than RetrieveRange so that, when no
  // function covers the address, the function that was found can still bound
  // the extent of the PUBLIC symbol considered below.  That means the
  // containment test must be done here; it is written so that it cannot
  // overflow.
  const bool inside_function =
      functions_.RetrieveNearestRange(address, &nearest_function,
                                      &function_base, NULL /* delta */,
                                      &function_size) &&
      address >= function_base &&
      address - function_base < function_size;

  if (inside_function) {
    frame->function_name = nearest_function->name;
    frame->function_base = frame->module->base_address() + function_base;

    linked_ptr<Line> source_line;
    MemAddr line_base;
    if (!nearest_function->lines.RetrieveRange(address, &source_line,
                                               &line_base, NULL /* delta */,
                                               NULL /* size */)) {
      return;
    }

    // The file name is only filled in when the file id is known.
    FileMap::const_iterator file_entry =
        files_.find(source_line->source_file_id);
    if (file_entry != files_.end()) {
      frame->source_file_name = file_entry->second;
    }
    frame->source_line = source_line->line;
    frame->source_line_base = frame->module->base_address() + line_base;
    return;
  }

  linked_ptr<PublicSymbol> public_symbol;
  MemAddr public_address;
  if (public_symbols_.Retrieve(address, &public_symbol, &public_address) &&
      (!nearest_function.get() || public_address > function_base)) {
    frame->function_name = public_symbol->name;
    frame->function_base = frame->module->base_address() + public_address;
  }
}

// Returns a newly allocated WindowsFrameInfo describing the code at |frame|'s
// instruction, or NULL if nothing is known about it.  The result is released
// from a scoped_ptr, so the caller receives a raw pointer to it.
//
// Sources are consulted in this order:
//   1. a STACK WIN record covering the address (copied wholesale);
//   2. a FUNC record covering the address (parameter size only);
//   3. a PUBLIC symbol at or below the address that is not cut off by the
//      nearest function (parameter size only).
WindowsFrameInfo *BasicSourceLineResolver::Module::FindWindowsFrameInfo(
    const StackFrame *frame) const {
  MemAddr address = frame->instruction - frame->module->base_address();
  scoped_ptr<WindowsFrameInfo> result(new WindowsFrameInfo());

  // We only know about WindowsFrameInfo::STACK_INFO_FRAME_DATA and
  // WindowsFrameInfo::STACK_INFO_FPO. Prefer them in this order.
  // WindowsFrameInfo::STACK_INFO_FRAME_DATA is the newer type that
  // includes its own program string.
  // WindowsFrameInfo::STACK_INFO_FPO is the older type
  // corresponding to the FPO_DATA struct. See stackwalker_x86.cc.
  linked_ptr<WindowsFrameInfo> frame_info;
  if ((windows_frame_info_[WindowsFrameInfo::STACK_INFO_FRAME_DATA]
       .RetrieveRange(address, &frame_info))
      || (windows_frame_info_[WindowsFrameInfo::STACK_INFO_FPO]
          .RetrieveRange(address, &frame_info))) {
    result->CopyFrom(*frame_info.get());
    return result.release();
  }

  // Even without a relevant STACK line, many functions contain
  // information about how much space their parameters consume on the
  // stack. Use RetrieveNearestRange instead of RetrieveRange, so that
  // we can use the function to bound the extent of the PUBLIC symbol,
  // below. However, this does mean we need to check that ADDRESS
  // falls within the retrieved function's range; do the range
  // comparison in an overflow-friendly way.
  linked_ptr<Function> function;
  MemAddr function_base, function_size;
  if (functions_.RetrieveNearestRange(address, &function, &function_base,
                                      NULL /* delta */, &function_size) &&
      address >= function_base && address - function_base < function_size) {
    result->parameter_size = function->parameter_size;
    result->valid |= WindowsFrameInfo::VALID_PARAMETER_SIZE;
    return result.release();
  }

  // PUBLIC symbols might have a parameter size. Use the function we
  // found above to limit the range the public symbol covers.
  linked_ptr<PublicSymbol> public_symbol;
  MemAddr public_address;
  if (public_symbols_.Retrieve(address, &public_symbol, &public_address) &&
      (!function.get() || public_address > function_base)) {
    result->parameter_size = public_symbol->parameter_size;
    // Mark the field valid and hand the result back, exactly as the FUNC
    // case above does.  Previously the parameter size was stored and then
    // thrown away because control fell through to "return NULL".
    result->valid |= WindowsFrameInfo::VALID_PARAMETER_SIZE;
    return result.release();
  }

  // Nothing is known about this address.
  return NULL;
}

// Builds the set of CFI register recovery rules in effect at |frame|'s
// instruction.  Returns a newly allocated CFIFrameInfo, or NULL when no
// STACK CFI INIT record covers the address or that record's rules cannot be
// parsed.
CFIFrameInfo *BasicSourceLineResolver::Module::FindCFIFrameInfo(
    const StackFrame *frame) const {
  MemAddr address = frame->instruction - frame->module->base_address();

  // Locate the initial rule whose range covers the address; it supplies the
  // starting rule set.
  string init_rule_text;
  MemAddr init_start, init_length;
  const bool have_init =
      cfi_initial_rules_.RetrieveRange(address, &init_rule_text, &init_start,
                                       NULL /* delta */, &init_length);
  if (!have_init) {
    return NULL;
  }

  // Seed a fresh frame info structure from the STACK CFI INIT record.
  scoped_ptr<CFIFrameInfo> frame_rules(new CFIFrameInfo());
  if (!ParseCFIRuleSet(init_rule_text, frame_rules.get())) {
    return NULL;
  }

  // Starting with the first delta rule at or after the initial rule's start
  // address, apply every delta rule up to and including the frame's address.
  // The result of parsing an individual delta rule is not checked.
  for (map<MemAddr, string>::const_iterator rule =
           cfi_delta_rules_.lower_bound(init_start);
       rule != cfi_delta_rules_.end() && rule->first <= address;
       ++rule) {
    ParseCFIRuleSet(rule->second, frame_rules.get());
  }

  return frame_rules.release();
}

// Parses a FILE record via SymbolParseHelper::ParseFile and records the
// resulting id/name pair in files_.  Returns false if the helper rejects the
// record.
bool BasicSourceLineResolver::Module::ParseFile(char *file_line) {
  long file_id;
  char *file_name;
  if (!SymbolParseHelper::ParseFile(file_line, &file_id, &file_name)) {
    return false;
  }

  files_.insert(make_pair(file_id, string(file_name)));
  return true;
}

// Parses a FUNC record via SymbolParseHelper::ParseFunction.  Returns a newly
// allocated Function built from the parsed fields, or NULL if the helper
// rejects the record.
BasicSourceLineResolver::Function*
BasicSourceLineResolver::Module::ParseFunction(char *function_line) {
  bool multiple;
  uint64_t start_address;
  uint64_t byte_size;
  long param_bytes;
  char *function_name;

  const bool parsed =
      SymbolParseHelper::ParseFunction(function_line, &multiple,
                                       &start_address, &byte_size,
                                       &param_bytes, &function_name);
  if (!parsed) {
    return NULL;
  }
  return new Function(function_name, start_address, byte_size, param_bytes,
                      multiple);
}

// Parses a source line record via SymbolParseHelper::ParseLine.  Returns a
// newly allocated Line built from the parsed fields, or NULL if the helper
// rejects the record.
BasicSourceLineResolver::Line* BasicSourceLineResolver::Module::ParseLine(
    char *line_line) {
  uint64_t start_address;
  uint64_t byte_size;
  long source_line_number;
  long file_id;

  const bool parsed =
      SymbolParseHelper::ParseLine(line_line, &start_address, &byte_size,
                                   &source_line_number, &file_id);
  if (!parsed) {
    return NULL;
  }
  return new Line(start_address, byte_size, file_id, source_line_number);
}

bool BasicSourceLineResolver::Module::ParsePublicSymbol(char *public_line) {
  bool is_multiple;
  uint64_t address;
  long stack_param_size;
  char *name;

  if (SymbolParseHelper::ParsePublicSymbol(public_line, &is_multiple, &address,
                                           &stack_param_size, &name)) {
    // A few public symbols show up with an address of 0.  This has been seen
    // in the dumped output of ntdll.pdb for symbols such as _CIlog, _CIpow,
    // RtlDescribeChunkLZNT1, and RtlReserveChunkLZNT1.  They would conflict
    // with one another if they were allowed into the public_symbols_ map,
    // but since the address is obviously invalid, gracefully accept them
    // as input without putting them into the map.
    if (address == 0) {
      return true;
    }

    linked_ptr<PublicSymbol> symbol(new PublicSymbol(name, address,
                                                     stack_param_size,
                                                     is_multiple));
    return public_symbols_.Store(address, symbol);
  }
  return false;
}

// Parses a STACK record.  |stack_info_line| must begin with "STACK "; the
// token that follows selects the kind of information: "WIN" records are
// parsed with WindowsFrameInfo::ParseFromString and stored in
// windows_frame_info_, "CFI" records are passed to ParseCFIFrameInfo.
// The line is modified in place.  Returns false for an unrecognized kind or
// when the selected parser fails.
bool BasicSourceLineResolver::Module::ParseStackInfo(char *stack_info_line) {
  // Skip "STACK " prefix.
  stack_info_line += 6;

  // Find the token indicating what sort of stack frame walking
  // information this is.
  while (*stack_info_line == ' ')
    stack_info_line++;
  const char *platform = stack_info_line;
  while (!strchr(kWhitespace, *stack_info_line))
    stack_info_line++;

  // Terminate the platform token and step to the rest of the record.  The
  // scan above also stops on the line's own terminating NUL, because strchr
  // considers the terminator part of the string it searches.  In that case
  // nothing follows the token, so stay on the terminator: stepping past it
  // would make the parsers below read whatever lies after this line (the
  // next record, or memory beyond the end of the buffer).
  if (*stack_info_line != '\0')
    *stack_info_line++ = '\0';

  // MSVC stack frame info.
  if (strcmp(platform, "WIN") == 0) {
    int type = 0;
    uint64_t rva, code_size;
    linked_ptr<WindowsFrameInfo>
      stack_frame_info(WindowsFrameInfo::ParseFromString(stack_info_line,
                                                         type,
                                                         rva,
                                                         code_size));
    if (stack_frame_info == NULL)
      return false;

    // TODO(mmentovai): I wanted to use StoreRange's return value as this
    // method's return value, but MSVC infrequently outputs stack info that
    // violates the containment rules.  This happens with a section of code
    // in strncpy_s in test_app.cc (testdata/minidump2).  There, problem looks
    // like this:
    //   STACK WIN 4 4242 1a a 0 ...  (STACK WIN 4 base size prolog 0 ...)
    //   STACK WIN 4 4243 2e 9 0 ...
    // ContainedRangeMap treats these two blocks as conflicting.  In reality,
    // when the prolog lengths are taken into account, the actual code of
    // these blocks doesn't conflict.  However, we can't take the prolog lengths
    // into account directly here because we'd wind up with a different set
    // of range conflicts when MSVC outputs stack info like this:
    //   STACK WIN 4 1040 73 33 0 ...
    //   STACK WIN 4 105a 59 19 0 ...
    // because in both of these entries, the beginning of the code after the
    // prolog is at 0x1073, and the last byte of contained code is at 0x10b2.
    // Perhaps we could get away with storing ranges by rva + prolog_size
    // if ContainedRangeMap were modified to allow replacement of
    // already-stored values.

    // NOTE(review): |type| indexes windows_frame_info_ directly; this relies
    // on ParseFromString only succeeding with an in-range type -- confirm.
    windows_frame_info_[type].StoreRange(rva, code_size, stack_frame_info);
    return true;
  } else if (strcmp(platform, "CFI") == 0) {
    // DWARF CFI stack frame info
    return ParseCFIFrameInfo(stack_info_line);
  } else {
    // Something unrecognized.
    return false;
  }
}

// Parses the remainder of a STACK CFI record, i.e. everything after
// "STACK CFI ".  INIT records are stored in cfi_initial_rules_ keyed by
// their address range; all other records are treated as delta records and
// stored in cfi_delta_rules_ keyed by address.  |stack_info_line| is split in
// place with strtok_r.  Returns false if a required field is missing.
//
// Numeric fields are read as hexadecimal without further validation, so text
// that is not a number is stored as 0.
bool BasicSourceLineResolver::Module::ParseCFIFrameInfo(
    char *stack_info_line) {
  char *cursor;

  // Is this an INIT record or a delta record?
  char *init_or_address = strtok_r(stack_info_line, " \r\n", &cursor);
  if (!init_or_address)
    return false;

  if (strcmp(init_or_address, "INIT") == 0) {
    // This record has the form "STACK CFI INIT <address> <size> <rules...>".
    char *address_field = strtok_r(NULL, " \r\n", &cursor);
    if (!address_field) return false;

    char *size_field = strtok_r(NULL, " \r\n", &cursor);
    if (!size_field) return false;

    char *initial_rules = strtok_r(NULL, "\r\n", &cursor);
    if (!initial_rules) return false;

    // Use strtoull, like the other address parsers in this file, so that
    // values wider than 32 bits are not clamped on platforms where
    // unsigned long is only 32 bits wide.
    MemAddr address = strtoull(address_field, NULL, 16);
    MemAddr size    = strtoull(size_field,    NULL, 16);
    cfi_initial_rules_.StoreRange(address, size, initial_rules);
    return true;
  }

  // This record has the form "STACK CFI <address> <rules...>".
  char *address_field = init_or_address;
  char *delta_rules = strtok_r(NULL, "\r\n", &cursor);
  if (!delta_rules) return false;
  MemAddr address = strtoull(address_field, NULL, 16);
  cfi_delta_rules_[address] = delta_rules;
  return true;
}

// static
// Parses a record of the form "FILE <id> <filename>".  On success stores the
// decimal id in |*index| and a pointer to the file name (inside |file_line|)
// in |*filename|.  Returns false if the record cannot be split into two
// fields or the id is negative, malformed, or equal to the largest long.
bool SymbolParseHelper::ParseFile(char *file_line, long *index,
                                  char **filename) {
  // FILE <id> <filename>
  assert(strncmp(file_line, "FILE ", 5) == 0);
  char *fields = file_line + 5;  // everything after the prefix

  vector<char*> tokens;
  if (!Tokenize(fields, kWhitespace, 2, &tokens)) {
    return false;
  }

  char *number_end;
  *index = strtol(tokens[0], &number_end, 10);
  const bool index_ok = IsValidAfterNumber(number_end) &&
                        *index >= 0 &&
                        *index != std::numeric_limits<long>::max();
  if (!index_ok) {
    return false;
  }

  *filename = tokens[1];
  return *filename != NULL;
}

// static
// Parses a record of the form
// "FUNC [<multiple>] <address> <size> <stack_param_size> <name>", where the
// optional <multiple> field is the literal "m".  The three numbers are
// hexadecimal.  On success fills in every output parameter; |*name| points
// into |function_line|.  Returns false if the record cannot be tokenized or a
// number is malformed, equal to its type's maximum value, or (for the stack
// parameter size) negative.
bool SymbolParseHelper::ParseFunction(char *function_line, bool *is_multiple,
                                      uint64_t *address, uint64_t *size,
                                      long *stack_param_size, char **name) {
  // FUNC [<multiple>] <address> <size> <stack_param_size> <name>
  assert(strncmp(function_line, "FUNC ", 5) == 0);
  char *fields = function_line + 5;  // everything after the prefix

  vector<char*> tokens;
  if (!TokenizeWithOptionalField(fields, "m", kWhitespace, 5, &tokens)) {
    return false;
  }

  *is_multiple = strcmp(tokens[0], "m") == 0;

  // When the "m" marker is present every other field shifts right by one.
  const int first = *is_multiple ? 1 : 0;
  char *const address_token = tokens[first];
  char *const size_token = tokens[first + 1];
  char *const param_size_token = tokens[first + 2];
  char *const name_token = tokens[first + 3];

  char *number_end;

  *address = strtoull(address_token, &number_end, 16);
  if (!IsValidAfterNumber(number_end)) {
    return false;
  }
  if (*address == std::numeric_limits<unsigned long long>::max()) {
    return false;
  }

  *size = strtoull(size_token, &number_end, 16);
  if (!IsValidAfterNumber(number_end)) {
    return false;
  }
  if (*size == std::numeric_limits<unsigned long long>::max()) {
    return false;
  }

  *stack_param_size = strtol(param_size_token, &number_end, 16);
  if (!IsValidAfterNumber(number_end)) {
    return false;
  }
  if (*stack_param_size == std::numeric_limits<long>::max() ||
      *stack_param_size < 0) {
    return false;
  }

  *name = name_token;
  return true;
}

// static
// Parses a record of the form
// "<address> <size> <line number> <source file id>".  The address and size
// are hexadecimal; the line number and file id are decimal.  Returns false if
// the record does not split into four fields or any number is malformed,
// equal to its type's maximum value, or (for the line number and file id)
// negative.
bool SymbolParseHelper::ParseLine(char *line_line, uint64_t *address,
                                  uint64_t *size, long *line_number,
                                  long *source_file) {
  // <address> <size> <line number> <source file id>
  vector<char*> tokens;
  if (!Tokenize(line_line, kWhitespace, 4, &tokens)) {
    return false;
  }

  char *number_end;

  *address = strtoull(tokens[0], &number_end, 16);
  const bool address_ok =
      IsValidAfterNumber(number_end) &&
      *address != std::numeric_limits<unsigned long long>::max();
  if (!address_ok) {
    return false;
  }

  *size = strtoull(tokens[1], &number_end, 16);
  const bool size_ok =
      IsValidAfterNumber(number_end) &&
      *size != std::numeric_limits<unsigned long long>::max();
  if (!size_ok) {
    return false;
  }

  *line_number = strtol(tokens[2], &number_end, 10);
  const bool line_number_parsed =
      IsValidAfterNumber(number_end) &&
      *line_number != std::numeric_limits<long>::max();
  if (!line_number_parsed) {
    return false;
  }

  *source_file = strtol(tokens[3], &number_end, 10);
  const bool source_file_ok =
      IsValidAfterNumber(number_end) &&
      *source_file >= 0 &&
      *source_file != std::numeric_limits<long>::max();
  if (!source_file_ok) {
    return false;
  }

  // Valid line numbers normally start from 1.  However, some functions are
  // associated with a source file but with no line number (block helper
  // functions), and for those the symbol file contains 0 as the line number.
  // So 0 is accepted and only negative values are rejected.  For more
  // information on block helper functions, see:
  // http://clang.llvm.org/docs/Block-ABI-Apple.html
  return *line_number >= 0;
}

// static
// Parses a record of the form
// "PUBLIC [<multiple>] <address> <stack_param_size> <name>", where the
// optional <multiple> field is the literal "m".  Both numbers are
// hexadecimal.  On success fills in every output parameter; |*name| points
// into |public_line|.  Returns false if the record cannot be tokenized or a
// number is malformed, equal to its type's maximum value, or (for the stack
// parameter size) negative.
bool SymbolParseHelper::ParsePublicSymbol(char *public_line, bool *is_multiple,
                                          uint64_t *address,
                                          long *stack_param_size,
                                          char **name) {
  // PUBLIC [<multiple>] <address> <stack_param_size> <name>
  assert(strncmp(public_line, "PUBLIC ", 7) == 0);
  char *fields = public_line + 7;  // everything after the prefix

  vector<char*> tokens;
  if (!TokenizeWithOptionalField(fields, "m", kWhitespace, 4, &tokens)) {
    return false;
  }

  *is_multiple = strcmp(tokens[0], "m") == 0;

  // When the "m" marker is present every other field shifts right by one.
  const int first = *is_multiple ? 1 : 0;
  char *const address_token = tokens[first];
  char *const param_size_token = tokens[first + 1];
  char *const name_token = tokens[first + 2];

  char *number_end;

  *address = strtoull(address_token, &number_end, 16);
  if (!IsValidAfterNumber(number_end)) {
    return false;
  }
  if (*address == std::numeric_limits<unsigned long long>::max()) {
    return false;
  }

  *stack_param_size = strtol(param_size_token, &number_end, 16);
  if (!IsValidAfterNumber(number_end)) {
    return false;
  }
  if (*stack_param_size == std::numeric_limits<long>::max() ||
      *stack_param_size < 0) {
    return false;
  }

  *name = name_token;
  return true;
}

// static
// Returns true when |after_number| is non-NULL and the character it points at
// is found by strchr in kWhitespace (which, per strchr, includes the string
// terminator).  Used to check what follows a number parsed by strtol or
// strtoull.
bool SymbolParseHelper::IsValidAfterNumber(char *after_number) {
  if (after_number == NULL) {
    return false;
  }
  return strchr(kWhitespace, *after_number) != NULL;
}

}  // namespace google_breakpad