#!/usr/bin/env python
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import bisect
import collections
import gzip
import json
import os
import re
import subprocess
import sys

# Add ../third_party/symbols (relative to the symlink-resolved location of this
# script) to the module search path before importing the 'symbols' package
# below.
_SYMBOLS_PATH = os.path.abspath(os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    '..',
    'third_party',
    'symbols'))
sys.path.append(_SYMBOLS_PATH)
# pylint: disable=import-error
import symbols.elf_symbolizer as elf_symbolizer


# Relevant trace event phases from Chromium's
# src/base/trace_event/common/trace_event_common.h.
TRACE_EVENT_PHASE_METADATA = 'M'
TRACE_EVENT_PHASE_MEMORY_DUMP = 'v'


# Matches Android library paths. Supports both the K layout
# (/data/app-lib/<>/lib.so) and the L+ layout (/data/app/<>/lib/<>/lib.so).
# The library name is available via the 'name' group.
ANDROID_PATH_MATCHER = re.compile(
    r'^/data/(?:app/[^/]+/lib/[^/]+/|app-lib/[^/]+/)(?P<name>.*\.so)')

# Subpath of the output path where unstripped libraries are stored.
ANDROID_UNSTRIPPED_SUBPATH = 'lib.unstripped'


def FindInSystemPath(binary_name):
  """Searches the directories listed in PATH for an executable file.

  Args:
    binary_name: File name of the program to look for (e.g. 'addr2line').

  Returns:
    Path of the first executable regular file called |binary_name|, trying
    the PATH entries in order, or None if there is no such file or the PATH
    environment variable is not set.
  """
  path_value = os.environ.get('PATH')
  if path_value is None:
    # No PATH at all simply means there is nowhere to look.
    return None

  for directory in path_value.split(os.pathsep):
    candidate = os.path.join(directory, binary_name)
    # Skip files that are not executable so that a later, runnable match in
    # PATH can be found instead.
    if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
      return candidate
  return None


def IsSymbolizableFile(file_path):
  """Checks whether the 'file' utility reports |file_path| as ELF or Mach-O.

  Args:
    file_path: Path of the file to inspect.

  Returns:
    True if the type description printed by 'file' starts with a 32-bit or
    64-bit ELF / Mach-O designation, False otherwise.

  Raises:
    Whatever subprocess.check_output() raises when 'file' cannot be run or
    exits with a non-zero status.
  """
  output = subprocess.check_output(['file', '-0', file_path])
  # With -0 the file name is followed by a NUL; what comes after it is the
  # ': <type>' description. check_output() returns bytes on Python 3, so the
  # literals below are bytes too (on Python 2 they are ordinary strings).
  type_string = output[output.find(b'\0') + 1:]
  return bool(re.match(br'\: (ELF|Mach-O) (32|64)-bit\b', type_string))


class ProcessMemoryMaps(object):
  """Represents 'process_mmaps' trace file entry.

  Keeps the entry's regions sorted by start address, with regions that share
  a start address collapsed into one, and finds the region that contains a
  given address.
  """

  class Region(object):
    """A mapped memory region covering [start_address, end_address).

    Regions are ordered and compared for equality by start address only, and
    may also be compared against plain integers (addresses). That is what
    allows a sorted list of regions to be searched with bisect using an
    address as the key. Comparing with anything else raises Exception.
    """

    # Integer types accepted in comparisons; 'long' exists only on Python 2.
    try:
      _INTEGER_TYPES = (int, long)  # pylint: disable=undefined-variable
    except NameError:
      _INTEGER_TYPES = (int,)

    def __init__(self, start_address, size, file_path):
      self._start_address = start_address
      self._size = size
      self._file_path = file_path

    @property
    def start_address(self):
      return self._start_address

    @property
    def end_address(self):
      # Exclusive: the first address past the region.
      return self._start_address + self._size

    @property
    def size(self):
      return self._size

    @property
    def file_path(self):
      return self._file_path

    def _AddressOf(self, other):
      """Returns the address |other| stands for in a comparison.

      Raises:
        Exception: |other| is neither a Region nor an integer.
      """
      if isinstance(other, type(self)):
        return other._start_address
      if isinstance(other, self._INTEGER_TYPES):
        return other
      raise Exception('Cannot compare with %s' % type(other))

    # Rich comparisons are used instead of __cmp__ because they work on both
    # Python 2 and Python 3. All six are spelled out so that the reflected
    # forms (e.g. 'address < region') resolve as well.
    def __eq__(self, other):
      return self._start_address == self._AddressOf(other)

    def __ne__(self, other):
      return self._start_address != self._AddressOf(other)

    def __lt__(self, other):
      return self._start_address < self._AddressOf(other)

    def __le__(self, other):
      return self._start_address <= self._AddressOf(other)

    def __gt__(self, other):
      return self._start_address > self._AddressOf(other)

    def __ge__(self, other):
      return self._start_address >= self._AddressOf(other)

    def __hash__(self):
      # Consistent with __eq__, which looks at the start address only.
      return hash(self._start_address)

    def __repr__(self):
      return 'Region(0x{:X} - 0x{:X}, {})'.format(
          self.start_address, self.end_address, self.file_path)

  def __init__(self, process_mmaps):
    """Parses 'process_mmaps' dictionary.

    Args:
      process_mmaps: Dictionary whose 'vm_regions' list holds one dictionary
          per region with hexadecimal strings 'sa' (start address) and 'sz'
          (size), and 'mf' (mapped file path).

    Raises:
      AssertionError: Two regions with different start addresses overlap.
    """
    # int() transparently yields a long on Python 2 when the value needs it.
    regions = sorted(
        self.Region(int(region_value['sa'], 16),
                    int(region_value['sz'], 16),
                    region_value['mf'])
        for region_value in process_mmaps['vm_regions'])

    # Copy regions without duplicates and check for overlaps. Equality looks
    # at the start address only, so of several regions starting at the same
    # address just the first one (in sorted order) is kept.
    self._regions = []
    previous_region = None
    for region in regions:
      if previous_region is not None:
        if region == previous_region:
          continue
        assert region.start_address >= previous_region.end_address, \
            'Regions {} and {} overlap.'.format(previous_region, region)
      previous_region = region
      self._regions.append(region)

  @property
  def regions(self):
    return self._regions

  def FindRegion(self, address):
    """Finds region containing |address|. Returns None if none found."""

    # The rightmost region starting at or before |address| is the only
    # candidate; it still has to extend far enough to contain the address.
    region_index = bisect.bisect_right(self._regions, address) - 1
    if region_index >= 0:
      region = self._regions[region_index]
      if address >= region.start_address and address < region.end_address:
        return region
    return None


class StackFrames(object):
  """Represents 'stackFrames' trace file entry.

  Only frames whose name is an unsymbolized program counter ('pc:<hex>') are
  tracked; they are exposed through |pc_frames|.
  """

  class PCFrame(object):
    """A stack frame named after a program counter.

    Wraps the frame's dictionary from the trace: assigning |name| writes
    through to that dictionary and marks the frame as modified.
    """

    def __init__(self, pc, frame):
      self._modified = False
      self._pc = pc
      self._frame = frame

    @property
    def modified(self):
      return self._modified

    @property
    def pc(self):
      return self._pc

    @property
    def name(self):
      return self._frame['name']

    @name.setter
    def name(self, value):
      self._modified = True
      self._frame['name'] = value

  # Prefix of frame names that hold a program counter in hexadecimal.
  _PC_TAG = 'pc:'

  def __init__(self, stack_frames):
    """Constructs object using 'stackFrames' dictionary.

    Args:
      stack_frames: Dictionary whose values are frame dictionaries, each
          with a 'name' entry.
    """
    self._pc_frames = []
    # values() rather than itervalues() so this runs on Python 2 and 3.
    for frame in stack_frames.values():
      pc_frame = self._ParsePCFrame(frame)
      if pc_frame:
        self._pc_frames.append(pc_frame)

  @property
  def pc_frames(self):
    return self._pc_frames

  @property
  def modified(self):
    return any(f.modified for f in self._pc_frames)

  @classmethod
  def _ParsePCFrame(cls, frame):
    """Returns a PCFrame for |frame|, or None if its name is not 'pc:<hex>'."""
    name = frame['name']
    if not name.startswith(cls._PC_TAG):
      return None
    # int() transparently yields a long on Python 2 when the value needs it.
    pc = int(name[len(cls._PC_TAG):], 16)
    return cls.PCFrame(pc, frame)


class Process(object):
  """Bundle of per-process data collected from a trace file.

  Apart from |pid|, every attribute starts out as None and is meant to be
  assigned later by the code that reads the trace.
  """

  def __init__(self, pid):
    # Nothing but the pid is known at construction time.
    self.name = self.mmaps = self.stack_frames = None
    self.pid = pid


def CollectProcesses(trace):
  """Parses trace and returns the processes suitable for symbolization.

  A process is suitable when the trace has both a non-empty 'process_mmaps'
  memory dump and a 'stackFrames' metadata event for it.

  Args:
    trace: Parsed trace JSON: either a dictionary with a 'traceEvents' list
        or a bare list of events.

  Returns:
    List of Process objects with |mmaps| and |stack_frames| set. When a
    process has several 'process_mmaps' dumps, the last non-empty one is
    used; it is the only one that gets parsed.
  """

  process_map = {}
  # Last non-empty raw 'process_mmaps' dictionary seen for each pid. Parsing
  # is deferred until all events have been read so that dumps which end up
  # superseded (or belong to processes without stack frames) cost nothing.
  raw_mmaps_by_pid = {}

  # Android traces produced via 'chrome://inspect/?tracing#devices' are
  # just a list of events.
  events = trace if isinstance(trace, list) else trace['traceEvents']
  for event in events:
    name = event.get('name')
    if not name:
      continue

    pid = event['pid']
    process = process_map.get(pid)
    if process is None:
      process = Process(pid)
      process_map[pid] = process

    phase = event['ph']
    if phase == TRACE_EVENT_PHASE_METADATA:
      if name == 'process_name':
        process.name = event['args']['name']
      elif name == 'stackFrames':
        process.stack_frames = StackFrames(event['args']['stackFrames'])
    elif phase == TRACE_EVENT_PHASE_MEMORY_DUMP:
      process_mmaps = event['args']['dumps'].get('process_mmaps')
      if process_mmaps:
        raw_mmaps_by_pid[pid] = process_mmaps

  processes = []
  # items() rather than iteritems() so this runs on Python 2 and 3.
  for pid, process in process_map.items():
    process_mmaps = raw_mmaps_by_pid.get(pid)
    if process_mmaps is None or not process.stack_frames:
      continue
    process.mmaps = ProcessMemoryMaps(process_mmaps)
    processes.append(process)
  return processes


class SymbolizableFile(object):
  """Pairs a file path with the addresses to symbolize inside that file.

  Acts as the glue between ELFSymbolizer and a trace file: the keys of
  |frames_by_address| say what to symbolize, and the stack frames stored
  under each key are what gets updated with the symbolization result.
  """
  def __init__(self, file_path):
    # address -> list of stack frames; unseen addresses start with an empty
    # list.
    self.frames_by_address = collections.defaultdict(list)
    self.path = file_path


def ResolveSymbolizableFiles(processes):
  """Groups the PCs of all processes by the file they are mapped from.

  Each stack frame's PC is looked up in its process's memory maps. A frame
  whose PC falls inside a region is filed, under the PC's offset from the
  region start, in the SymbolizableFile for that region's file path. A frame
  whose PC is not inside any region is renamed to '<unresolved>'.

  Returns the SymbolizableFiles, one per distinct file path.
  """
  files_by_path = {}
  for proc in processes:
    for pc_frame in proc.stack_frames.pc_frames:
      containing_region = proc.mmaps.FindRegion(pc_frame.pc)
      if containing_region is not None:
        path = containing_region.file_path
        if path not in files_by_path:
          files_by_path[path] = SymbolizableFile(path)
        # Addresses are stored relative to the start of the mapping.
        offset = pc_frame.pc - containing_region.start_address
        files_by_path[path].frames_by_address[offset].append(pc_frame)
      else:
        pc_frame.name = '<unresolved>'
  return files_by_path.values()


def SymbolizeFiles(symfiles, addr2line_path):
  """Symbolizes each file in the given list of SymbolizableFiles
     and updates stack frames with symbolization results.

  Files whose path is not absolute, does not exist, or is not reported as
  symbolizable are not symbolized; all of their frames are renamed to
  '<path>' ('<unnamed>' when the path is empty) instead.

  Args:
    symfiles: Iterable of SymbolizableFile-like objects (|path| and
        |frames_by_address|).
    addr2line_path: Path of the addr2line binary, passed to ELFSymbolizer.

  Returns:
    True if at least one file was handed to the symbolizer, False otherwise.
  """
  # print() with a single argument behaves the same on Python 2 and 3.
  print('Symbolizing...')

  def _SubPrintf(message, *args):
    print(('  ' + message).format(*args))

  symbolized = False
  for symfile in symfiles:
    unsymbolized_name = '<{}>'.format(
        symfile.path if symfile.path else 'unnamed')

    problem = None
    if not os.path.isabs(symfile.path):
      problem = 'not a file'
    elif not os.path.isfile(symfile.path):
      problem = "file doesn't exist"
    elif not IsSymbolizableFile(symfile.path):
      problem = 'file is not symbolizable'
    if problem:
      _SubPrintf("Won't symbolize {} PCs for '{}': {}.",
                 len(symfile.frames_by_address),
                 symfile.path,
                 problem)
      for frames in symfile.frames_by_address.values():
        for frame in frames:
          frame.name = unsymbolized_name
      continue

    def _SymbolizerCallback(sym_info, frames):
      # Unwind inline chain to the top.
      while sym_info.inlined_by:
        sym_info = sym_info.inlined_by

      # Fall back to the placeholder when no symbol name was produced.
      symbolized_name = sym_info.name if sym_info.name else unsymbolized_name
      for frame in frames:
        frame.name = symbolized_name

    symbolizer = elf_symbolizer.ELFSymbolizer(symfile.path,
                                              addr2line_path,
                                              _SymbolizerCallback,
                                              inlines=True)

    _SubPrintf('Symbolizing {} PCs from {}...',
               len(symfile.frames_by_address),
               symfile.path)

    for address, frames in symfile.frames_by_address.items():
      # SymbolizeAsync() asserts that the type of address is int. We operate
      # on longs (since they are raw pointers possibly from 64-bit processes).
      # It's OK to cast here because we're passing relative PC, which should
      # always fit into int.
      symbolizer.SymbolizeAsync(int(address), frames)

    # NOTE(review): presumably blocks until all pending SymbolizeAsync()
    # requests have invoked the callback - confirm against elf_symbolizer.
    symbolizer.Join()
    symbolized = True

  return symbolized


def HaveFilesFromAndroid(symfiles):
  """Tells whether the path of any file in |symfiles| matches
     ANDROID_PATH_MATCHER."""
  for symfile in symfiles:
    if ANDROID_PATH_MATCHER.match(symfile.path):
      return True
  return False


def RemapAndroidFiles(symfiles, output_path):
  """Points files with Android device paths at their unstripped copies.

  Every file whose path matches ANDROID_PATH_MATCHER gets its path replaced
  in place by <output_path>/<ANDROID_UNSTRIPPED_SUBPATH>/<library name>.
  Other files are left untouched.
  """
  for symfile in symfiles:
    android_match = ANDROID_PATH_MATCHER.match(symfile.path)
    if not android_match:
      continue
    symfile.path = os.path.join(
        output_path, ANDROID_UNSTRIPPED_SUBPATH, android_match.group('name'))


# Suffix appended to the trace file path to form the backup file path.
BACKUP_FILE_TAG = '.BACKUP'

def main():
  """Symbolizes the trace file named on the command line, in place.

  Reads the trace (.json or .json.gz), resolves 'pc:<hex>' stack frames to
  the files they are mapped from, symbolizes them with addr2line and, if
  anything was symbolized, writes the updated trace back to the same path.
  Unless --no-backup is given, the original file is first renamed to
  <file>.BACKUP.

  Exits with an error message when addr2line is not in PATH, and with a
  usage error when the trace appears to come from Android but
  --output-directory was not specified.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('file',
                      help='Trace file to symbolize (.json or .json.gz)')
  parser.add_argument('--no-backup',
                      dest='backup', default=True, action='store_false',
                      help="Don't create {} files".format(BACKUP_FILE_TAG))
  parser.add_argument('--output-directory',
                      help='The path to the build output directory, such ' +
                           'as out/Debug. Only needed for Android.')
  options = parser.parse_args()

  trace_file_path = options.file
  def _OpenTraceFile(mode):
    """Opens the trace file for |mode| ('r' or 'w'), gzipped or plain."""
    if trace_file_path.endswith('.gz'):
      if sys.version_info[0] >= 3:
        # On Python 3 json.dump() produces text, which a binary gzip stream
        # rejects, so let gzip do the encoding.
        return gzip.open(trace_file_path, mode + 't', encoding='utf-8')
      return gzip.open(trace_file_path, mode + 'b')
    else:
      return open(trace_file_path, mode + 't')

  # Check for the tool first so that a missing addr2line is reported before
  # the trace is loaded.
  addr2line_path = FindInSystemPath('addr2line')
  if addr2line_path is None:
    sys.exit("Can't symbolize - no addr2line in PATH.")

  # print() with a single argument behaves the same on Python 2 and 3.
  print('Reading trace file...')
  with _OpenTraceFile('r') as trace_file:
    trace = json.load(trace_file)

  processes = CollectProcesses(trace)
  symfiles = ResolveSymbolizableFiles(processes)

  # Android trace files don't have any indication they are from Android.
  # So we're checking for Android-specific paths.
  if HaveFilesFromAndroid(symfiles):
    if not options.output_directory:
      parser.error('The trace file appears to be from Android. Please '
                   "specify output directory (e.g. 'out/Debug') to properly "
                   'symbolize it.')
    RemapAndroidFiles(symfiles, os.path.abspath(options.output_directory))

  if SymbolizeFiles(symfiles, addr2line_path):
    if options.backup:
      backup_file_path = trace_file_path + BACKUP_FILE_TAG
      print('Backing up trace file to {}...'.format(backup_file_path))
      os.rename(trace_file_path, backup_file_path)

    # The stack frame objects write through to |trace|, so dumping it saves
    # the symbolized names.
    print('Updating trace file...')
    with _OpenTraceFile('w') as trace_file:
      json.dump(trace, trace_file)
  else:
    print('No PCs symbolized - not updating trace file.')


# Run the tool only when executed as a script, not when imported as a module.
if __name__ == '__main__':
  main()