// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// profiler.h: a simple sampling profiler that's always just one #include away!
//
// Overview
// ========
//
// This profiler only samples a pseudo-stack, not the actual call stack.
// The code to be profiled needs to be instrumented with
// pseudo-stack "labels", see ScopedProfilingLabel.
// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
// portable, and independent of compilation details such as function inlining
// and frame pointers. The granularity of instrumentation can be freely chosen,
// and it is possible to get some annotate-like detail, i.e. detail within one
// function without splitting it into multiple functions.
//
// This profiler should remain small and simple; its key feature is to fit in
// a single header file so that there should never be a reason to refrain
// from profiling. More complex and feature-rich alternatives are
// readily available. This one offers a strict superset of its
// functionality: https://github.com/bgirard/GeckoProfiler, including
// intertwining pseudostacks with real call stacks, more annotation options,
// and advanced visualization.
//
// Usage
// =====
//
// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
//    not enabled, profiling instrumentation from instrumentation.h
//    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
//    is still defined but does nothing. On the other hand,
//    when profiling is not enabled, it is an error to #include the
//    present file.
//
// 1. Each thread can opt in to profiling by calling
//    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
//    This can be done at any time, before or during profiling.
//    No sample will be collected from a thread until
//    it has called RegisterCurrentThreadForProfiling().
//
// 2. Instrument your code to be profiled with ScopedProfilingLabel,
//    which is a RAII helper defined in instrumentation.h. The identifier
//    names (some_label, etc) do not matter; what will show up
//    in the profile is the string passed to the constructor, which
//    must be a literal string. See the full example below.
//
//    Note: the overhead of ScopedProfilingLabel is zero when not
//    enabling profiling (when not defining GEMMLOWP_PROFILING).
//
// 3. Use the profiler.h interface to control profiling. There are two
//    functions: StartProfiling() and FinishProfiling(). They must be
//    called on the same thread. FinishProfiling() prints the profile
//    on stdout.
//
// Full example
// ============
/*
    #define GEMMLOWP_PROFILING
    #include "profiling/instrumentation.h"
    using namespace gemmlowp;

    const int iters = 100000000;
    volatile int i;

    void Bar() {
      ScopedProfilingLabel label("Bar");
      for (i = 0; i < iters; i++) {}
    }

    void Foo() {
      ScopedProfilingLabel label("Foo");
      for (i = 0; i < iters; i++) {}
      Bar();
    }

    void Init() {
      RegisterCurrentThreadForProfiling();
    }

    #include "profiling/profiler.h"

    int main() {
      Init();
      StartProfiling();
      Foo();
      FinishProfiling();
    }
*
* Output:
*
    gemmlowp profile (1 threads, 304 samples)
    100.00% Foo
        51.32% other
        48.68% Bar
    0.00% other (outside of any label)
*/
//
// Interpreting results
// ====================
//
//  Each node shows the absolute percentage, among all the samples,
//  of the number of samples that recorded the given pseudo-stack.
//  The percentages are *NOT* relative to the parent node. In addition
//  to your own labels, you will also see 'other' nodes that collect
//  the remainder of samples under the parent node that didn't fall into
//  any of the labelled child nodes. Example:
//
//  20% Foo
//      12% Bar
//      6% Xyz
//      2% other
//
//  This means that 20% of all labels were under Foo, of which 12%/20%==60%
//  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
//  under either Bar or Xyz.
//
//  Typically, one wants to keep adding ScopedProfilingLabel's until
//  the 'other' nodes show low percentages.
//
// Interpreting results with multiple threads
// ==========================================
//
// At each sample, each thread registered for profiling gets sampled once.
// So if there is one "main thread" spending its time in MainFunc() and
// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
// samples will be in WorkerFunc, so the profile will look like this:
//
// 80% WorkerFunc
// 20% MainFunc

#ifndef GEMMLOWP_PROFILING_PROFILER_H_
#define GEMMLOWP_PROFILING_PROFILER_H_

#ifndef GEMMLOWP_PROFILING
#error Profiling is not enabled!
#endif

#include <vector>

#include "instrumentation.h"

namespace gemmlowp {

// A tree view of a profile.
class ProfileTreeView {
  struct Node {
    std::vector<Node*> children;
    const char* label;
    std::size_t weight;
    Node() : label(nullptr), weight(0) {}
    ~Node() {
      for (auto child : children) {
        delete child;
      }
    }
  };

  static bool CompareNodes(Node* n1, Node* n2) {
    return n1->weight > n2->weight;
  }

  Node root_;

  void PrintNode(const Node* node, int level) const {
    if (level) {
      for (int i = 1; i < level; i++) {
        printf("    ");
      }
      printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
    }
    for (auto child : node->children) {
      PrintNode(child, level + 1);
    }
  }

  static void AddStackToNode(const ProfilingStack& stack, Node* node,
                             std::size_t level) {
    node->weight++;
    if (stack.size == level) {
      return;
    }
    Node* child_to_add_to = nullptr;
    for (auto child : node->children) {
      if (child->label == stack.labels[level]) {
        child_to_add_to = child;
        break;
      }
    }
    if (!child_to_add_to) {
      child_to_add_to = new Node;
      child_to_add_to->label = stack.labels[level];
      node->children.push_back(child_to_add_to);
    }
    AddStackToNode(stack, child_to_add_to, level + 1);
    return;
  }

  void AddStack(const ProfilingStack& stack) {
    AddStackToNode(stack, &root_, 0);
  }

  void AddOtherChildrenToNode(Node* node) {
    std::size_t top_level_children_weight = 0;
    for (auto c : node->children) {
      AddOtherChildrenToNode(c);
      top_level_children_weight += c->weight;
    }
    if (top_level_children_weight) {
      Node* other_child = new Node;
      other_child->label =
          node == &root_ ? "other (outside of any label)" : "other";
      other_child->weight = node->weight - top_level_children_weight;
      node->children.push_back(other_child);
    }
  }

  void AddOtherNodes() { AddOtherChildrenToNode(&root_); }

  void SortNode(Node* node) {
    std::sort(node->children.begin(), node->children.end(), CompareNodes);
    for (auto child : node->children) {
      SortNode(child);
    }
  }

  void Sort() { SortNode(&root_); }

 public:
  explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
    for (auto stack : stacks) {
      AddStack(stack);
    }
    AddOtherNodes();
    Sort();
  }

  void Print() const {
    printf("\n");
    printf("gemmlowp profile (%d threads, %d samples)\n",
           static_cast<int>(ThreadsUnderProfiling().size()),
           static_cast<int>(root_.weight));
    PrintNode(&root_, 0);
    printf("\n");
  }
};

// This function is the only place that determines our sampling frequency.
inline void WaitOneProfilerTick() {
  static const int millisecond = 1000000;

#if defined __arm__ || defined __aarch64__
  // Reduced sampling frequency on mobile devices helps limit time and memory
  // overhead there.
  static const int interval = 10 * millisecond;
#else
  static const int interval = 1 * millisecond;
#endif

  timespec ts;
  ts.tv_sec = 0;
  ts.tv_nsec = interval;
  nanosleep(&ts, nullptr);
}

// This is how we track whether we've already started profiling,
// to guard against misuse of the API.
inline bool& IsProfiling() {
  static bool b;
  return b;
}

// This is how we tell the profiler thread to finish.
inline bool& ProfilerThreadShouldFinish() {
  static bool b;
  return b;
}

// The profiler thread. See ProfilerThreadFunc.
inline pthread_t& ProfilerThread() {
  static pthread_t t;
  return t;
}

// Records a stack from a running thread.
// The tricky part is that we're not interrupting the thread.
// This is OK because we're looking at a pseudo-stack of labels,
// not at the real thread stack, and if the pseudo-stack changes
// while we're recording it, we are OK with getting either the
// old or the new stack. Note that ProfilingStack::Pop
// only decrements the size, and doesn't null the popped label,
// so if we're concurrently recording it, it shouldn't change
// under our feet until another label is pushed, at which point
// we are OK with getting either this new label or the old one.
// In the end, the key atomicity property that we are relying on
// here is that pointers are changed atomically, and the labels
// are pointers (to literal strings).
inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) {
  ScopedLock sl(thread->stack.lock);
  assert(!dst->size);
  while (dst->size < thread->stack.size) {
    dst->labels[dst->size] = thread->stack.labels[dst->size];
    dst->size++;
  }
}

// The profiler thread's entry point.
// Note that a separate thread is to be started each time we call
// StartProfiling(), and finishes when we call FinishProfiling().
// So here we only need to handle the recording and reporting of
// a single profile.
inline void* ProfilerThreadFunc(void*) {
  assert(ProfilerThread() == pthread_self());

  // Since we only handle one profile per profiler thread, the
  // profile data (the array of recorded stacks) can be a local variable here.
  std::vector<ProfilingStack> stacks;

  while (!ProfilerThreadShouldFinish()) {
    WaitOneProfilerTick();
    {
      ScopedLock sl(GlobalMutexes::Profiler());
      for (auto t : ThreadsUnderProfiling()) {
        ProfilingStack s;
        RecordStack(t, &s);
        stacks.push_back(s);
      }
    }
  }

  // Profiling is finished and we now report the results.
  ProfileTreeView(stacks).Print();

  return nullptr;
}

// Starts recording samples.
inline void StartProfiling() {
  ScopedLock sl(GlobalMutexes::Profiler());
  ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
  IsProfiling() = true;
  ProfilerThreadShouldFinish() = false;
  pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
}

// Stops recording samples, and prints a profile tree-view on stdout.
inline void FinishProfiling() {
  {
    ScopedLock sl(GlobalMutexes::Profiler());
    ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
    // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
    // as the scary comments below should make clear.
    // Should we use a condition variable?
    ProfilerThreadShouldFinish() = true;
  }  // must release the lock here to avoid deadlock with profiler thread.
  pthread_join(ProfilerThread(), nullptr);
  IsProfiling() = false;  // yikes, this should be guarded by the lock!
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_PROFILER_H_