// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// instrumentation.h: contains the definitions needed to
// instrument code for profiling:
//   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
//
// profiler.h is only needed to drive the profiler:
//   StartProfiling, FinishProfiling.
//
// See the usage example in profiler.h.

#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_

#include <pthread.h>
#include <cstdio>

#ifndef GEMMLOWP_USE_STLPORT
#include <cstdint>
#else
#include <stdint.h>
namespace std {
using ::uint8_t;
using ::uint16_t;
using ::uint32_t;
using ::int8_t;
using ::int16_t;
using ::int32_t;
using ::size_t;
using ::uintptr_t;
}
#endif

#include <algorithm>
#include <cassert>
#include <cstdlib>

#ifdef GEMMLOWP_PROFILING
#include <cstring>
#include <set>
#endif

// We should always use C++11 thread_local; unfortunately that
// isn't fully supported on Apple yet.
#ifdef __APPLE__
#define GEMMLOWP_THREAD_LOCAL static __thread
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif

namespace gemmlowp {

// Assertion that stays active in release (NDEBUG) builds.
// If `condition` is false, prints "gemmlowp error: <msg>" to stderr and
// calls abort(); otherwise does nothing.
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (!condition) {
    fprintf(stderr, "gemmlowp error: %s\n", msg);
    abort();
  }
}

// To be used as template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads, all need to lock it.
struct ProfilerLockId;

// A very plain global lock. Templated in LockId so we can have multiple
// locks, one for each LockId type.
template <typename LockId>
class GlobalLock {
  // One statically-initialized mutex per LockId instantiation.
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return &m;
  }

 public:
  static void Lock() { pthread_mutex_lock(Mutex()); }
  static void Unlock() { pthread_mutex_unlock(Mutex()); }
};

// A very simple RAII helper to lock and unlock a GlobalLock.
// Locks GlobalLock<LockId> on construction, unlocks it on destruction.
template <typename LockId>
struct AutoGlobalLock {
  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }

  // Not copyable: a copy's destructor would unlock the mutex a second time.
  AutoGlobalLock(const AutoGlobalLock&) = delete;
  AutoGlobalLock& operator=(const AutoGlobalLock&) = delete;
};

// MemoryBarrier is purely a compile-time thing; it tells two things
// to the compiler:
//   1) It prevents reordering code across it
//      (thanks to the 'volatile' after 'asm')
//   2) It requires the compiler to assume that any value previously
//      read from memory, may have changed. Thus it offers an alternative
//      to using 'volatile' variables.
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }

// Profiling definitions. Two paths: when profiling is enabled,
// and when profiling is disabled.
#ifdef GEMMLOWP_PROFILING
// This code path is when profiling is enabled.

// A pseudo-call-stack. Contrary to a real call-stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
struct ProfilingStack {
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  LabelsArrayType labels;
  std::size_t size;

  // Starts empty, with every label slot null. Value-initialization is used
  // rather than memset(this, ...), which is not well-defined on a class
  // with user-provided special members.
  ProfilingStack() : labels(), size(0) {}

  // Member-wise copies; these copy all kMaxSize slots plus `size`, exactly
  // what the former whole-object memcpy did.
  ProfilingStack(const ProfilingStack&) = default;
  ProfilingStack& operator=(const ProfilingStack&) = default;

  // Appends `label` on top of the stack. Aborts (also in release builds)
  // if the stack already holds kMaxSize labels. The label is stored before
  // `size` is incremented, with compiler barriers in between, so that the
  // slot is written before it is counted as part of the stack.
  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  // Removes the top label. Aborts (also in release builds) if the stack is
  // empty. The popped slot keeps its old pointer value; only `size` changes.
  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  // Replaces the top label with `new_label`. Uses ReleaseBuildAssertion,
  // like Push and Pop, so that an empty stack aborts instead of writing
  // to labels[size - 1] with size == 0 when NDEBUG disables assert().
  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack UpdateTop on empty stack");
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  // Two stacks are equal when `size` and ALL kMaxSize label slots hold the
  // same values. Slots at or above `size` are compared too (they may hold
  // stale pointers left by Pop), matching the former whole-object memcmp.
  bool operator==(const ProfilingStack& other) const {
    return size == other.size &&
           std::equal(labels, labels + kMaxSize, other.labels);
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");

struct ThreadInfo;

// The global set of threads being profiled.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> v;
  return v;
}

// Per-thread profiling state: the thread's ProfilingStack, plus a pthread
// key whose destructor callback unregisters the thread when it exits.
struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  // Creates the key with ThreadExitCallback as its destructor and stores
  // `this` as the calling thread's value for that key.
  ThreadInfo() {
    pthread_key_create(&key, ThreadExitCallback);
    pthread_setspecific(key, this);
  }

  // Takes the profiler global lock, removes this ThreadInfo from
  // ThreadsUnderProfiling() and deletes the key.
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};

// Returns the calling thread's ThreadInfo, creating it on first use.
inline ThreadInfo& ThreadLocalThreadInfo() {
#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
  // We're leaking this ThreadInfo structure, because Apple doesn't support
  // non-trivial constructors or destructors for their __thread type modifier.
  GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
  if (i == nullptr) {
    i = new ThreadInfo();
  }
  return *i;
#else
  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
  return i;
#endif
}

// ScopedProfilingLabel is how one instruments code for profiling
// with this profiler. Construct local ScopedProfilingLabel variables,
// passing a literal string describing the local code. Profile
// samples will then be annotated with this label, while it is in scope
// (whence the name --- also known as RAII).
// See the example in profiler.h.
class ScopedProfilingLabel {
  ProfilingStack* profiling_stack_;

 public:
  // Pushes `label` onto the calling thread's ProfilingStack.
  explicit ScopedProfilingLabel(const char* label)
      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
    profiling_stack_->Push(label);
  }

  // Pops the label pushed by the constructor.
  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }

  // Not copyable: a copy's destructor would pop the stack a second time.
  ScopedProfilingLabel(const ScopedProfilingLabel&) = delete;
  ScopedProfilingLabel& operator=(const ScopedProfilingLabel&) = delete;

  // Replaces the label currently on top of the stack with `new_label`.
  void Update(const char* new_label) {
    profiling_stack_->UpdateTop(new_label);
  }
};

// To be called once on each thread to be profiled.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}

#else  // not GEMMLOWP_PROFILING
// This code path is when profiling is disabled.

// This empty definition of ScopedProfilingLabel ensures that
// it has zero runtime overhead when profiling is disabled.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char*) {}
  void Update(const char*) {}
};

inline void RegisterCurrentThreadForProfiling() {}

#endif

}  // end namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_