* Copyright (C) 2017 The Android Open Source Project
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
#undef NDEBUG
#include "Bridge.h"
#include "CompilationBuilder.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
#include "NeuralNetworksWrapper.h"
#include "SampleDriver.h"
#include "Utils.h"
#include "ValidateHal.h"
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <random>
#include <set>
#include <tuple>
#include <utility>
#include <vector>
#include <unistd.h>
#include <android-base/logging.h>
#include <android/sharedmem.h>
#include <gtest/gtest.h>
// Uncomment the following line to generate some debugging output that
// may be useful when analyzing failures:
// #define VERBOSE VERBOSE
// Uncomment the following line to generate graphs from models:
// #define GRAPH GRAPH
// We randomly generate tests (model + input data) at runtime, and verify
// that we get the same results whether we do partitioned compilation/execution
// or non partitioned compilation/execution. We perform a test as follows:
// (1) Randomly generate a model (graph and weights), randomly generate input
// data, randomly assign inputs and outputs to CPU memory or to shared
// memory.
// Randomly leave dimensions unset for intermediate operands.
// (2) Randomly generate drivers based on the sample driver, each of which
// executes models on the CPU. They differ according to which operations
// they support.
// (3) Compile and execute without partitioning, saving off the results.
// (4) Compile and execute with partitioning.
// (5) Verify that the saved results from (3) match the results from (4).
// For simplicity, all data (model inputs, model outputs, weights,
// temps) are of the same type: a 2-D TENSOR_FLOAT32 where the two
// dimensions are fixed throughout a particular test case (and
// randomly determined). This prevents us from having to find a
// mechanism to "resize" data (e.g., if ADD#a operates on data of size
// 2x2, ADD#b operates on data of size 3x3, and the outputs of ADD#a
// and ADD#b become inputs of ADD#c, do we need to insert one or more
// operations between (say) ADD#a and ADD#c to convert ADD#a's data
// from size 2x2 to size 3x3 in order to match ADD#b). In the few
// cases where an operand cannot be of this type, it is a constant
// (e.g., activation functions and RNN bias).
// Each operation we generate has a signature (described in more
// detail later). The randomly generated drivers decide which
// operations they can execute by checking operation signatures. Once
// we have built the model and know the set of signatures, we randomly
// assign each signature to a driver. No signature is supported by
// multiple drivers -- we're not testing the logic that the
// partitioning algorithm uses to select the best driver for an
// operation.
namespace android {
// Local shorthand for the types this test uses. Names prefixed with
// "Wrapper" refer to nn::wrapper:: types; the unprefixed names refer to types
// in nn::, nn::sample_driver:: and the V1_1 HAL namespace.
using CompilationBuilder = nn::CompilationBuilder;
using Device = nn::Device;
using DeviceManager = nn::DeviceManager;
using ExecutionPlan = nn::ExecutionPlan;
using HidlModel = hardware::neuralnetworks::V1_1::Model;
// Note: despite the "Builder" suffix, this aliases nn::Memory.
using MemoryBuilder = nn::Memory;
using ModelBuilder = nn::ModelBuilder;
using Result = nn::wrapper::Result;
using SampleDriver = nn::sample_driver::SampleDriver;
using WrapperCompilation = nn::wrapper::Compilation;
using WrapperExecution = nn::wrapper::Execution;
using WrapperMemory = nn::wrapper::Memory;
using WrapperModel = nn::wrapper::Model;
using WrapperOperandType = nn::wrapper::OperandType;
using WrapperType = nn::wrapper::Type;
namespace {
/// Configure test size //////////////////////////////////////////////////////////
// Nominal cap on the number of generated operations. The generator may go
// past it when extra operations are needed to connect otherwise disjoint
// subgraphs.
constexpr unsigned kMaxNumOperations{100};
// Largest edge length of the 2-D square tensors the models process. The API
// promotes by-value weights larger than 128 to by-reference, so this bound is
// chosen to allow both tensor types that exceed that size and tensor types
// that do not.
constexpr unsigned kMaxProblemSize{8};
// Seed used for the first pseudorandomly generated test case.
constexpr unsigned kFirstSeed{0};
// How many test cases to generate.
constexpr unsigned kNumTestCases{225};
// Selects where graph weights live:
//   true  - a single pool for all weights (what we recommend to users);
//   false - weights may be spread over several pools, which puts more stress
//           on the partitioning algorithm and the rest of the runtime.
// The single-pool setting may be required to keep large graphs from running
// up against http://b/70302693 "NNAPI overuses (?) fds".
constexpr bool kAllWeightsInOnePool{false};
// An operation's signature: .first is the operation type (e.g., ADD) and
// .second is the activation function, or -1 for operation types to which an
// activation function does not apply.
using Signature = std::pair<ANeuralNetworksOperationType, int>;
// This class adds some simple utilities on top of
// ::android::nn::wrapper::Model. For example, it provides access to
// certain features from ModelBuilder that are not exposed by the base
// class (such as inputCount() and operation index).
class TestModel : public WrapperModel {
// Adds an operation through WrapperModel::addOperation() and returns the
// index assigned to it, i.e. the value of operationCount() sampled before
// the call.
// NOTE(review): no statement visible here appends `outputs` to mOperations;
// confirm that bookkeeping exists, since operationCount() and
// getOperationOutputs() both read mOperations.
uint32_t addOperation(ANeuralNetworksOperationType type, const std::vector<uint32_t>& inputs,
const std::vector<uint32_t>& outputs) {
const uint32_t operationIndex = operationCount();
WrapperModel::addOperation(type, inputs, outputs);
return operationIndex;
// Number of operations recorded in mOperations.
uint32_t operationCount() const {
return mOperations.size();
// Number of model inputs, as reported by the underlying ModelBuilder.
uint32_t inputCount() const {
return builder()->inputCount();
// Number of model outputs, as reported by the underlying ModelBuilder.
uint32_t outputCount() const {
return builder()->outputCount();
// Returns the output operand indexes recorded for operation `index`.
// Asserts that `index` is a valid operation number.
const std::vector<uint32_t>& getOperationOutputs(uint32_t index) const {
assert(index < mOperations.size());
return mOperations[index];
// All values are immediately copied into the model (we need to do
// this ourselves in cases where the underlying NNAPI does not).
// `length` is the size of `value` in bytes. One path hands value.data()
// straight to the wrapper; the other hands it the data of the last entry of
// mOperandValues.
// NOTE(review): the condition choosing between the two paths, and the step
// that stores a copy of `value` in mOperandValues, are not visible here --
// confirm before relying on this description.
void setOperandValue(uint32_t index, const std::vector<float>& value) {
const size_t length = value.size() * sizeof(float);
WrapperModel::setOperandValue(index, value.data(), length);
} else {
WrapperModel::setOperandValue(index, mOperandValues.back().data(), length);
// Sets a scalar int32 operand value (sizeof(value) bytes read from the
// by-value parameter).
void setOperandValue(uint32_t index, int32_t value) {
WrapperModel::setOperandValue(index, &value, sizeof(value));
// Reinterprets the wrapper's handle as a pointer to the runtime ModelBuilder.
const ModelBuilder* builder() const {
return reinterpret_cast<const ModelBuilder*>(getHandle());
// Representation of operations: vector index is operation number,
// vector value is operation's output operands.
std::vector<std::vector<uint32_t>> mOperations;
// Large operand values -- not immediately copied into the
// WrapperModel, so remembered here instead.
std::vector<std::vector<float>> mOperandValues;
// This class adds some simple utilities on top of
// ::android::nn::wrapper::Compilation in order to provide access to
// certain features from CompilationBuilder that are not exposed by
// the base class.
class TestCompilation : public WrapperCompilation {
// Creates a compilation for `model` by forwarding to the base class.
TestCompilation(const WrapperModel* model) : WrapperCompilation(model) {}
// Forwards `partitioning` to CompilationBuilder::setPartitioning() and
// converts the returned status to a wrapper Result.
Result setPartitioning(uint32_t partitioning) {
return static_cast<Result>(builder()->setPartitioning(partitioning));
// Keep the base-class finish() overload(s) visible alongside the overload
// declared below.
using WrapperCompilation::finish;
// Finishes the compilation against an explicit list of devices by calling
// CompilationBuilder::finish(devices); the status is converted to a Result.
Result finish(const std::vector<std::shared_ptr<Device>>& devices) {
return static_cast<Result>(builder()->finish(devices));
// Exposes the builder's execution plan through its test-only accessor.
const ExecutionPlan& getExecutionPlan() const {
return builder()->forTest_getExecutionPlan();
// Reinterprets the wrapper's handle as the runtime CompilationBuilder
// (const and non-const flavors).
const CompilationBuilder* builder() const {
return reinterpret_cast<const CompilationBuilder*>(getHandle());
CompilationBuilder* builder() {
return reinterpret_cast<CompilationBuilder*>(getHandle());
// This class is used to manage a collection of memory regions,
// disjoint windows onto a set of Memory instances, each of which is
// associated with a single shared memory region. Each region and
// Memory instance is assigned a number. The usage pattern is as
// follows:
// - Call addMemory() and addRegion() as many times as needed to
// declare (but not define) Memory instances and declare region
// instances.
// - Call layout() to define the Memory instances.
// - Call getRegion() as many times as needed to get the details
// of memory regions (such as address, or Memory/offset/length).
// The Memory instances created by layout() are owned by the
// TestMemories instance, and are destroyed when the TestMemories
// instance is destroyed.
class TestMemories {
TestMemories() = default;
// Copying is disabled.
TestMemories(const TestMemories&) = delete;
TestMemories& operator=(const TestMemories&) = delete;
// Declares a Memory and returns its index, computed as memoryCount() - 1.
// NOTE(review): the statement that appends the new entry to mMemorySizes is
// not visible here -- confirm.
unsigned addMemory() {
return memoryCount() - 1;
// Number of Memory instances declared so far (one entry per memory in
// mMemorySizes).
unsigned memoryCount() const {
return mMemorySizes.size();
// Declares a region of `length` bytes inside memory `memoryIndex`. The
// region's offset is the memory's aggregate size before the call, and the
// memory's size is then grown by `length`, so regions within one memory are
// laid out back to back in declaration order. Returns regionCount() - 1.
// NOTE(review): `desc` (memory index, offset, length) is built but the
// statement that stores it in mRegions is not visible here -- confirm.
unsigned addRegion(unsigned memoryIndex, uint32_t length) {
assert(memoryIndex < memoryCount());
uint32_t& memorySize = mMemorySizes[memoryIndex];
auto desc = std::make_tuple(memoryIndex, (uint32_t)memorySize, length);
memorySize += length;
return regionCount() - 1;
// Number of regions declared so far.
unsigned regionCount() const {
return mRegions.size();
// Defines the Memory instances (implemented out of line).
void layout();
// Returns the address of region `regionIndex`: the owning memory's buffer
// pointer (obtained via getPointer()) plus the region's offset. Each of
// pMemory / pOffset / pLength is optional; when non-null it receives the
// owning WrapperMemory, the offset within it, and the region length.
// Asserts that `regionIndex` is valid.
// NOTE(review): the handling of a failing getPointer() call is truncated in
// this excerpt -- confirm what happens on error.
void* getRegion(unsigned regionIndex,
const WrapperMemory** pMemory, uint32_t* pOffset, uint32_t* pLength) {
assert(regionIndex < regionCount());
const auto& regionDescriptor = mRegions[regionIndex];
const WrapperMemory* memory = &mMemorys[std::get<0>(regionDescriptor)];
uint32_t offset = std::get<1>(regionDescriptor);
uint32_t length = std::get<2>(regionDescriptor);
uint8_t* buffer;
if (reinterpret_cast<MemoryBuilder*>(memory->get())->getPointer(&buffer) !=
if (pMemory) *pMemory = memory;
if (pOffset) *pOffset = offset;
if (pLength) *pLength = length;
return buffer + offset;
// Convenience overload for callers that only need the region's address.
void* getRegion(unsigned regionIndex) {
return getRegion(regionIndex, nullptr, nullptr, nullptr);
// Index is the memory index; value is the size of the memory
// (aggregate size of all regions in the memory).
std::vector<uint32_t> mMemorySizes;
// Index is the memory index.
std::vector<WrapperMemory> mMemorys;
// File descriptors; iterated by the destructor.
std::vector<int> mFDs;
// Index is the region index; tuple represents memory index,
// region offset within memory, region length.
std::vector<std::tuple<unsigned, uint32_t, uint32_t>> mRegions;
// For sanity checking.
bool mLayoutDone = false;
// Defines the declared Memory instances: for every size in mMemorySizes,
// creates an unnamed shared memory region of that size, asserts that a valid
// fd was returned, and constructs a WrapperMemory in mMemorys from
// (size, PROT_READ | PROT_WRITE, fd, offset 0). Sets mLayoutDone when done.
// NOTE(review): nothing visible here records `fd` in mFDs, which the
// destructor iterates -- confirm.
void TestMemories::layout() {
for (uint32_t memorySize : mMemorySizes) {
const int fd = ASharedMemory_create(nullptr, memorySize);
assert(fd >= 0);
mMemorys.emplace_back(memorySize, PROT_READ | PROT_WRITE, fd, 0);
mLayoutDone = true;
// Walks every file descriptor held in mFDs.
// NOTE(review): the loop body is not visible in this excerpt; presumably each
// fd is closed here -- confirm.
TestMemories::~TestMemories() {
for (int fd : mFDs) {
// Value-parameterized test fixture. The test parameter is the seed for the
// pseudorandom engine, so each parameter value yields one reproducible
// randomly generated test case.
class RandomPartitioningTest : public ::testing::TestWithParam<unsigned> {
// Seeds the engine with GetParam() and sets up a uniform distribution over
// [0.0, 1.0).
RandomPartitioningTest() : mRandNumEng(GetParam() /* seed */), mRandNumUnitDist(0.0, 1.0) {}
// Computes the signature (operation type, activation function) of
// `operation` within `model`.
static Signature getSignature(const HidlModel& model, const Operation& operation);
// Dumps a graph of `model`; only does so when GRAPH is defined.
void graphDump(const WrapperModel& model);
// Random boolean, derived from randUInt(2).
bool randBool() {
return randUInt(2) == 1;
double randFrac() { // [0.0, 1.0)
return mRandNumUnitDist(mRandNumEng);
// Scales randFrac() by `limit` and truncates to unsigned.
unsigned randUInt(unsigned limit) { // [0, limit)
return unsigned(randFrac() * limit);
// Represents an operation in which every input and output operand
// is a TENSOR_FLOAT32 of dimensions [problemSize, problemSize] except:
// - One input operand may be an activation function.
// - Any number of input operands may be "special" in some other way
// (and in this implementation, not produced by any other operation).
// We require that:
// - There be at least one input operand that is neither an
// activation function nor "special".
struct OperationPattern {
int mOperationType;
unsigned mNumInputs;
unsigned mNumOutputs;
int mActivationFunctionInputIndex; // <0 if none
// Returns operand index, or <0 if input is normal (must not
// be called for an activation function operand). Function
// should have the following prototype:
// int makeSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex);
int (RandomPartitioningTest::*mMakeSpecialInput)(unsigned, TestModel*, unsigned);
// Table of the operation patterns the generator can pick from (defined out
// of line).
static const OperationPattern kOperationPatterns[];
// mMakeSpecialInput implementation for RNN. Only input 3 is special: for any
// other inputIndex this returns -1 ("normal input"). For input 3 it adds a
// 1-D TENSOR_FLOAT32 operand of length problemSize to `model`, gives it a
// constant value filled from randFrac(), and returns the new operand's index.
int makeRnnSpecialInput(unsigned problemSize, TestModel* model, unsigned inputIndex) {
if (inputIndex != 3) {
return -1;
// input operand 3 is bias, a 1-D tensor
const WrapperOperandType biasType(WrapperType::TENSOR_FLOAT32, { problemSize });
const uint32_t operandIndex = model->addOperand(&biasType);
std::vector<float> biasValue(problemSize);
std::generate(biasValue.begin(), biasValue.end(),
[this]{ return randFrac(); });
model->setOperandValue(operandIndex, biasValue);
return int(operandIndex);
#ifdef VERBOSE
// Debug-output helper (compiled only when VERBOSE is defined): streams the
// operation, operand, input and output counts of a model, with the input and
// output counts also shown as a fraction of the operand count.
class ModelStats {
// Wraps a runtime ModelBuilder directly.
ModelStats(const ModelBuilder* model) :
mBuilder(model) { }
// Wraps a WrapperModel by reinterpreting its handle as a ModelBuilder.
ModelStats(const WrapperModel* model) :
mBuilder(reinterpret_cast<const ModelBuilder*>(model->getHandle())) { }
friend std::ostream& operator<<(std::ostream& out, const ModelStats& stats) {
const uint32_t operandCount = stats.mBuilder->operandCount();
const uint32_t inputCount = stats.mBuilder->inputCount();
const uint32_t outputCount = stats.mBuilder->outputCount();
out << "operationCount = " << stats.mBuilder->operationCount()
<< ", operandCount = " << operandCount
<< ", inputCount = " << inputCount
<< " (" << (double(inputCount) / operandCount) << ")"
<< ", outputCount = " << outputCount
<< " (" << (double(outputCount) / operandCount) << ")";
return out;
// Model whose statistics are reported.
const ModelBuilder* mBuilder;
// Pseudorandom engine, seeded from the test parameter.
std::mt19937 mRandNumEng;
// Distribution sampled by randFrac().
std::uniform_real_distribution<double> mRandNumUnitDist;
// Operation patterns available to the generator. Each entry lists, in
// OperationPattern member order: operation type, number of inputs, number of
// outputs, index of the activation function input (<0 if none), and the
// member function that creates "special" inputs (nullptr if none).
const RandomPartitioningTest::OperationPattern RandomPartitioningTest::kOperationPatterns[] = {
{ ANEURALNETWORKS_ADD, 3, 1, 2, nullptr },
{ ANEURALNETWORKS_LOGISTIC, 1, 1, -1, nullptr },
{ ANEURALNETWORKS_MUL, 3, 1, 2, nullptr },
{ ANEURALNETWORKS_RNN, 6, 2, 5, &RandomPartitioningTest::makeRnnSpecialInput },
{ ANEURALNETWORKS_TANH, 1, 1, -1, nullptr },
// Returns the signature of `operation` within `model`: the operation type
// paired with its activation function value, or with -1 when the operation's
// pattern has no activation function input.
// NOTE(review): the initializer of `operationType` and the statement that
// fills `value` from the operand are not visible in this excerpt -- confirm.
Signature RandomPartitioningTest::getSignature(const HidlModel& model, const Operation& operation) {
// Built once on first use: maps each operation type in kOperationPatterns to
// the index of its activation function input (<0 if none).
static const std::map<ANeuralNetworksOperationType, int> kOperationToActivation = []() {
std::map<ANeuralNetworksOperationType, int> result;
for (const auto& pattern : kOperationPatterns) {
result[pattern.mOperationType] = pattern.mActivationFunctionInputIndex;
return result;
const ANeuralNetworksOperationType operationType =
// std::map::at() throws if the operation type is not in the table.
const int activationFunctionInputIndex = kOperationToActivation.at(operationType);
if (activationFunctionInputIndex < 0) {
return Signature(operationType, -1);
// The activation function operand is expected to be a by-value INT32
// constant; both properties are asserted.
const Operand& operand = model.operands[operation.inputs[activationFunctionInputIndex]];
assert(operand.lifetime == OperandLifeTime::CONSTANT_COPY);
assert(operand.type == OperandType::INT32);
int32_t value;
return Signature(operationType, value);
// Graph dump hook. The body is compiled only when GRAPH is defined, which is
// why `model` is marked [[maybe_unused]]. The dump is named
// "Test-<test parameter>" and is given the model's handle reinterpreted as a
// ModelBuilder.
// NOTE(review): the call that consumes `name` and the builder is truncated in
// this excerpt -- confirm which function performs the dump.
void RandomPartitioningTest::graphDump([[maybe_unused]] const WrapperModel& model) {
#ifdef GRAPH
const std::string name = "Test-" + std::to_string(GetParam());
reinterpret_cast<const ModelBuilder*>(model.getHandle()));
class TestDriver : public SampleDriver {
// Behaves like SampleDriver, except that it only supports
// operations with the specified signatures.
// `signatures` is taken by value and moved into mSignatures.
TestDriver(const char* name, std::set<Signature> signatures) :
SampleDriver(name), mSignatures(std::move(signatures)) { }
// Reports the same execTime/powerUsage value (0.75) for every performance
// category, with status NONE.
Return<void> getCapabilities_1_1(getCapabilities_1_1_cb _hidl_cb) override {
Capabilities capabilities =
{.float32Performance = {.execTime = 0.75f, .powerUsage = 0.75f},
.quantized8Performance = {.execTime = 0.75f, .powerUsage = 0.75f},
.relaxedFloat32toFloat16Performance = {.execTime = 0.75f, .powerUsage = 0.75f}};
_hidl_cb(ErrorStatus::NONE, capabilities);
return Void();
// For a model that passes nn::validateModel(), invokes `cb` with status NONE
// and one flag per operation; otherwise invokes `cb` with INVALID_ARGUMENT
// and an empty vector.
// NOTE(review): the expression computing each flag is truncated in this
// excerpt; presumably it tests whether the operation's signature is in
// mSignatures -- confirm.
Return<void> getSupportedOperations_1_1(const HidlModel& model,
getSupportedOperations_cb cb) override {
if (nn::validateModel(model)) {
const size_t count = model.operations.size();
std::vector<bool> supported(count);
for (size_t i = 0; i < count; i++) {
supported[i] =
model.operations[i])) != 0);
cb(ErrorStatus::NONE, supported);
} else {
std::vector<bool> supported;
cb(ErrorStatus::INVALID_ARGUMENT, supported);
return Void();
// Prepares the model only if this driver supports every operation in it.
// outStatus starts as INVALID_ARGUMENT and is switched to NONE by the
// getSupportedOperations_1_1() callback only when that query succeeded and
// every flag is true. On success the request is delegated to
// SampleDriver::prepareModel_1_1(); otherwise the callback is notified with
// INVALID_ARGUMENT and a null prepared model, and INVALID_ARGUMENT is
// returned.
Return<ErrorStatus> prepareModel_1_1(const HidlModel& model, ExecutionPreference preference,
const sp<IPreparedModelCallback>& callback) override {
// NOTE: We verify that all operations in the model are supported.
ErrorStatus outStatus = ErrorStatus::INVALID_ARGUMENT;
auto ret = getSupportedOperations_1_1(
[&outStatus](ErrorStatus inStatus, const hidl_vec<bool>& supportedOperations) {
if (inStatus == ErrorStatus::NONE) {
if (std::all_of(supportedOperations.begin(), supportedOperations.end(),
[](bool v){ return v; })) {
outStatus = ErrorStatus::NONE;
if (ret.isOk() && (outStatus == ErrorStatus::NONE)) {
return SampleDriver::prepareModel_1_1(model, preference, callback);
} else {
callback->notify(ErrorStatus::INVALID_ARGUMENT, nullptr);
return ErrorStatus::INVALID_ARGUMENT;
// Signatures of the operations this driver supports.
const std::set<Signature> mSignatures;
// Instantiate the parameterized test once per seed, for seeds
// kFirstSeed, kFirstSeed + 1, ..., kFirstSeed + kNumTestCases - 1
// (::testing::Range excludes its end value).
INSTANTIATE_TEST_CASE_P(Seed, RandomPartitioningTest,
::testing::Range(kFirstSeed, kFirstSeed + kNumTestCases));
TEST_P(RandomPartitioningTest, Test) {
LOG(INFO) << "RandomPartitioningTest: GetParam() = " << GetParam();
#ifdef VERBOSE
std::cout << std::setprecision(2) << std::fixed << std::setw(4);
const unsigned problemSize = 1+randUInt(kMaxProblemSize);
const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, { problemSize, problemSize });
const WrapperOperandType unknownDimensionsType(WrapperType::TENSOR_FLOAT32, { 0, 0 });
static const WrapperOperandType activationFunctionType(WrapperType::INT32, { });
const unsigned numOperations = 2+randUInt(kMaxNumOperations-1);
const bool allowDeadOperations = (randFrac() < 0.2);
const bool allowUnknownDimensions = (randFrac() < 0.25);
// TODO: The current algorithm builds the graph in a forward
// direction (i.e., later-generated operations consume outputs
// from earlier-generated operations). In order to get more
// variation in graph topology, perhaps we should also create an
// algorithm to build the graph in a backward direction (i.e.,
// later-generated operations produce outputs to be consumed by
// earlier-generated operations).
[[maybe_unused]] const bool buildForward = randBool();
// TODO: Add a form of forced connectivity that operates by
// joining disjoint subgraphs rather than by forcing a root.
const bool forceCommonRoot = (randFrac() < 0.75);
TestModel model;
std::vector<uint32_t> modelInputs;
std::vector<uint32_t> modelOutputs;
// Each region in weights is a problem-sized 2-D TENSOR_FLOAT32.
TestMemories weights;
// Keep track of all normal (i.e., not activation function and not
// "special") operands that are values (from setOperandValue*()).
// .first: operand index
// .second: if the operand is already defined (via setOperandValue*()) then ~0U;
// otherwise, the operand has yet to be defined, and this is the corresponding
// region index in "weights"
std::vector<std::pair<uint32_t, unsigned>> valueOperands;
// An operand is "dead" if it is not consumed by another operation
// and is not a model output. Key is operand index; value is
// operation index.
std::map<uint32_t, uint32_t> deadOperands;
// An operation is "dead" if all of its outputs are dead.
std::set<uint32_t> deadOperations;
// Collect the signatures of operations in this model.
std::set<Signature> signatures;
// For reporting purposes, keep track of the number of root
// operations (those that do not consume results produced by other
// operations).
unsigned rootOperationCount = 0;
// Track if we added operands with unknown dimensions. In this case,
// partitioned compilation will fail if such an operand is read in a
// different partition than it is written.
bool hasUnknownDimensions = false;
// Generate operations.
for (unsigned i = 0; i < numOperations; i++) {
const unsigned operationPatternIndex =
const auto& operationPattern = kOperationPatterns[operationPatternIndex];
// INPUTS //////////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> operationInputs(operationPattern.mNumInputs, ~0U);
// First, process activation function and special inputs, and
// keep track of which inputs remain.
std::vector<uint32_t> normalOperationInputIndexes;
int32_t activationFunction = -1;
for (unsigned operationInputIndex = 0; operationInputIndex < operationPattern.mNumInputs;
operationInputIndex++) {
if (int(operationInputIndex) == operationPattern.mActivationFunctionInputIndex) {
const uint32_t operandIndex = model.addOperand(&activationFunctionType);
activationFunction = randUInt(4);
if (activationFunction == ANEURALNETWORKS_FUSED_RELU1) {
// workaround for http://b/69011131
model.setOperandValue(operandIndex, activationFunction);
operationInputs[operationInputIndex] = operandIndex;
if (operationPattern.mMakeSpecialInput != nullptr) {
const int operandIndex = (this->*(operationPattern.mMakeSpecialInput))(
problemSize, &model, operationInputIndex);
if (operandIndex >= 0) {
operationInputs[operationInputIndex] = operandIndex;
signatures.insert(Signature(operationPattern.mOperationType, activationFunction));
// A (normal) operation input can be one of:
// - a new or existing model input
// - an output of an existing operation
// - an OperandValue
// - an OperandValueFromMemory
// Some guidelines:
// - We generally don't want all of an operation's inputs to be values (constants)
const unsigned normalOperationInputCount = normalOperationInputIndexes.size();
// How many of this operation's inputs are constants?
unsigned normalOperationInputConstantCount = 0;
// How many of this operation's inputs are model inputs?
unsigned normalOperationInputModelInputCount = 0;
// We begin by deciding what kind of input each (normal) operation will be; we don't
// actually pick input operand indexes at this time, because we might override this
// decision later.
std::vector<InputKind> normalOperationInputKinds(normalOperationInputCount);
std::generate(normalOperationInputKinds.begin(), normalOperationInputKinds.end(),
[this, &model,
&normalOperationInputModelInputCount]() -> InputKind {
// Constant? Becomes less likely the more
// constants we already have as inputs to
// this operation.
if (randFrac() < 0.3 * (1 - double(normalOperationInputConstantCount) /
normalOperationInputCount)) {
return IK_VALUE;
// Model input? Becomes less likely the
// more model inputs we already have as
// inputs to this operation, and the further
// along we are in generating this model
// (i.e., the more operations we have
// generated).
if ((model.operationCount() == 0) ||
(randFrac() < 0.5 *
(1 - double(normalOperationInputModelInputCount) /
normalOperationInputCount) *
std::min(0.3, (1 - double(model.operationCount()) /
numOperations)))) {
// Else output of an existing operation.
// Now force common root or model input, if necessary. (A
// model must have at least one input.)
auto force =
[this, &normalOperationInputKinds, normalOperationInputCount](InputKind forceKind){
if (std::none_of(normalOperationInputKinds.begin(),
[forceKind](InputKind kind){ return kind == forceKind; })) {
normalOperationInputKinds[randUInt(normalOperationInputCount)] = forceKind;
if (forceCommonRoot && (model.operationCount() != 0)) {
if (modelInputs.empty()) {
assert(model.operationCount() == 0);
// Finally create the normal inputs.
bool isRootOperation = true;
for (unsigned i = 0; i < normalOperationInputCount; i++) {
uint32_t operandIndex = ~0U;
switch (normalOperationInputKinds[i]) {
if (!modelInputs.empty() && (randFrac() < 0.5)) {
operandIndex = modelInputs[randUInt(modelInputs.size())];
} else {
operandIndex = model.addOperand(&problemType);
decltype(deadOperands.begin()) deadOperandI;
if (!deadOperands.empty() && (randFrac() < 0.5)) {
deadOperandI = deadOperands.begin();
std::advance(deadOperandI, randUInt(deadOperands.size()));
operandIndex = deadOperandI->first;
} else {
const uint32_t existingOperationIndex = randUInt(model.operationCount());
const auto& existingOperationOutputs =
operandIndex =
deadOperandI = deadOperands.find(operandIndex);
assert(deadOperandI == deadOperands.end() ||
deadOperandI->second == existingOperationIndex);
if (deadOperandI != deadOperands.end()) {
const uint32_t correspondingOperation = deadOperandI->second;
auto deadOperationI = deadOperations.find(correspondingOperation);
if (deadOperationI != deadOperations.end()) {
isRootOperation = false;
case IK_VALUE: {
if (!valueOperands.empty() && (randFrac() < 0.25)) {
operandIndex = valueOperands[randUInt(valueOperands.size())].first;
} else {
operandIndex = model.addOperand(&problemType);
if (randFrac() < 0.5) {
std::vector<float> value(problemSize * problemSize);
std::generate(value.begin(), value.end(), [this]{ return randFrac(); });
model.setOperandValue(operandIndex, value);
valueOperands.push_back(std::make_pair(operandIndex, ~0U));
} else {
unsigned memoryIndex = ~0U;
if ((weights.memoryCount() != 0) &&
(kAllWeightsInOnePool || (randFrac() < 0.5))) {
memoryIndex = randUInt(weights.memoryCount());
} else {
memoryIndex = weights.addMemory();
const size_t length = problemSize * problemSize * sizeof(float);
const unsigned regionIndex = weights.addRegion(memoryIndex, length);
valueOperands.push_back(std::make_pair(operandIndex, regionIndex));
operationInputs[normalOperationInputIndexes[i]] = operandIndex;
if (isRootOperation) {
// OUTPUTS /////////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
std::generate(operationOutputs.begin(), operationOutputs.end(),
[&model, &problemType, &unknownDimensionsType, &hasUnknownDimensions,
allowUnknownDimensions, this]{
// 3% unknowns causes ~35% of partitionings to fail
// (determined by commenting out the fallback code,
// running tests and noting number of failures).
if (allowUnknownDimensions && randFrac() < 0.03) {
hasUnknownDimensions = true;
return model.addOperand(&unknownDimensionsType);
} else {
return model.addOperand(&problemType);
// OPERATION ///////////////////////////////////////////////////////////////////////////////
const uint32_t operationIndex =
operationInputs, operationOutputs);
std::for_each(operationOutputs.begin(), operationOutputs.end(),
[&deadOperands, operationIndex](uint32_t operandIndex) {
deadOperands.insert(std::make_pair(operandIndex, operationIndex));
// Now finalize the weights.
for (const auto& valueOperand : valueOperands) {
const uint32_t operandIndex = valueOperand.first;
const unsigned regionIndex = valueOperand.second;
if (regionIndex == ~0U) {
const WrapperMemory* memory;
uint32_t offset, length;
float* region =
static_cast<float*>(weights.getRegion(regionIndex, &memory, &offset, &length));
assert(length == problemSize * problemSize * sizeof(float));
std::generate(region, region + problemSize * problemSize, [this]{ return randFrac(); });
model.setOperandValueFromMemory(operandIndex, memory, offset, length);
// Now select model outputs.
for (uint32_t operationIdx = 0, operationCount = model.operationCount();
operationIdx < operationCount; operationIdx++) {
const auto& outputs = model.getOperationOutputs(operationIdx);
for (uint32_t outputIdx = 0, outputCount = outputs.size(); outputIdx < outputCount;
outputIdx++) {
bool modelOutput = false;
const uint32_t operandIndex = outputs[outputIdx];
const auto deadOperandI = deadOperands.find(operandIndex);
if (deadOperandI != deadOperands.end()) {
// This is not consumed within the model, so unless we
// make it an output of the model, it's dead. The
// further along we are in generating this model
// (i.e., the more operations we have generated), the
// more likely we are to classify this operation
// output as a model output.
const double probabilityOfModelOutput =
0.50 * [](double x){ return x*x; }((operationIdx + 1) / operationCount);
modelOutput = (randFrac() < probabilityOfModelOutput);
} else {
// This is consumed within the model, so we'll rarely
// make it an output of the model.
modelOutput = (randFrac() < 0.05);
if (!modelOutput) {
if (deadOperandI != deadOperands.end()) {
const auto deadOperationI = deadOperations.find(operationIdx);
if (deadOperationI != deadOperations.end()) {
if (!allowDeadOperations) {
// For each dead operation, pick a random output to become a model output.
for (uint32_t deadOperationIndex : deadOperations) {
const auto& deadOperationOutputs = model.getOperationOutputs(deadOperationIndex);
const uint32_t deadOperandIndex =
// A model must have at least one output.
if (modelOutputs.empty()) {
const auto& outputs = model.getOperationOutputs(randUInt(model.operationCount()));
model.identifyInputsAndOutputs(modelInputs, modelOutputs);
#ifdef VERBOSE
std::cout << "Original model: " << ModelStats(&model) << std::endl;
std::cout << "rootOperationCount = " << rootOperationCount
<< ", deadOperations = ";
if (allowDeadOperations) {
std::cout << deadOperations.size();
} else {
std::cout << "forbidden (converted " << deadOperations.size() << ")";
std::cout << std::endl;
ASSERT_EQ(model.finish(), Result::NO_ERROR);
// Non-partitioned compilation.
TestCompilation c(&model);
ASSERT_EQ(c.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
ASSERT_EQ(c.finish(), Result::NO_ERROR);
// Create some drivers for partitioned compilation.
std::vector<std::set<Signature>> signaturesForDriver(signatures.size());
// First assign each signature to a random driver (a driver is
// just represented as an entry in the signaturesForDriver
// vector).
for (Signature signature : signatures) {
// Now remove each entry that has no signatures.
auto firstExtra =
std::remove_if(signaturesForDriver.begin(), signaturesForDriver.end(),
[](const std::set<Signature>& sigSet) { return sigSet.empty(); });
if (firstExtra != signaturesForDriver.end()) {
signaturesForDriver.erase(firstExtra, signaturesForDriver.end());
// Now actually create the drivers.
std::vector<std::shared_ptr<Device>> devices;
for (unsigned i = 0; i < signaturesForDriver.size(); i++) {
const std::string name = "TestDriver(" + std::to_string(i) + ")";
name, new TestDriver(name.c_str(), signaturesForDriver[i])));
// Partitioned compilation.
// For test cases without unknown intermediate operand sizes we require the
// partitioning to succeed without CPU fallback. With unknown sizes we
// retry with a fallback if the non-fallback partitioning fails and require
// the fallback to succeed.
TestCompilation cNoFallback(&model);
TestCompilation cWithFallback(&model);
TestCompilation *c2 = nullptr;
auto compilationResult = cNoFallback.finish(devices);
if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
cNoFallback.getExecutionPlan().forTest_hasSubModelOutputsOfUnknownSize()) {
ASSERT_EQ(cWithFallback.finish(devices), Result::NO_ERROR);
c2 = &cWithFallback;
} else {
ASSERT_EQ(compilationResult, Result::NO_ERROR);
c2 = &cNoFallback;
#ifdef VERBOSE
std::cout << "signatures = " << signatures.size()
<< ", devices = " << devices.size() << std::endl;
const ExecutionPlan& plan = c2->getExecutionPlan();
switch (plan.forTest_getKind()) {
case ExecutionPlan::Kind::SIMPLE:
std::cout << "plan: simple" << std::endl;
case ExecutionPlan::Kind::COMPOUND: {
const auto& steps = plan.forTest_compoundGetSteps();
std::set<const Device*> devicesInPlan;
for (const auto& step : steps) {
std::cout << "plan: compound, " << steps.size() << " steps over "
<< devicesInPlan.size() << " devices" << std::endl;
for (unsigned i = 0; i < steps.size(); i++) {
std::cout << "Step " << i << ": "
<< ModelStats(steps[i]->getSubModel()) << std::endl;
std::cout << "Unexpected plan kind: "
<< static_cast<unsigned>(plan.forTest_getKind());
// For execution:
// - create master inputs (one long vector) and master output value
// - master inputs will be copied to actual inputs before each
// of the two executions
// - master output will be used to fill actual outputs before each
// of the two executions
// - create actual inputs and outputs
// - first execution (non-partitioned)
// - initialize inputs and (to avoid unrelated oddities) outputs
// - execute
// - copy outputs to a save area (one long vector)
// - second execution (partitioned)
// - (to avoid unrelated oddities) initialize inputs and outputs
// - execute
// - compare outputs to save area
// If the runtime and drivers are working properly, execution
// should not change the inputs. Nonetheless, we reinitialize the
// inputs for each execution, so as to avoid unrelated problems
// appearing to be problems related to unpartitioned execution
// versus partitioned execution. Similarly, execution behavior
// should not be dependent on the outputs; but we'll initialize the
// outputs anyway.
std::vector<float> masterInputs(problemSize * problemSize * model.inputCount());
std::generate(masterInputs.begin(), masterInputs.end(), [this]{ return randFrac(); });
const float masterOutput = randFrac();
// Create the memory for the actual inputs and outputs.
// Describes one model input or output used by this test, together with
// where its backing storage lives.
struct InputOutputDescriptor {
enum Kind { INPUT, OUTPUT };
// Whether this descriptor stands for a model input or a model output.
Kind mKind;
// The input or output either resides in a local buffer
// (mVector, in which case mMemoryRegion is ignored); or in a
// shared memory region within a TestMemories instance
// (mMemoryRegion, in which case mVector is ignored).
enum Location { VECTOR, REGION };
// The location is inferred from mVector: a non-empty mVector means
// VECTOR, an empty one means REGION.
Location getLocation() const { return !mVector.empty() ? VECTOR : REGION; }
// Local buffer holding the data when the location is VECTOR.
std::vector<float> mVector;
// Region identifier (as returned by TestMemories::addRegion()) when the
// location is REGION.
unsigned mMemoryRegion;
std::vector<InputOutputDescriptor> ioDescriptors(model.inputCount() + model.outputCount());
for (unsigned i = 0; i < ioDescriptors.size(); i++) {
ioDescriptors[i].mKind = (i < model.inputCount()
? InputOutputDescriptor::INPUT
: InputOutputDescriptor::OUTPUT);
// We randomly interleave inputs and outputs in creation
// order, because when we create memory regions in a
// TestMemories instance, the order in which regions are
// created within a single Memory is the order they'll be laid
// out in that memory; and when we have inputs and outputs
// within the same Memory, we want the possibility that
// they'll be interleaved.
std::random_shuffle(ioDescriptors.begin(), ioDescriptors.end(),
[this](unsigned n) { return randUInt(n); });
TestMemories ioMemories;
for (auto &desc : ioDescriptors) {
if (randFrac() < 0.5) {
desc.mVector.resize(problemSize * problemSize);
} else {
// TODO: common this with the way we create IK_VALUE inputs?
unsigned memoryIndex = ~0U;
if ((ioMemories.memoryCount() != 0) && (randFrac() < 0.5)) {
memoryIndex = randUInt(ioMemories.memoryCount());
} else {
memoryIndex = ioMemories.addMemory();
const size_t length = problemSize * problemSize * sizeof(float);
desc.mMemoryRegion = ioMemories.addRegion(memoryIndex, length);
// Function to set up actual inputs and outputs (initializing them
// and telling the WrapperExecution about them).
auto prepareForExecution =
[&model, &ioDescriptors, &ioMemories,
&masterInputs, &masterOutput, problemSize, &problemType](WrapperExecution *e) {
uint32_t inputIndex = 0, outputIndex = 0;
for (auto &desc : ioDescriptors) {
if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
if (desc.mKind == InputOutputDescriptor::INPUT) {
const size_t inputOffset = inputIndex * problemSize * problemSize;
std::copy(masterInputs.begin() + inputOffset,
masterInputs.begin() + inputOffset + problemSize * problemSize,
e->setInput(inputIndex++, desc.mVector.data(),
desc.mVector.size() * sizeof(float));
} else {
desc.mVector.begin() + problemSize * problemSize,
e->setOutput(outputIndex++, desc.mVector.data(),
desc.mVector.size() * sizeof(float),
} else {
const WrapperMemory* memory;
uint32_t offset, length;
float* region =
&memory, &offset, &length));
assert(length == problemSize * problemSize * sizeof(float));
if (desc.mKind == InputOutputDescriptor::INPUT) {
const size_t inputOffset = inputIndex * problemSize * problemSize;
std::copy(masterInputs.begin() + inputOffset,
masterInputs.begin() + inputOffset + problemSize * problemSize,
e->setInputFromMemory(inputIndex++, memory, offset, length);
} else {
region + problemSize * problemSize,
e->setOutputFromMemory(outputIndex++, memory, offset, length,
assert(inputIndex == model.inputCount());
assert(outputIndex == model.outputCount());
// Non-partitioned execution.
WrapperExecution e(&c);
ASSERT_EQ(e.compute(), Result::NO_ERROR);
// Copy the outputs of the non-partitioned execution to a save area.
std::vector<float> nonPartitionedOutputs(problemSize * problemSize * model.outputCount());
uint32_t outputIndex = 0;
for (const auto& desc : ioDescriptors) {
if (desc.mKind != InputOutputDescriptor::OUTPUT) {
const size_t outputOffset = outputIndex * problemSize * problemSize;
if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
nonPartitionedOutputs.begin() + outputOffset);
} else {
float* region = static_cast<float*>(ioMemories.getRegion(desc.mMemoryRegion));
region + problemSize * problemSize,
nonPartitionedOutputs.begin() + outputOffset);
#ifdef VERBOSE
std::cout << "output[" << outputIndex << "] = {";
for (auto I = nonPartitionedOutputs.begin() + outputOffset,
E = nonPartitionedOutputs.begin() +
outputOffset + problemSize * problemSize;
I != E; I++) {
std::cout << " " << *I;
std::cout << " }" << std::endl;
// Partitioned execution.
WrapperExecution e2(c2);
ASSERT_EQ(e2.compute(), Result::NO_ERROR);
// Compare the outputs of the partitioned execution to the save
// area containing the outputs of the non-partitioned execution.
uint32_t outputIndex = 0;
for (const auto& desc : ioDescriptors) {
if (desc.mKind != InputOutputDescriptor::OUTPUT) {
const size_t outputOffset = outputIndex * problemSize * problemSize;
if (desc.getLocation() == InputOutputDescriptor::VECTOR) {
nonPartitionedOutputs.begin() + outputOffset));
} else {
float* region = static_cast<float*>(ioMemories.getRegion(desc.mMemoryRegion));
region + problemSize * problemSize,
nonPartitionedOutputs.begin() + outputOffset));
} // namespace
} // namespace android