/*-------------------------------------------------------------------------
 * drawElements Quality Program OpenGL ES 2.0 Module
 * -------------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Shader operator performance tests.
 *//*--------------------------------------------------------------------*/

#include "es2pShaderOperatorTests.hpp"
#include "glsCalibration.hpp"
#include "gluShaderUtil.hpp"
#include "gluShaderProgram.hpp"
#include "gluPixelTransfer.hpp"
#include "tcuTestLog.hpp"
#include "tcuRenderTarget.hpp"
#include "tcuCommandLine.hpp"
#include "tcuSurface.hpp"
#include "deStringUtil.hpp"
#include "deSharedPtr.hpp"
#include "deClock.h"
#include "deMath.h"

#include "glwEnums.hpp"
#include "glwFunctions.hpp"

#include <map>
#include <algorithm>
#include <limits>
#include <set>

namespace deqp
{
namespace gles2
{
namespace Performance
{

using namespace gls;
using namespace glu;
using tcu::Vec2;
using tcu::Vec4;
using tcu::TestLog;
using de::SharedPtr;

using std::string;
using std::vector;

// Abort the case with an internal error when measurements are too noisy/flat to estimate from.
#define MEASUREMENT_FAIL() throw tcu::InternalError("Unable to get sensible measurements for estimation", DE_NULL, __FILE__, __LINE__)

// Number of measurements in OperatorPerformanceCase for each workload size, unless specified otherwise by a command line argument.
static const int	DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD	= 3;
// How many different workload sizes are used by OperatorPerformanceCase.
static const int	NUM_WORKLOADS							= 8;
// Maximum workload size that can be attempted. In a sensible case, this most likely won't be reached.
static const int	MAX_WORKLOAD_SIZE						= 1<<29;

// BinaryOpCase-specific constants for shader generation.
static const int	BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS	= 4;
static const int	BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT	= 2;
static const int	BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT		= 4;

// FunctionCase-specific constants for shader generation.
static const int	FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS			= 4;

// Swizzle strings of lengths 1..4 (columns), starting from each of x/y/z/w (rows);
// used by the shader generators below (outside this chunk) for component access.
static const char* const s_swizzles[][4] =
{
	{ "x", "yx", "yzx", "wzyx" },
	{ "y", "zy", "wyz", "xwzy" },
	{ "z", "wy", "zxy", "yzwx" },
	{ "w", "xw", "yxw", "zyxw" }
};

// Component-wise arithmetic mean of a list of vectors.
template <int N>
static tcu::Vector<float, N> mean (const vector<tcu::Vector<float, N> >& data)
{
	tcu::Vector<float, N> total(0.0f);
	for (int ndx = 0; ndx < (int)data.size(); ndx++)
		total += data[ndx];
	return total / tcu::Vector<float, N>((float)data.size());
}

// Call the glUniform{1..4}fv entry point matching the component count n.
static void uniformNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
{
	if (n == 1)
		gl.uniform1fv(location, count, data);
	else if (n == 2)
		gl.uniform2fv(location, count, data);
	else if (n == 3)
		gl.uniform3fv(location, count, data);
	else if (n == 4)
		gl.uniform4fv(location, count, data);
	else
		DE_ASSERT(false);
}

// Call the glUniform{1..4}iv entry point matching the component count n.
static void uniformNiv (const glw::Functions& gl, int n, int location, int count, const int* data)
{
	if (n == 1)
		gl.uniform1iv(location, count, data);
	else if (n == 2)
		gl.uniform2iv(location, count, data);
	else if (n == 3)
		gl.uniform3iv(location, count, data);
	else if (n == 4)
		gl.uniform4iv(location, count, data);
	else
		DE_ASSERT(false);
}

// Call the glUniformMatrix{2..4}fv entry point matching the matrix size n (no transpose).
static void uniformMatrixNfv (const glw::Functions& gl, int n, int location, int count, const float* data)
{
	if (n == 2)
		gl.uniformMatrix2fv(location, count, GL_FALSE, &data[0]);
	else if (n == 3)
		gl.uniformMatrix3fv(location, count, GL_FALSE, &data[0]);
	else if (n == 4)
		gl.uniformMatrix4fv(location, count, GL_FALSE, &data[0]);
	else
		DE_ASSERT(false);
}

// Map a component count to a GLSL type: 1 -> float, 2..4 -> vec2..vec4.
static glu::DataType getDataTypeFloatOrVec (int size)
{
	if (size == 1)
		return glu::TYPE_FLOAT;
	return glu::getDataTypeFloatVec(size);
}

// Return the iteration count given on the command line, or def if it wasn't
// specified (a non-positive value means "not specified").
static int getIterationCountOrDefault (const tcu::CommandLine& cmdLine, int def)
{
	const int fromCmdLine = cmdLine.getTestIterationCount();
	if (fromCmdLine > 0)
		return fromCmdLine;
	return def;
}

// Human-readable "y = offset + coefficient*x" representation of a fitted line.
static string lineParamsString (const LineParameters& params)
{
	string result = "y = ";
	result += de::toString(params.offset);
	result += " + ";
	result += de::toString(params.coefficient);
	result += "*x";
	return result;
}

namespace
{

/*--------------------------------------------------------------------*//*!
 * \brief Abstract class for measuring shader operator performance.
 *
 * This class draws multiple times with different workload sizes (set
 * via a uniform, by subclass). Time for each frame is measured, and the
 * slope of the workload size vs frame time data is estimated. This slope
 * tells us the estimated increase in frame time caused by a workload
 * increase of 1 unit (what 1 workload unit means is up to subclass).
 *
 * Generally, the shaders contain not just the operation we're interested
 * in (e.g. addition) but also some other stuff (e.g. loop overhead). To
 * eliminate this cost, we actually do the stuff described in the above
 * paragraph with multiple programs (usually two), which contain different
 * kinds of workload (e.g. different loop contents). Then we can (in
 * theory) compute the cost of just one operation in a subclass-dependent
 * manner.
 *
 * At this point, the result tells us the increase in frame time caused
 * by the addition of one operation. Dividing this by the amount of
 * draw calls in a frame, and further by the amount of vertices or
 * fragments in a draw call, we get the time cost of one operation.
 *
 * In reality, there sometimes isn't just a trivial linear dependence
 * between workload size and frame time. Instead, there tends to be some
 * amount of initial "free" operations. That is, it may be that all
 * workload sizes below some positive integer C yield the same frame time,
 * and only workload sizes beyond C increase the frame time in a supposedly
 * linear manner. Graphically, this means that there graph consists of two
 * parts: a horizontal left part, and a linearly increasing right part; the
 * right part starts where the left parts ends. The principal task of these
 * tests is to look at the slope of the increasing right part. Additionally
 * an estimate for the amount of initial free operations is calculated.
 * Note that it is also normal to get graphs where the horizontal left part
 * is of zero width, i.e. there are no free operations.
 *//*--------------------------------------------------------------------*/
class OperatorPerformanceCase : public tcu::TestCase
{
public:
	// Whether the measured workload lives in the vertex or the fragment shader.
	enum CaseType
	{
		CASETYPE_VERTEX = 0,
		CASETYPE_FRAGMENT,

		CASETYPE_LAST
	};

	// Holds the draw call count used as the calibrator's starting point; passed
	// in via shared pointer so the value can persist outside a single case.
	struct InitialCalibration
	{
		int initialNumCalls;
		InitialCalibration (void) : initialNumCalls(1) {}
	};

	typedef SharedPtr<InitialCalibration> InitialCalibrationStorage;

								OperatorPerformanceCase		(tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
															 CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage);
								~OperatorPerformanceCase	(void);

	void						init						(void);
	void						deinit						(void);

	IterateResult				iterate						(void);

	// A vertex attribute defined by its values at the four grid corners;
	// interior grid points get interpolated values (see generateVertices()).
	struct AttribSpec
	{
		AttribSpec (const char* name_, const tcu::Vec4& p00_, const tcu::Vec4& p01_, const tcu::Vec4& p10_, const tcu::Vec4& p11_)
			: name		(name_)
			, p00		(p00_)
			, p01		(p01_)
			, p10		(p10_)
			, p11		(p11_)
		{
		}

		AttribSpec (void) {}

		std::string		name;
		tcu::Vec4		p00;	//!< Bottom left.
		tcu::Vec4		p01;	//!< Bottom right.
		tcu::Vec4		p10;	//!< Top left.
		tcu::Vec4		p11;	//!< Top right.
	};

protected:
	// Shader sources, input attributes and a log description for one of the
	// programs whose workload cost is measured.
	struct ProgramContext
	{
		string				vertShaderSource;
		string				fragShaderSource;
		vector<AttribSpec>	attributes;

		string				description;

		ProgramContext (void) {}
		ProgramContext (const string& vs, const string& fs, const vector<AttribSpec>& attrs, const string& desc)
			: vertShaderSource(vs), fragShaderSource(fs), attributes(attrs), description(desc) {}
	};

	//! Generates the programs to measure (usually two; see class comment above).
	virtual vector<ProgramContext>	generateProgramData					(void) const = 0;
	//! Sets program-specific uniforms that don't depend on the workload size.
	virtual void					setGeneralUniforms					(deUint32 program) const = 0;
	//! Sets the uniform(s) that specifies the workload size in the shader.
	virtual void					setWorkloadSizeUniform				(deUint32 program, int workload) const = 0;
	//! Computes the cost of a single operation, given the workload costs per program.
	virtual float					computeSingleOperationTime			(const vector<float>& perProgramWorkloadCosts) const = 0;
	//! Logs a human-readable description of what computeSingleOperationTime does.
	virtual void					logSingleOperationCalculationInfo	(void) const = 0;

	glu::RenderContext&				m_renderCtx;

	CaseType						m_caseType;

private:
	enum State
	{
		STATE_CALIBRATING = 0,		//!< Calibrate draw call count, using first program in m_programs, with workload size 1.
		STATE_FIND_HIGH_WORKLOAD,	//!< Find an appropriate lower bound for the highest workload size we intend to use (one with high-enough frame time compared to workload size 1) for each program.
		STATE_MEASURING,			//!< Do actual measurements, for each program in m_programs.
		STATE_REPORTING,			//!< Measurements are done; calculate results and log.
		STATE_FINISHED,				//!< All done.

		STATE_LAST
	};

	// Frame time samples collected for one workload size.
	struct WorkloadRecord
	{
		int				workloadSize;
		vector<float>	frameTimes; //!< In microseconds.

				WorkloadRecord	(int workloadSize_)						: workloadSize(workloadSize_) {}
		bool	operator<		(const WorkloadRecord& other) const		{ return this->workloadSize < other.workloadSize; }
		void	addFrameTime	(float time)							{ frameTimes.push_back(time); }
		// Median frame time; mean of the two middle samples for even counts.
		float	getMedianTime	(void) const
		{
			vector<float> times = frameTimes;
			std::sort(times.begin(), times.end());
			return times.size() % 2 == 0 ?
					(times[times.size()/2-1] + times[times.size()/2])*0.5f :
					times[times.size()/2];
		}
	};

	void								prepareProgram				(int progNdx);					//!< Sets attributes and uniforms for m_programs[progNdx].
	void								prepareWorkload				(int progNdx, int workload);	//!< Calls setWorkloadSizeUniform and draws, in case the implementation does some draw-time compilation.
	void								prepareNextRound			(void);							//!< Increases workload and/or updates m_state.
	void								render						(int numDrawCalls);
	deUint64							renderAndMeasure			(int numDrawCalls);
	void								adjustAndLogGridAndViewport	(void);							//!< Log grid and viewport sizes, after possibly reducing them to reduce draw time.

	vector<Vec2>						getWorkloadMedianDataPoints	(int progNdx) const; //!< [ Vec2(r.workloadSize, r.getMedianTime()) for r in m_workloadRecords[progNdx] ]

	const int							m_numMeasurementsPerWorkload;
	const int							m_numWorkloads;				//!< How many different workload sizes are used for measurement for each program.

	int									m_workloadNdx;				//!< Runs from 0 to m_numWorkloads-1.

	int									m_workloadMeasurementNdx;
	vector<vector<WorkloadRecord> >		m_workloadRecordsFindHigh;	//!< The measurements done during STATE_FIND_HIGH_WORKLOAD.
	vector<vector<WorkloadRecord> >		m_workloadRecords;			//!< The measurements of each program in m_programs. Generated during STATE_MEASURING, into index specified by m_measureProgramNdx.

	State								m_state;
	int									m_measureProgramNdx;		//!< When m_state is STATE_FIND_HIGH_WORKLOAD or STATE_MEASURING, this tells which program in m_programs is being measured.

	vector<int>							m_highWorkloadSizes;		//!< The first workload size encountered during STATE_FIND_HIGH_WORKLOAD that was determined suitable, for each program.

	TheilSenCalibrator					m_calibrator;
	InitialCalibrationStorage			m_initialCalibrationStorage;

	int									m_viewportWidth;
	int									m_viewportHeight;
	int									m_gridSizeX;
	int									m_gridSizeY;

	vector<ProgramContext>				m_programData;
	vector<SharedPtr<ShaderProgram> >	m_programs;

	std::vector<deUint32>				m_attribBuffers;			//!< One GL buffer object per attribute (see prepareProgram()).
};

// Interpolate over a triangle with value v0 at the origin, v2 varying along
// the x axis and v1 varying along the y axis.
static inline float triangleInterpolate (float v0, float v1, float v2, float x, float y)
{
	const float gradX = v2 - v0;
	const float gradY = v1 - v0;
	return v0 + gradX*x + gradY*y;
}

// Interpolate over a quad that is split into two triangles along the
// x + y == 1 diagonal; quad components are (x()=p00, y()=p01, z()=p10, w()=p11).
static inline float triQuadInterpolate (float x, float y, const tcu::Vec4& quad)
{
	// \note Top left fill rule.
	const bool inLowerLeftTriangle = x + y < 1.0f;

	if (inLowerLeftTriangle)
		return triangleInterpolate(quad.x(), quad.y(), quad.z(), x, y);

	return triangleInterpolate(quad.w(), quad.z(), quad.y(), 1.0f-x, 1.0f-y);
}

// Total vertex count for the triangle grid: two triangles of three vertices per cell.
static inline int getNumVertices (int gridSizeX, int gridSizeY)
{
	const int numCells			= gridSizeX * gridSizeY;
	const int verticesPerCell	= 2 * 3;
	return numCells * verticesPerCell;
}

// Fill dst with per-vertex 4-component attribute data for a gridSizeX x gridSizeY
// triangle grid; each vertex value is interpolated from the four corner values in spec.
static void generateVertices (std::vector<float>& dst, int gridSizeX, int gridSizeY, const OperatorPerformanceCase::AttribSpec& spec)
{
	const int numComponents = 4;

	DE_ASSERT(gridSizeX >= 1 && gridSizeY >= 1);
	dst.resize(getNumVertices(gridSizeX, gridSizeY) * numComponents);

	{
		int dstNdx = 0;

		for (int cellY = 0; cellY < gridSizeY; cellY++)
		for (int cellX = 0; cellX < gridSizeX; cellX++)
		{
			const float xf0 = (float)(cellX + 0) / (float)gridSizeX;
			const float yf0 = (float)(cellY + 0) / (float)gridSizeY;
			const float xf1 = (float)(cellX + 1) / (float)gridSizeX;
			const float yf1 = (float)(cellY + 1) / (float)gridSizeY;

			// Two triangles per cell, same diagonal split as triQuadInterpolate.
			const float vertexCoords[6][2] =
			{
				{ xf0, yf0 }, { xf1, yf0 }, { xf0, yf1 },
				{ xf1, yf0 }, { xf1, yf1 }, { xf0, yf1 }
			};

			for (int vtxNdx = 0; vtxNdx < 6; vtxNdx++)
			for (int compNdx = 0; compNdx < numComponents; compNdx++)
				dst[dstNdx++] = triQuadInterpolate(vertexCoords[vtxNdx][0], vertexCoords[vtxNdx][1],
												   tcu::Vec4(spec.p00[compNdx], spec.p01[compNdx], spec.p10[compNdx], spec.p11[compNdx]));
		}
	}
}

// X coordinate where the two lines cross, solved from
// a.offset + a.coefficient*x == b.offset + b.coefficient*x.
static float intersectionX (const gls::LineParameters& a, const gls::LineParameters& b)
{
	const float offsetDelta	= a.offset - b.offset;
	const float slopeDelta	= b.coefficient - a.coefficient;
	return offsetDelta / slopeDelta;
}

static int numDistinctX (const vector<Vec2>& data)
{
	std::set<float> xs;
	for (int i = 0; i < (int)data.size(); i++)
		xs.insert(data[i].x());
	return (int)xs.size();
}

// Ordinary least-squares line fit: slope = cov(x,y) / var(x), offset chosen so
// the line passes through the centroid of the data.
static gls::LineParameters simpleLinearRegression (const vector<Vec2>& data)
{
	const Vec2	centroid	= mean(data);

	float		covXY		= 0.0f;
	float		varX		= 0.0f;

	for (int ndx = 0; ndx < (int)data.size(); ndx++)
	{
		const Vec2 delta = data[ndx] - centroid;

		covXY	+= delta.x()*delta.y();
		varX	+= delta.x()*delta.x();
	}

	const float slope	= covXY / varX;
	const float offset	= centroid.y() - slope*centroid.x();

	return gls::LineParameters(offset, slope);
}

static float simpleLinearRegressionError (const vector<Vec2>& data)
{
	if (numDistinctX(data) <= 2)
		return 0.0f;
	else
	{
		const gls::LineParameters	estimator	= simpleLinearRegression(data);
		float						error		= 0.0f;

		for (int i = 0; i < (int)data.size(); i++)
		{
			const float estY = estimator.offset + estimator.coefficient*data[i].x();
			const float diff = estY - data[i].y();
			error += diff*diff;
		}

		return error / (float)data.size();
	}
}

// Variance of the y coordinates (i.e. squared error against a horizontal line
// through the mean). Reported as zero when there are at most two distinct x coordinates.
static float verticalVariance (const vector<Vec2>& data)
{
	if (numDistinctX(data) <= 2)
		return 0.0f;

	const float	meanY		= mean(data).y();
	float		sqDiffSum	= 0.0f;

	for (int ndx = 0; ndx < (int)data.size(); ndx++)
	{
		const float diff = meanY - data[ndx].y();
		sqDiffSum += diff*diff;
	}

	return sqDiffSum / (float)data.size();
}

/*--------------------------------------------------------------------*//*!
 * \brief Find the x coord that divides the input data into two slopes.
 *
 * The operator performance measurements tend to produce results where
 * we get small operation counts "for free" (e.g. because the operations
 * are performed during some memory transfer overhead or something),
 * resulting in a curve with two parts: an initial horizontal line segment,
 * and a rising line.
 *
 * This function finds the x coordinate that divides the input data into
 * two parts such that the sum of the mean square errors for the
 * least-squares estimated lines for the two parts is minimized, under the
 * additional condition that the left line is horizontal.
 *
 * This function returns a number X s.t. { pt | pt is in data, pt.x >= X }
 * is the right line, and the rest of data is the left line.
 *//*--------------------------------------------------------------------*/
static float findSlopePivotX (const vector<Vec2>& data)
{
	std::set<float> xCoords;
	for (int i = 0; i < (int)data.size(); i++)
		xCoords.insert(data[i].x());

	float			lowestError		= std::numeric_limits<float>::infinity();
	float			bestPivotX		= -std::numeric_limits<float>::infinity();

	for (std::set<float>::const_iterator pivotX = xCoords.begin(); pivotX != xCoords.end(); ++pivotX)
	{
		vector<Vec2> leftData;
		vector<Vec2> rightData;
		for (int i = 0; i < (int)data.size(); i++)
		{
			if (data[i].x() < *pivotX)
				leftData.push_back(data[i]);
			else
				rightData.push_back(data[i]);
		}

		if (numDistinctX(rightData) < 3) // We don't trust the right data if there's too little of it.
			break;

		{
			const float totalError = verticalVariance(leftData) + simpleLinearRegressionError(rightData);

			if (totalError < lowestError)
			{
				lowestError = totalError;
				bestPivotX = *pivotX;
			}
		}
	}

	DE_ASSERT(lowestError < std::numeric_limits<float>::infinity());

	return bestPivotX;
}

// Result of computeSegmentedEstimator: line fits for the flat left part and the
// rising right part of the measurement data.
struct SegmentedEstimator
{
	float					pivotX; //!< Value returned by findSlopePivotX, or -infinity if only single line.
	gls::LineParameters		left;	//!< Fit for points with x < pivotX (horizontal when data is a single line).
	gls::LineParameters		right;	//!< Fit for points with x >= pivotX.
	SegmentedEstimator (const gls::LineParameters& l, const gls::LineParameters& r, float pivotX_) : pivotX(pivotX_), left(l), right(r) {}
};

/*--------------------------------------------------------------------*//*!
 * \brief Compute line estimators for (potentially) two-segment data.
 *
 * Splits the given data into left and right parts (using findSlopePivotX)
 * and returns the line estimates for them.
 *
 * Sometimes, however (especially in fragment shader cases) the data is
 * in fact not segmented, but a straight line. This function attempts to
 * detect if this the case, and if so, sets left.offset = right.offset and
 * left.slope = 0, meaning essentially that the initial "flat" part of the
 * data has zero width.
 *//*--------------------------------------------------------------------*/
static SegmentedEstimator computeSegmentedEstimator (const vector<Vec2>& data)
{
	const float		pivotX = findSlopePivotX(data);
	vector<Vec2>	leftData;
	vector<Vec2>	rightData;

	// Split the data at the pivot.
	for (int ndx = 0; ndx < (int)data.size(); ndx++)
	{
		vector<Vec2>& destPart = data[ndx].x() < pivotX ? leftData : rightData;
		destPart.push_back(data[ndx]);
	}

	const gls::LineParameters leftLine	= gls::theilSenLinearRegression(leftData);
	const gls::LineParameters rightLine	= gls::theilSenLinearRegression(rightData);

	// If the left part is too small, or rises almost as steeply as the right
	// part, treat the whole data as one line with a zero-width flat part.
	if (numDistinctX(leftData) < 2 || leftLine.coefficient > rightLine.coefficient*0.5f)
	{
		const gls::LineParameters wholeLine = gls::theilSenLinearRegression(data);
		return SegmentedEstimator(gls::LineParameters(wholeLine.offset, 0.0f), wholeLine, -std::numeric_limits<float>::infinity());
	}

	return SegmentedEstimator(leftLine, rightLine, pivotX);
}

OperatorPerformanceCase::OperatorPerformanceCase (tcu::TestContext& testCtx, glu::RenderContext& renderCtx, const char* name, const char* description,
												  CaseType caseType, int numWorkloads, const InitialCalibrationStorage& initialCalibrationStorage)
	: tcu::TestCase					(testCtx, tcu::NODETYPE_PERFORMANCE, name, description)
	, m_renderCtx					(renderCtx)
	, m_caseType					(caseType)
	, m_numMeasurementsPerWorkload	(getIterationCountOrDefault(m_testCtx.getCommandLine(), DEFAULT_NUM_MEASUREMENTS_PER_WORKLOAD))
	, m_numWorkloads				(numWorkloads)
	, m_workloadNdx					(-1)
	, m_workloadMeasurementNdx		(-1)
	, m_state						(STATE_LAST)
	, m_measureProgramNdx			(-1)
	, m_initialCalibrationStorage	(initialCalibrationStorage)
	// Vertex cases: small 32x32 viewport, dense 100x100 grid.
	// Fragment cases: full render target viewport, minimal 1x1 grid.
	, m_viewportWidth				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getWidth())
	, m_viewportHeight				(caseType == CASETYPE_VERTEX	? 32	: renderCtx.getRenderTarget().getHeight())
	, m_gridSizeX					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
	, m_gridSizeY					(caseType == CASETYPE_FRAGMENT	? 1		: 100)
{
	DE_ASSERT(m_numWorkloads > 0);
}

OperatorPerformanceCase::~OperatorPerformanceCase (void)
{
	// Release attribute buffer objects (also done in deinit()).
	if (!m_attribBuffers.empty())
	{
		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
		m_attribBuffers.clear();
	}
}

// Log the render target's size, channel bit depths and MSAA status.
static void logRenderTargetInfo (TestLog& log, const tcu::RenderTarget& renderTarget)
{
	log << TestLog::Section("RenderTarget", "Render target");
	log << TestLog::Message << "size: " << renderTarget.getWidth() << "x" << renderTarget.getHeight() << TestLog::EndMessage;
	log << TestLog::Message << "bits:"
							<< " R" << renderTarget.getPixelFormat().redBits
							<< " G" << renderTarget.getPixelFormat().greenBits
							<< " B" << renderTarget.getPixelFormat().blueBits
							<< " A" << renderTarget.getPixelFormat().alphaBits
							<< " D" << renderTarget.getDepthBits()
							<< " S" << renderTarget.getStencilBits()
							<< TestLog::EndMessage;

	if (renderTarget.getNumSamples() == 0)
		log << TestLog::Message << "No MSAA" << TestLog::EndMessage;
	else
		log << TestLog::Message << renderTarget.getNumSamples() << "x MSAA" << TestLog::EndMessage;

	log << TestLog::EndSection;
}

// One (workloadSize, medianFrameTime) point per measured workload of the given program.
vector<Vec2> OperatorPerformanceCase::getWorkloadMedianDataPoints (int progNdx) const
{
	const vector<WorkloadRecord>&	records = m_workloadRecords[progNdx];
	vector<Vec2>					points;

	points.reserve(records.size());
	for (int recNdx = 0; recNdx < (int)records.size(); recNdx++)
		points.push_back(Vec2((float)records[recNdx].workloadSize, records[recNdx].getMedianTime()));

	return points;
}

// Uploads vertex data, binds attributes, activates the program and sets its
// non-workload uniforms and the viewport for m_programs[progNdx].
void OperatorPerformanceCase::prepareProgram (int progNdx)
{
	DE_ASSERT(progNdx < (int)m_programs.size());
	DE_ASSERT(m_programData.size() == m_programs.size());

	const glw::Functions&	gl			= m_renderCtx.getFunctions();
	const ShaderProgram&	program		= *m_programs[progNdx];

	vector<AttribSpec>		attributes	= m_programData[progNdx].attributes;

	// Every program additionally gets a full-viewport a_position attribute.
	attributes.push_back(AttribSpec("a_position",
									Vec4(-1.0f, -1.0f, 0.0f, 1.0f),
									Vec4( 1.0f, -1.0f, 0.0f, 1.0f),
									Vec4(-1.0f,  1.0f, 0.0f, 1.0f),
									Vec4( 1.0f,  1.0f, 0.0f, 1.0f)));

	DE_ASSERT(program.isOk());

	// Generate vertices. Old buffers are deleted first; the grid size may have
	// changed since the previous preparation (see adjustAndLogGridAndViewport()).
	if (!m_attribBuffers.empty())
		gl.deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
	m_attribBuffers.resize(attributes.size(), 0);
	gl.genBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
	GLU_EXPECT_NO_ERROR(gl.getError(), "glGenBuffers()");

	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
	{
		std::vector<float> vertices;
		generateVertices(vertices, m_gridSizeX, m_gridSizeY, attributes[attribNdx]);

		gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
		gl.bufferData(GL_ARRAY_BUFFER, (glw::GLsizeiptr)(vertices.size()*sizeof(float)), &vertices[0], GL_STATIC_DRAW);
		GLU_EXPECT_NO_ERROR(gl.getError(), "Upload buffer data");
	}

	// Setup attribute bindings.
	for (int attribNdx = 0; attribNdx < (int)attributes.size(); attribNdx++)
	{
		int location = gl.getAttribLocation(program.getProgram(), attributes[attribNdx].name.c_str());

		// A negative location means the attribute is not active in this program; skip it.
		if (location >= 0)
		{
			gl.enableVertexAttribArray(location);
			gl.bindBuffer(GL_ARRAY_BUFFER, m_attribBuffers[attribNdx]);
			gl.vertexAttribPointer(location, 4, GL_FLOAT, GL_FALSE, 0, DE_NULL);
		}
	}
	GLU_EXPECT_NO_ERROR(gl.getError(), "Setup vertex input state");

	gl.useProgram(program.getProgram());
	setGeneralUniforms(program.getProgram());
	gl.viewport(0, 0, m_viewportWidth, m_viewportHeight);
}

// Sets the workload-size uniform and does a warm-up render, in case the
// implementation does some draw-time compilation.
void OperatorPerformanceCase::prepareWorkload (int progNdx, int workload)
{
	setWorkloadSizeUniform(m_programs[progNdx]->getProgram(), workload);
	render(m_calibrator.getCallCount());
}

// State-machine step: advances to the next workload size, next program, or next
// state, and prepares uniforms/warm-up rendering for the upcoming measurements.
void OperatorPerformanceCase::prepareNextRound (void)
{
	DE_ASSERT(m_state == STATE_CALIBRATING			||
			  m_state == STATE_FIND_HIGH_WORKLOAD	||
			  m_state == STATE_MEASURING);

	TestLog& log = m_testCtx.getLog();

	// Calibration finished; start finding high workload sizes, beginning with program 0.
	if (m_state == STATE_CALIBRATING && m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
	{
		m_measureProgramNdx = 0;
		m_state = STATE_FIND_HIGH_WORKLOAD;
	}

	if (m_state == STATE_CALIBRATING)
		prepareWorkload(0, 1);
	else if (m_state == STATE_FIND_HIGH_WORKLOAD)
	{
		vector<WorkloadRecord>& records = m_workloadRecordsFindHigh[m_measureProgramNdx];

		// Keep doubling the workload until the median frame time is at least
		// twice that of workload size 1.
		if (records.empty() || records.back().getMedianTime() < 2.0f*records[0].getMedianTime())
		{
			int workloadSize;

			if (records.empty())
				workloadSize = 1;
			else
			{
				workloadSize = records.back().workloadSize*2;

				if (workloadSize > MAX_WORKLOAD_SIZE)
				{
					log << TestLog::Message << "Even workload size " << records.back().workloadSize
											<< " doesn't give high enough frame time for program " << m_measureProgramNdx
											<< ". Can't get sensible result." << TestLog::EndMessage;
					MEASUREMENT_FAIL();
				}
			}

			records.push_back(WorkloadRecord(workloadSize));
			// NOTE(review): program index 0 is passed here even when
			// m_measureProgramNdx > 0 (and that program is the one prepared) —
			// presumably this should be m_measureProgramNdx; verify.
			prepareWorkload(0, workloadSize);
			m_workloadMeasurementNdx = 0;
		}
		else
		{
			// High workload found for this program; move to the next program,
			// or to STATE_MEASURING once all programs are done.
			m_highWorkloadSizes[m_measureProgramNdx] = records.back().workloadSize;
			m_measureProgramNdx++;

			if (m_measureProgramNdx >= (int)m_programs.size())
			{
				m_state = STATE_MEASURING;
				m_workloadNdx = -1;
				m_measureProgramNdx = 0;
			}

			prepareProgram(m_measureProgramNdx);
			prepareNextRound();
		}
	}
	else
	{
		m_workloadNdx++;

		if (m_workloadNdx < m_numWorkloads)
		{
			DE_ASSERT(m_numWorkloads > 1);
			// Spread the m_numWorkloads measured sizes evenly over [1, highWorkload]
			// (or just use 1, 2, 3, ... when highWorkload is small).
			const int highWorkload	= m_highWorkloadSizes[m_measureProgramNdx];
			const int workload		= highWorkload > m_numWorkloads ?
										1 + m_workloadNdx*(highWorkload-1)/(m_numWorkloads-1) :
										1 + m_workloadNdx;

			prepareWorkload(m_measureProgramNdx, workload);

			m_workloadMeasurementNdx = 0;

			m_workloadRecords[m_measureProgramNdx].push_back(WorkloadRecord(workload));
		}
		else
		{
			// All workloads measured for this program; move to the next program,
			// or to reporting once all programs are done.
			m_measureProgramNdx++;

			if (m_measureProgramNdx < (int)m_programs.size())
			{
				m_workloadNdx = -1;
				m_workloadMeasurementNdx = 0;
				prepareProgram(m_measureProgramNdx);
				prepareNextRound();
			}
			else
				m_state = STATE_REPORTING;
		}
	}
}

// Compiles and logs all programs, sets up GL state and the calibrator, and
// kicks off the state machine in STATE_CALIBRATING.
void OperatorPerformanceCase::init (void)
{
	TestLog&				log		= m_testCtx.getLog();
	const glw::Functions&	gl		= m_renderCtx.getFunctions();

	// Validate that we have sane grid and viewport setup.
	DE_ASSERT(de::inBounds(m_gridSizeX, 1, 256) && de::inBounds(m_gridSizeY, 1, 256));
	TCU_CHECK(de::inRange(m_viewportWidth,	1, m_renderCtx.getRenderTarget().getWidth()) &&
			  de::inRange(m_viewportHeight,	1, m_renderCtx.getRenderTarget().getHeight()));

	logRenderTargetInfo(log, m_renderCtx.getRenderTarget());

	log << TestLog::Message << "Using additive blending." << TestLog::EndMessage;
	gl.enable(GL_BLEND);
	gl.blendEquation(GL_FUNC_ADD);
	gl.blendFunc(GL_ONE, GL_ONE);

	// Generate programs.
	DE_ASSERT(m_programs.empty());
	m_programData = generateProgramData();
	DE_ASSERT(!m_programData.empty());

	for (int progNdx = 0; progNdx < (int)m_programData.size(); progNdx++)
	{
		const string& vert = m_programData[progNdx].vertShaderSource;
		const string& frag = m_programData[progNdx].fragShaderSource;

		m_programs.push_back(SharedPtr<ShaderProgram>(new ShaderProgram(m_renderCtx, glu::makeVtxFragSources(vert, frag))));

		if (!m_programs.back()->isOk())
		{
			log << *m_programs.back();
			TCU_FAIL("Compile failed");
		}
	}

	// Log all programs.
	for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
		log << TestLog::Section("Program" + de::toString(progNdx), "Program " + de::toString(progNdx))
				<< TestLog::Message << m_programData[progNdx].description << TestLog::EndMessage
				<< *m_programs[progNdx]
			<< TestLog::EndSection;

	m_highWorkloadSizes.resize(m_programData.size());
	m_workloadRecordsFindHigh.resize(m_programData.size());
	m_workloadRecords.resize(m_programData.size());

	// Start the calibrator from the previously stored call count (1 on first use).
	m_calibrator.clear(CalibratorParameters(m_initialCalibrationStorage->initialNumCalls, 10 /* calibrate iteration frames */, 2000.0f /* calibrate iteration shortcut threshold (ms) */, 16 /* max calibrate iterations */,
											1000.0f/30.0f /* frame time (ms) */, 1000.0f/60.0f /* frame time cap (ms) */, 1000.0f /* target measure duration (ms) */));
	m_state = STATE_CALIBRATING;

	prepareProgram(0);
	prepareNextRound();
}

// Releases attribute buffer objects and shader programs.
void OperatorPerformanceCase::deinit (void)
{
	if (!m_attribBuffers.empty())
	{
		m_renderCtx.getFunctions().deleteBuffers((glw::GLsizei)m_attribBuffers.size(), &m_attribBuffers[0]);
		m_attribBuffers.clear();
	}

	m_programs.clear();
}

// Draws the full triangle grid numDrawCalls times with the currently prepared
// program and vertex state, then forces completion with a 1x1 pixel readback.
void OperatorPerformanceCase::render (int numDrawCalls)
{
	const glw::Functions&	gl				= m_renderCtx.getFunctions();
	const int				numVertices		= getNumVertices(m_gridSizeX, m_gridSizeY);

	for (int callNdx = 0; callNdx < numDrawCalls; callNdx++)
		gl.drawArrays(GL_TRIANGLES, 0, numVertices);

	glu::readPixels(m_renderCtx, 0, 0, tcu::Surface(1, 1).getAccess()); // \note Serves as a more reliable replacement for glFinish().
}

// Renders numDrawCalls draw calls and returns the elapsed wall-clock time in
// microseconds (render() includes a pixel-readback sync, so GPU work is counted).
deUint64 OperatorPerformanceCase::renderAndMeasure (int numDrawCalls)
{
	const deUint64 timeBefore = deGetMicroseconds();
	render(numDrawCalls);
	const deUint64 timeAfter = deGetMicroseconds();
	return timeAfter - timeBefore;
}

// After calibration, shrink the triangle grid (vertex cases) or viewport (fragment
// cases) if even a single draw call is slower than the calibrator's target frame
// time, then re-prepare program 0 and log the final grid/viewport dimensions.
void OperatorPerformanceCase::adjustAndLogGridAndViewport (void)
{
	TestLog& log = m_testCtx.getLog();

	// If call count is just 1, and the target frame time still wasn't reached, reduce grid or viewport size.
	if (m_calibrator.getCallCount() == 1)
	{
		const gls::MeasureState&	calibratorMeasure	= m_calibrator.getMeasureState();
		const float					drawCallTime		= (float)calibratorMeasure.getTotalTime() / (float)calibratorMeasure.frameTimes.size();
		const float					targetDrawCallTime	= m_calibrator.getParameters().targetFrameTimeUs;
		const float					targetRatio			= targetDrawCallTime / drawCallTime;

		if (targetRatio < 0.95f)
		{
			// Reduce grid or viewport size assuming draw call time scales proportionally.
			// Scaling both dimensions by sqrt(targetRatio) scales the total vertex or
			// fragment count by approximately targetRatio. (Previously this sqrt was
			// computed separately, and identically, in both branches.)
			const float targetRatioSqrt = deFloatSqrt(targetRatio);

			if (m_caseType == CASETYPE_VERTEX)
			{
				m_gridSizeX = (int)(targetRatioSqrt * (float)m_gridSizeX);
				m_gridSizeY = (int)(targetRatioSqrt * (float)m_gridSizeY);
				TCU_CHECK_MSG(m_gridSizeX >= 1 && m_gridSizeY >= 1, "Can't decrease grid size enough to achieve low-enough draw times");
				log << TestLog::Message << "Note: triangle grid size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
			}
			else
			{
				m_viewportWidth  = (int)(targetRatioSqrt * (float)m_viewportWidth);
				m_viewportHeight = (int)(targetRatioSqrt * (float)m_viewportHeight);
				TCU_CHECK_MSG(m_viewportWidth >= 1 && m_viewportHeight >= 1, "Can't decrease viewport size enough to achieve low-enough draw times");
				log << TestLog::Message << "Note: viewport size reduced from original; it's now smaller than during calibration." << TestLog::EndMessage;
			}
		}
	}

	prepareProgram(0);

	// Log grid and viewport sizes.
	log << TestLog::Message << "Grid size: " << m_gridSizeX << "x" << m_gridSizeY << TestLog::EndMessage;
	log << TestLog::Message << "Viewport: " << m_viewportWidth << "x" << m_viewportHeight << TestLog::EndMessage;
}

// One step of the case's state machine:
//  1. While the Theil-Sen calibrator is unfinished, drive calibration (measure or
//     recompute its parameters) to settle on a per-frame draw call count.
//  2. Then collect frame-time measurements for the current workload record until
//     m_numMeasurementsPerWorkload samples are gathered, advancing rounds via
//     prepareNextRound().
//  3. Finally (STATE_REPORTING) fit segmented line estimators to the per-program
//     median frame times, derive the cost of a single shader operation, log all
//     samples/estimates and set the test result.
OperatorPerformanceCase::IterateResult OperatorPerformanceCase::iterate (void)
{
	const TheilSenCalibrator::State calibratorState = m_calibrator.getState();

	if (calibratorState != TheilSenCalibrator::STATE_FINISHED)
	{
		// Calibration phase: record a timing sample or let the calibrator adjust itself.
		if (calibratorState == TheilSenCalibrator::STATE_RECOMPUTE_PARAMS)
			m_calibrator.recomputeParameters();
		else if (calibratorState == TheilSenCalibrator::STATE_MEASURE)
			m_calibrator.recordIteration(renderAndMeasure(m_calibrator.getCallCount()));
		else
			DE_ASSERT(false);

		if (m_calibrator.getState() == TheilSenCalibrator::STATE_FINISHED)
		{
			// Calibration just completed: log it, possibly shrink grid/viewport, start
			// the first measurement round, and save the calibrated call count so later
			// cases can start from it.
			logCalibrationInfo(m_testCtx.getLog(), m_calibrator);
			adjustAndLogGridAndViewport();
			prepareNextRound();
			m_initialCalibrationStorage->initialNumCalls = m_calibrator.getCallCount();
		}
	}
	else if (m_state == STATE_FIND_HIGH_WORKLOAD || m_state == STATE_MEASURING)
	{
		// Measurement phase: append frame times to the newest workload record of the
		// program currently being measured.
		if (m_workloadMeasurementNdx < m_numMeasurementsPerWorkload)
		{
			vector<WorkloadRecord>& records = m_state == STATE_FIND_HIGH_WORKLOAD ? m_workloadRecordsFindHigh[m_measureProgramNdx] : m_workloadRecords[m_measureProgramNdx];
			records.back().addFrameTime((float)renderAndMeasure(m_calibrator.getCallCount()));
			m_workloadMeasurementNdx++;
		}
		else
			prepareNextRound();
	}
	else
	{
		DE_ASSERT(m_state == STATE_REPORTING);

		TestLog&	log				= m_testCtx.getLog();
		const int	drawCallCount	= m_calibrator.getCallCount();

		{
			// Compute per-program estimators for measurements.
			vector<SegmentedEstimator> estimators;
			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
				estimators.push_back(computeSegmentedEstimator(getWorkloadMedianDataPoints(progNdx)));

			// Log measurements and their estimators for all programs.
			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
			{
				const SegmentedEstimator&	estimator	= estimators[progNdx];
				const string				progNdxStr	= de::toString(progNdx);
				// \note Sorted copy; the stored records themselves are left untouched.
				vector<WorkloadRecord>		records		= m_workloadRecords[progNdx];
				std::sort(records.begin(), records.end());

				{
					const tcu::ScopedLogSection section(log,
														"Program" + progNdxStr + "Measurements",
														"Measurements for program " + progNdxStr);

					// Sample list of individual frame times.

					log << TestLog::SampleList("Program" + progNdxStr + "IndividualFrameTimes", "Individual frame times")
						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",	"Workload",		"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
											   << TestLog::ValueInfo("FrameTime",	"Frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
						<< TestLog::EndSampleInfo;

					for (int i = 0; i < (int)records.size(); i++)
						for (int j = 0; j < (int)records[i].frameTimes.size(); j++)
							log << TestLog::Sample << records[i].workloadSize << records[i].frameTimes[j] << TestLog::EndSample;

					log << TestLog::EndSampleList;

					// Sample list of median frame times.

					log << TestLog::SampleList("Program" + progNdxStr + "MedianFrameTimes", "Median frame times")
						<< TestLog::SampleInfo << TestLog::ValueInfo("Workload",		"Workload",				"",		QP_SAMPLE_VALUE_TAG_PREDICTOR)
											   << TestLog::ValueInfo("MedianFrameTime",	"Median frame time",	"us",	QP_SAMPLE_VALUE_TAG_RESPONSE)
						<< TestLog::EndSampleInfo;

					for (int i = 0; i < (int)records.size(); i++)
						log << TestLog::Sample << records[i].workloadSize << records[i].getMedianTime() << TestLog::EndSample;

					log << TestLog::EndSampleList;

					log << TestLog::Float("Program" + progNdxStr + "WorkloadCostEstimate", "Workload cost estimate", "us / workload", QP_KEY_TAG_TIME, estimator.right.coefficient);

					if (estimator.pivotX > -std::numeric_limits<float>::infinity())
						log << TestLog::Message << "Note: the data points with x coordinate greater than or equal to " << estimator.pivotX
												<< " seem to form a rising line, and the rest of data points seem to form a near-horizontal line" << TestLog::EndMessage
							<< TestLog::Message << "Note: the left line is estimated to be " << lineParamsString(estimator.left)
												<< " and the right line " << lineParamsString(estimator.right) << TestLog::EndMessage;
					else
						log << TestLog::Message << "Note: the data seem to form a single line: " << lineParamsString(estimator.right) << TestLog::EndMessage;
				}
			}

			// A non-positive slope would mean larger workloads didn't cost more time,
			// making the per-operation estimate meaningless.
			for (int progNdx = 0; progNdx < (int)m_programs.size(); progNdx++)
			{
				if (estimators[progNdx].right.coefficient <= 0.0f)
				{
					log << TestLog::Message << "Slope of measurements for program " << progNdx << " isn't positive. Can't get sensible result." << TestLog::EndMessage;
					MEASUREMENT_FAIL();
				}
			}

			// \note For each estimator, .right.coefficient is the increase in draw time (in microseconds) when
			// incrementing shader workload size by 1, when D draw calls are done, with a vertex/fragment count
			// of R.
			//
			// The measurements of any single program can't tell us the final result (time of single operation),
			// so we use computeSingleOperationTime to compute it from multiple programs' measurements in a
			// subclass-defined manner.
			//
			// After that, microseconds per operation can be calculated as singleOperationTime / (D * R).

			{
				vector<float>	perProgramSlopes;
				for (int i = 0; i < (int)m_programs.size(); i++)
					perProgramSlopes.push_back(estimators[i].right.coefficient);

				logSingleOperationCalculationInfo();

				const float		maxSlope				= *std::max_element(perProgramSlopes.begin(), perProgramSlopes.end());
				const float		usecsPerFramePerOp		= computeSingleOperationTime(perProgramSlopes);
				const int		vertexOrFragmentCount	= m_caseType == CASETYPE_VERTEX ?
															getNumVertices(m_gridSizeX, m_gridSizeY) :
															m_viewportWidth*m_viewportHeight;
				const double	usecsPerDrawCallPerOp	= usecsPerFramePerOp / (double)drawCallCount;
				const double	usecsPerSingleOp		= usecsPerDrawCallPerOp / (double)vertexOrFragmentCount;
				const double	megaOpsPerSecond		= (double)(drawCallCount*vertexOrFragmentCount) / usecsPerFramePerOp;
				const int		numFreeOps				= de::max(0, (int)deFloatFloor(intersectionX(estimators[0].left,
																									 LineParameters(estimators[0].right.offset,
																													usecsPerFramePerOp))));

				log << TestLog::Integer("VertexOrFragmentCount",
										"R = " + string(m_caseType == CASETYPE_VERTEX ? "Vertex" : "Fragment") + " count",
										"", QP_KEY_TAG_NONE, vertexOrFragmentCount)

					<< TestLog::Integer("DrawCallsPerFrame", "D = Draw calls per frame", "", QP_KEY_TAG_NONE, drawCallCount)

					<< TestLog::Integer("VerticesOrFragmentsPerFrame",
										"R*D = " + string(m_caseType == CASETYPE_VERTEX ? "Vertices" : "Fragments") + " per frame",
										"", QP_KEY_TAG_NONE, vertexOrFragmentCount*drawCallCount)

					<< TestLog::Float("TimePerFramePerOp",
									  "Estimated cost of R*D " + string(m_caseType == CASETYPE_VERTEX ? "vertices" : "fragments")
									  + " (i.e. one frame) with one shader operation",
									  "us", QP_KEY_TAG_TIME, (float)usecsPerFramePerOp)

					<< TestLog::Float("TimePerDrawcallPerOp",
									  "Estimated cost of one draw call with one shader operation",
									  "us", QP_KEY_TAG_TIME, (float)usecsPerDrawCallPerOp)

					<< TestLog::Float("TimePerSingleOp",
									  "Estimated cost of a single shader operation",
									  "us", QP_KEY_TAG_TIME, (float)usecsPerSingleOp);

				// \note Sometimes, when the operation is free or very cheap, it can happen that the shader with the operation runs,
				//		 for some reason, a bit faster than the shader without the operation, and thus we get a negative result. The
				//		 following threshold values for accepting a negative or almost-zero result are rather quick and dirty.
				if (usecsPerFramePerOp <= -0.1f*maxSlope)
				{
					log << TestLog::Message << "Got strongly negative result." << TestLog::EndMessage;
					MEASUREMENT_FAIL();
				}
				else if (usecsPerFramePerOp <= 0.001*maxSlope)
				{
					log << TestLog::Message << "Cost of operation seems to be approximately zero." << TestLog::EndMessage;
					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, "Pass");
				}
				else
				{
					log << TestLog::Float("OpsPerSecond",
										  "Operations per second",
										  "Million/s", QP_KEY_TAG_PERFORMANCE, (float)megaOpsPerSecond)

						<< TestLog::Integer("NumFreeOps",
											"Estimated number of \"free\" operations",
											"", QP_KEY_TAG_PERFORMANCE, numFreeOps);

					m_testCtx.setTestResult(QP_TEST_RESULT_PASS, de::floatToString((float)megaOpsPerSecond, 2).c_str());
				}

				m_state = STATE_FINISHED;
			}
		}

		return STOP;
	}

	return CONTINUE;
}

// Binary operator case.
// Measures the cost of a single binary operator (the GLSL token given in 'op') by
// comparing two programs whose inner loops perform different numbers of operator
// evaluations per iteration.
class BinaryOpCase : public OperatorPerformanceCase
{
public:
						BinaryOpCase				(Context& context, const char* name, const char* description, const char* op,
													 glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration);

protected:
	// OperatorPerformanceCase interface.
	vector<ProgramContext>	generateProgramData					(void) const;
	void					setGeneralUniforms					(deUint32 program) const;
	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
	void					logSingleOperationCalculationInfo	(void) const;

private:
	enum ProgramID
	{
		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
		PROGRAM_WITH_BIGGER_LOOP = 0,
		PROGRAM_WITH_SMALLER_LOOP,

		PROGRAM_LAST
	};

	ProgramContext			generateSingleProgramData		(ProgramID) const;

	const string			m_op;			// GLSL operator token, e.g. "+" or "*".
	const glu::DataType		m_type;			// Operand data type (float or int, scalar or vector).
	const glu::Precision	m_precision;
	const bool				m_useSwizzle;	// If true, shader inputs are read through swizzles.
};

// \note initialCalibration carries the starting draw-call-count estimate between cases;
//		 it is updated by OperatorPerformanceCase::iterate once calibration finishes.
BinaryOpCase::BinaryOpCase (Context& context, const char* name, const char* description, const char* op,
							glu::DataType type, glu::Precision precision, bool useSwizzle, bool isVertex, const InitialCalibrationStorage& initialCalibration)
	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
	, m_op						(op)
	, m_type					(type)
	, m_precision				(precision)
	, m_useSwizzle				(useSwizzle)
{
}

BinaryOpCase::ProgramContext BinaryOpCase::generateSingleProgramData (ProgramID programID) const
{
	DE_ASSERT(glu::isDataTypeFloatOrVec(m_type) || glu::isDataTypeIntOrIVec(m_type));

	const bool			isVertexCase	= m_caseType == CASETYPE_VERTEX;
	const char* const	precision		= glu::getPrecisionName(m_precision);
	const char* const	inputPrecision	= glu::isDataTypeIntOrIVec(m_type) && m_precision == glu::PRECISION_LOWP ? "mediump" : precision;
	const char* const	typeName		= getDataTypeName(m_type);

	std::ostringstream	vtx;
	std::ostringstream	frag;
	std::ostringstream&	op				= isVertexCase ? vtx : frag;

	// Attributes.
	vtx << "attribute highp vec4 a_position;\n";
	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
		vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";

	if (isVertexCase)
	{
		vtx << "varying mediump vec4 v_color;\n";
		frag << "varying mediump vec4 v_color;\n";
	}
	else
	{
		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
		{
			vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
			frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
		}
	}

	op << "uniform mediump int u_numLoopIterations;\n";
	if (isVertexCase)
		op << "uniform mediump float u_zero;\n";

	vtx << "\n";
	vtx << "void main()\n";
	vtx << "{\n";

	if (!isVertexCase)
		vtx << "\tgl_Position = a_position;\n";

	frag << "\n";
	frag << "void main()\n";
	frag << "{\n";

	// Expression inputs.
	const char* const prefix = isVertexCase ? "a_" : "v_";
	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
	{
		const int	inSize		= getDataTypeScalarSize(m_type);
		const bool	isInt		= de::inRange<int>(m_type, TYPE_INT, TYPE_INT_VEC4);
		const bool	cast		= isInt || (!m_useSwizzle && m_type != TYPE_FLOAT_VEC4);

		op << "\t" << precision << " " << typeName << " in" << i << " = ";

		if (cast)
			op << typeName << "(";

		op << prefix << "in" << i;

		if (m_useSwizzle)
			op << "." << s_swizzles[i % DE_LENGTH_OF_ARRAY(s_swizzles)][inSize-1];

		if (cast)
			op << ")";

		op << ";\n";
	}

	// Operation accumulation variables.
	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
	{
		op << "\t" << precision << " " << typeName << " acc" << i << "a" << " = in" << i+0 << ";\n";
		op << "\t" << precision << " " << typeName << " acc" << i << "b" << " = in" << i+1 << ";\n";
	}

	// Loop, with expressions in it.
	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
	op << "\t{\n";
	{
		const int unrollAmount = programID == PROGRAM_WITH_SMALLER_LOOP ? BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT : BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
		for (int unrollNdx = 0; unrollNdx < unrollAmount; unrollNdx++)
		{
			for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
			{
				if (i > 0 || unrollNdx > 0)
					op << "\n";
				op << "\t\tacc" << i << "a = acc" << i << "b " << m_op << " acc" << i << "a" << ";\n";
				op << "\t\tacc" << i << "b = acc" << i << "a " << m_op << " acc" << i << "b" << ";\n";
			}
		}
	}
	op << "\t}\n";
	op << "\n";

	// Result variable (sum of accumulation variables).
	op << "\t" << precision << " " << typeName << " res =";
	for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
		op << (i > 0 ? " "+m_op : "") << " acc" << i << "b";
	op << ";\n";

	// Convert to color.
	op << "\tmediump vec4 color = ";
	if (m_type == TYPE_FLOAT_VEC4)
		op << "res";
	else
	{
		int size = getDataTypeScalarSize(m_type);
		op << "vec4(res";

		for (int i = size; i < 4; i++)
			op << ", " << (i == 3 ? "1.0" : "0.0");

		op << ")";
	}
	op << ";\n";
	op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";

	if (isVertexCase)
	{
		vtx << "	gl_Position = a_position + u_zero*color;\n";
		frag << "	gl_FragColor = v_color;\n";
	}
	else
	{
		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
			vtx << "	v_in" << i << " = a_in" << i << ";\n";
	}

	vtx << "}\n";
	frag << "}\n";

	{
		vector<AttribSpec> attributes;
		for (int i = 0; i < BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS+1; i++)
			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
											Vec4(2.0f, 2.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
											Vec4(1.0f, 2.0f, 1.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
											Vec4(2.0f, 1.0f, 2.0f, 2.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
											Vec4(1.0f, 1.0f, 2.0f, 1.0f).swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4)));

		{
			string description = "This is the program with the ";

			description += programID == PROGRAM_WITH_SMALLER_LOOP	? "smaller"
						 : programID == PROGRAM_WITH_BIGGER_LOOP	? "bigger"
						 : DE_NULL;

			description += " loop.\n"
						   "Note: workload size for this program means the number of loop iterations.";

			return ProgramContext(vtx.str(), frag.str(), attributes, description);
		}
	}
}

// Produces one ProgramContext per ProgramID, in enum order.
vector<BinaryOpCase::ProgramContext> BinaryOpCase::generateProgramData (void) const
{
	vector<ProgramContext> result;
	result.reserve(PROGRAM_LAST);
	for (int programNdx = 0; programNdx < PROGRAM_LAST; programNdx++)
		result.push_back(generateSingleProgramData((ProgramID)programNdx));
	return result;
}

// Sets u_zero, the always-zero factor used to keep the computation live in vertex cases.
void BinaryOpCase::setGeneralUniforms (deUint32 program) const
{
	const glw::Functions&	gl	= m_renderCtx.getFunctions();
	const int				loc	= gl.getUniformLocation(program, "u_zero");
	gl.uniform1f(loc, 0.0f);
}

// Workload size for this case is the shader loop's iteration count.
void BinaryOpCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
{
	const glw::Functions&	gl	= m_renderCtx.getFunctions();
	const int				loc	= gl.getUniformLocation(program, "u_numLoopIterations");
	gl.uniform1i(loc, numLoopIterations);
}

// Derives the cost of a single operator evaluation from the two programs' per-iteration
// cost estimates: the programs differ only in per-iteration operation count, so the cost
// difference divided by the operation-count difference is the single-operation cost.
float BinaryOpCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
{
	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);

	// Each loop iteration does 2 ops per independent calculation, times the unroll amount.
	const int	opsPerIterSmall		= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
	const int	opsPerIterBig		= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
	DE_STATIC_ASSERT(opsPerIterBig > opsPerIterSmall);

	const float	costDiff			= perProgramOperationCosts[PROGRAM_WITH_BIGGER_LOOP] - perProgramOperationCosts[PROGRAM_WITH_SMALLER_LOOP];
	const int	opCountDiff			= opsPerIterBig - opsPerIterSmall;

	return costDiff / (float)opCountDiff;
}

// Logs how the single-operation cost is derived from the two programs' measurements.
void BinaryOpCase::logSingleOperationCalculationInfo (void) const
{
	const int	opsPerIterSmall	= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS * BINARY_OPERATOR_CASE_SMALL_PROGRAM_UNROLL_AMOUNT;
	const int	opsPerIterBig	= 2 * BINARY_OPERATOR_CASE_NUM_INDEPENDENT_CALCULATIONS * BINARY_OPERATOR_CASE_BIG_PROGRAM_UNROLL_AMOUNT;
	const int	opDiff			= opsPerIterBig - opsPerIterSmall;
	const char*	opName			= DE_NULL;

	// Map the operator token to a human-readable name for the log.
	if (m_op == "+")
		opName = "addition";
	else if (m_op == "-")
		opName = "subtraction";
	else if (m_op == "*")
		opName = "multiplication";
	else if (m_op == "/")
		opName = "division";

	DE_ASSERT(opName != DE_NULL);

	m_testCtx.getLog() << TestLog::Message << "Note: the bigger program contains " << opDiff << " more "
										   << opName << " operations in one loop iteration than the small program; "
										   << "cost of one operation is calculated as (cost_of_bigger_workload - cost_of_smaller_workload) / " << opDiff
										   << TestLog::EndMessage;
}

// Built-in function case.
// Measures the cost of a single built-in function call by comparing two otherwise
// identical programs: one that calls the function inside its loop and one that
// substitutes a constant for the call.
class FunctionCase : public OperatorPerformanceCase
{
public:
	enum
	{
		MAX_PARAMS = 3
	};

						FunctionCase			(Context&							context,
												 const char*						name,
												 const char*						description,
												 const char*						func,
												 glu::DataType						returnType,
												 const glu::DataType				paramTypes[MAX_PARAMS],
												 const Vec4&						attribute,
												 int								modifyParamNdx, //!< Add a compile-time constant (2.0) to the parameter at this index. This is ignored if negative.
												 bool								useNearlyConstantInputs, //!< Function inputs shouldn't be much bigger than 'attribute'. \note Name fixed to match the definition (was "useNearlyConstantINputs").
												 glu::Precision						precision,
												 bool								isVertex,
												 const InitialCalibrationStorage&	initialCalibration);

protected:
	vector<ProgramContext>	generateProgramData					(void) const;
	void					setGeneralUniforms					(deUint32 program) const;
	void					setWorkloadSizeUniform				(deUint32 program, int numOperations) const;
	float					computeSingleOperationTime			(const vector<float>& perProgramOperationCosts) const;
	void					logSingleOperationCalculationInfo	(void) const;

private:
	enum ProgramID
	{
		// \note 0-based sequential numbering is relevant, because these are also used as vector indices.
		// \note The first program should be the heaviest, because OperatorPerformanceCase uses it to reduce grid/viewport size when going too slow.
		PROGRAM_WITH_FUNCTION_CALLS = 0,
		PROGRAM_WITHOUT_FUNCTION_CALLS,

		PROGRAM_LAST
	};

	//! Forms a "sum" expression from aExpr and bExpr; for booleans, this is "equal(a,b)", otherwise actual sum.
	static string		sumExpr						(const string& aExpr, const string& bExpr, glu::DataType type);
	//! Forms an expression used to increment an input value in the shader. If type is boolean, this is just
	//! baseExpr; otherwise, baseExpr is modified by multiplication or division by a loop index,
	//! to prevent simple compiler optimizations. See m_useNearlyConstantInputs for more explanation.
	static string		incrementExpr				(const string& baseExpr, glu::DataType type, bool divide);

	ProgramContext		generateSingleProgramData	(ProgramID) const;

	const string			m_func;
	const glu::DataType		m_returnType;
	glu::DataType			m_paramTypes[MAX_PARAMS];
	// \note m_modifyParamNdx, if not negative, specifies the index of the parameter to which a
	//		 compile-time constant (2.0) is added. This is a quick and dirty way to deal with
	//		 functions like clamp or smoothstep that require that a certain parameter is
	//		 greater than a certain other parameter.
	const int				m_modifyParamNdx;
	// \note m_useNearlyConstantInputs determines whether the inputs given to the function
	//		 should increase (w.r.t m_attribute) only by very small amounts. This is relevant
	//		 for functions like asin, which requires its inputs to be in a specific range.
	//		 In practice, this affects whether expressions used to increment the input
	//		 variables use division instead of multiplication; normally, multiplication is used,
	//		 but it's hard to keep the increments very small that way, and division shouldn't
	//		 be the default, since for many functions (probably not asin, luckily), division
	//		 is too heavy and dominates time-wise.
	const bool				m_useNearlyConstantInputs;
	const Vec4				m_attribute;
	const glu::Precision	m_precision;
};

// \note All MAX_PARAMS entries of paramTypes are copied as-is; unused trailing slots
//		 are expected to hold TYPE_INVALID (see generateSingleProgramData's numParams
//		 computation).
FunctionCase::FunctionCase (Context&							context,
							const char*							name,
							const char*							description,
							const char*							func,
							glu::DataType						returnType,
							const glu::DataType					paramTypes[MAX_PARAMS],
							const Vec4&							attribute,
							int									modifyParamNdx,
							bool								useNearlyConstantInputs,
							glu::Precision						precision,
							bool								isVertex,
							const InitialCalibrationStorage&	initialCalibration)
	: OperatorPerformanceCase	(context.getTestContext(), context.getRenderContext(), name, description,
								 isVertex ? CASETYPE_VERTEX : CASETYPE_FRAGMENT, NUM_WORKLOADS, initialCalibration)
	, m_func					(func)
	, m_returnType				(returnType)
	, m_modifyParamNdx			(modifyParamNdx)
	, m_useNearlyConstantInputs	(useNearlyConstantInputs)
	, m_attribute				(attribute)
	, m_precision				(precision)
{
	for (int i = 0; i < MAX_PARAMS; i++)
		m_paramTypes[i] = paramTypes[i];
}

// Combines two shader expressions into a "sum". Booleans have no '+', so component-wise
// (in)equality is used as the combining operation for them instead.
string FunctionCase::sumExpr (const string& aExpr, const string& bExpr, glu::DataType type)
{
	if (!glu::isDataTypeBoolOrBVec(type))
		return "(" + aExpr + " + " + bExpr + ")";

	return type == glu::TYPE_BOOL
		 ? "(" + aExpr + " == " + bExpr + ")"
		 : "equal(" + aExpr + ", " + bExpr + ")";
}

// Wraps baseExpr in a multiplication or division by the loop index so the compiler
// can't treat it as loop-invariant; booleans are returned unchanged.
string FunctionCase::incrementExpr (const string& baseExpr, glu::DataType type, bool divide)
{
	if (glu::isDataTypeBoolOrBVec(type))
		return baseExpr;

	const string scaleOp = divide ? "/" : "*";

	if (glu::isDataTypeIntOrIVec(type))
		return "(" + baseExpr + scaleOp + "(i+1))";

	return "(" + baseExpr + scaleOp + "float(i+1))";
}

FunctionCase::ProgramContext FunctionCase::generateSingleProgramData (ProgramID programID) const
{
	const bool			isVertexCase			= m_caseType == CASETYPE_VERTEX;
	const char* const	precision				= glu::getPrecisionName(m_precision);
	const char* const	returnTypeName			= getDataTypeName(m_returnType);
	const string		returnPrecisionMaybe	= glu::isDataTypeBoolOrBVec(m_returnType) ? "" : string() + precision + " ";
	const char*			inputPrecision			= DE_NULL;
	const bool			isMatrixReturn			= isDataTypeMatrix(m_returnType);
	int					numParams				= 0;
	const char*			paramTypeNames[MAX_PARAMS];
	string				paramPrecisionsMaybe[MAX_PARAMS];

	for (int i = 0; i < MAX_PARAMS; i++)
	{
		paramTypeNames[i]			= getDataTypeName(m_paramTypes[i]);
		paramPrecisionsMaybe[i]		= glu::isDataTypeBoolOrBVec(m_paramTypes[i]) ? "" : string() + precision + " ";

		if (inputPrecision == DE_NULL && isDataTypeIntOrIVec(m_paramTypes[i]) && m_precision == glu::PRECISION_LOWP)
			inputPrecision = "mediump";

		if (m_paramTypes[i] != TYPE_INVALID)
			numParams = i+1;
	}

	DE_ASSERT(numParams > 0);

	if (inputPrecision == DE_NULL)
		inputPrecision = precision;

	int						numAttributes	= FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS + numParams - 1;
	std::ostringstream		vtx;
	std::ostringstream		frag;
	std::ostringstream&		op				= isVertexCase ? vtx : frag;

	// Attributes.
	vtx << "attribute highp vec4 a_position;\n";
	for (int i = 0; i < numAttributes; i++)
		vtx << "attribute " << inputPrecision << " vec4 a_in" << i << ";\n";

	if (isVertexCase)
	{
		vtx << "varying mediump vec4 v_color;\n";
		frag << "varying mediump vec4 v_color;\n";
	}
	else
	{
		for (int i = 0; i < numAttributes; i++)
		{
			vtx << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
			frag << "varying " << inputPrecision << " vec4 v_in" << i << ";\n";
		}
	}

	op << "uniform mediump int u_numLoopIterations;\n";
	if (isVertexCase)
		op << "uniform mediump float u_zero;\n";

	for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
		op << "uniform " << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " u_inc" << (char)('A'+paramNdx) << ";\n";

	vtx << "\n";
	vtx << "void main()\n";
	vtx << "{\n";

	if (!isVertexCase)
		vtx << "\tgl_Position = a_position;\n";

	frag << "\n";
	frag << "void main()\n";
	frag << "{\n";

	// Function call input and return value accumulation variables.
	{
		const char* const inPrefix = isVertexCase ? "a_" : "v_";

		for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
		{
			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
			{
				const glu::DataType		paramType	= m_paramTypes[paramNdx];
				const bool				mustCast	= paramType != glu::TYPE_FLOAT_VEC4;

				op << "\t" << paramPrecisionsMaybe[paramNdx] << paramTypeNames[paramNdx] << " in" << calcNdx << (char)('a'+paramNdx) << " = ";

				if (mustCast)
					op << paramTypeNames[paramNdx] << "(";

				if (glu::isDataTypeMatrix(paramType))
				{
					static const char* const	swizzles[3]		= { "x", "xy", "xyz" };
					const int					numRows			= glu::getDataTypeMatrixNumRows(paramType);
					const int					numCols			= glu::getDataTypeMatrixNumColumns(paramType);
					const string				swizzle			= numRows < 4 ? string() + "." + swizzles[numRows-1] : "";

					for (int i = 0; i < numCols; i++)
						op << (i > 0 ? ", " : "") << inPrefix << "in" << calcNdx+paramNdx << swizzle;
				}
				else
				{
					op << inPrefix << "in" << calcNdx+paramNdx;

					if (paramNdx == m_modifyParamNdx)
					{
						DE_ASSERT(glu::isDataTypeFloatOrVec(paramType));
						op << " + 2.0";
					}
				}

				if (mustCast)
					op << ")";

				op << ";\n";
			}

			op << "\t" << returnPrecisionMaybe << returnTypeName << " res" << calcNdx << " = " << returnTypeName << "(0);\n";
		}
	}

	// Loop with expressions in it.
	op << "\tfor (int i = 0; i < u_numLoopIterations; i++)\n";
	op << "\t{\n";
	for (int calcNdx = 0; calcNdx < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; calcNdx++)
	{
		if (calcNdx > 0)
			op << "\n";

		op << "\t\t{\n";

		for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
		{
			const string inputName	= "in" + de::toString(calcNdx) + (char)('a'+inputNdx);
			const string incName	= string() + "u_inc" + (char)('A'+inputNdx);
			const string incExpr	= incrementExpr(incName, m_paramTypes[inputNdx], m_useNearlyConstantInputs);

			op << "\t\t\t" << inputName << " = " << sumExpr(inputName, incExpr, m_paramTypes[inputNdx]) << ";\n";
		}

		op << "\t\t\t" << returnPrecisionMaybe << returnTypeName << " eval" << calcNdx << " = ";

		if (programID == PROGRAM_WITH_FUNCTION_CALLS)
		{
			op << m_func << "(";

			for (int paramNdx = 0; paramNdx < numParams; paramNdx++)
			{
				if (paramNdx > 0)
					op << ", ";

				op << "in" << calcNdx << (char)('a'+paramNdx);
			}

			op << ")";
		}
		else
		{
			DE_ASSERT(programID == PROGRAM_WITHOUT_FUNCTION_CALLS);
			op << returnTypeName << "(1)";
		}

		op << ";\n";

		{
			const string resName	= "res" + de::toString(calcNdx);
			const string evalName	= "eval" + de::toString(calcNdx);
			const string incExpr	= incrementExpr(evalName, m_returnType, m_useNearlyConstantInputs);

			op << "\t\t\tres" << calcNdx << " = " << sumExpr(resName, incExpr, m_returnType) << ";\n";
		}

		op << "\t\t}\n";
	}
	op << "\t}\n";
	op << "\n";

	// Result variables.
	for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
	{
		op << "\t" << paramPrecisionsMaybe[inputNdx] << paramTypeNames[inputNdx] << " sumIn" << (char)('A'+inputNdx) << " = ";
		{
			string expr = string() + "in0" + (char)('a'+inputNdx);
			for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
				expr = sumExpr(expr, string() + "in" + de::toString(i) + (char)('a'+inputNdx), m_paramTypes[inputNdx]);
			op << expr;
		}
		op << ";\n";
	}

	op << "\t" << returnPrecisionMaybe << returnTypeName << " sumRes = ";
	{
		string expr = "res0";
		for (int i = 1; i < FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS; i++)
			expr = sumExpr(expr, "res" + de::toString(i), m_returnType);
		op << expr;
	}
	op << ";\n";

	{
		glu::DataType finalResultDataType = glu::TYPE_LAST;

		if (glu::isDataTypeMatrix(m_returnType))
		{
			finalResultDataType = m_returnType;

			op << "\t" << precision << " " << returnTypeName << " finalRes = ";

			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
			{
				DE_ASSERT(m_paramTypes[inputNdx] == m_returnType);
				op << "sumIn" << (char)('A'+inputNdx) << " + ";
			}
			op << "sumRes;\n";
		}
		else
		{
			int numFinalResComponents = glu::getDataTypeScalarSize(m_returnType);
			for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
				numFinalResComponents = de::max(numFinalResComponents, glu::getDataTypeScalarSize(m_paramTypes[inputNdx]));

			finalResultDataType = getDataTypeFloatOrVec(numFinalResComponents);

			{
				const string finalResType = glu::getDataTypeName(finalResultDataType);
				op << "\t" << precision << " " << finalResType << " finalRes = ";
				for (int inputNdx = 0; inputNdx < numParams; inputNdx++)
					op << finalResType << "(sumIn" << (char)('A'+inputNdx) << ") + ";
				op << finalResType << "(sumRes);\n";
			}
		}

		// Convert to color.
		op << "\tmediump vec4 color = ";
		if (finalResultDataType == TYPE_FLOAT_VEC4)
			op << "finalRes";
		else
		{
			int size = isMatrixReturn ? getDataTypeMatrixNumRows(finalResultDataType) : getDataTypeScalarSize(finalResultDataType);

			op << "vec4(";

			if (isMatrixReturn)
			{
				for (int i = 0; i < getDataTypeMatrixNumColumns(finalResultDataType); i++)
				{
					if (i > 0)
						op << " + ";
					op << "finalRes[" << i << "]";
				}
			}
			else
				op << "finalRes";

			for (int i = size; i < 4; i++)
				op << ", " << (i == 3 ? "1.0" : "0.0");

			op << ")";
		}
		op << ";\n";
		op << "\t" << (isVertexCase ? "v_color" : "gl_FragColor") << " = color;\n";

		if (isVertexCase)
		{
			vtx << "	gl_Position = a_position + u_zero*color;\n";
			frag << "	gl_FragColor = v_color;\n";
		}
		else
		{
			for (int i = 0; i < numAttributes; i++)
				vtx << "	v_in" << i << " = a_in" << i << ";\n";
		}

		vtx << "}\n";
		frag << "}\n";
	}

	{
		vector<AttribSpec> attributes;
		for (int i = 0; i < numAttributes; i++)
			attributes.push_back(AttribSpec(("a_in" + de::toString(i)).c_str(),
											m_attribute.swizzle((i+0)%4, (i+1)%4, (i+2)%4, (i+3)%4),
											m_attribute.swizzle((i+1)%4, (i+2)%4, (i+3)%4, (i+0)%4),
											m_attribute.swizzle((i+2)%4, (i+3)%4, (i+0)%4, (i+1)%4),
											m_attribute.swizzle((i+3)%4, (i+0)%4, (i+1)%4, (i+2)%4)));

		{
			string description = "This is the program ";

			description += programID == PROGRAM_WITHOUT_FUNCTION_CALLS	? "without"
						 : programID == PROGRAM_WITH_FUNCTION_CALLS		? "with"
						 : DE_NULL;

			description += " '" + m_func + "' function calls.\n"
						   "Note: workload size for this program means the number of loop iterations.";

			return ProgramContext(vtx.str(), frag.str(), attributes, description);
		}
	}
}

// Produces one ProgramContext per ProgramID, in enum order.
vector<FunctionCase::ProgramContext> FunctionCase::generateProgramData (void) const
{
	vector<ProgramContext> result;
	result.reserve(PROGRAM_LAST);
	for (int programNdx = 0; programNdx < PROGRAM_LAST; programNdx++)
		result.push_back(generateSingleProgramData((ProgramID)programNdx));
	return result;
}

void FunctionCase::setGeneralUniforms (deUint32 program) const
{
	// Upload u_zero and the per-parameter increment uniforms (u_incA, u_incB, ...)
	// with small arbitrary values; the exact values only need to be deterministic.
	const glw::Functions& gl = m_renderCtx.getFunctions();

	gl.uniform1f(gl.getUniformLocation(program, "u_zero"), 0.0f);

	for (int ndx = 0; ndx < MAX_PARAMS; ndx++)
	{
		const glu::DataType type = m_paramTypes[ndx];

		if (type == glu::TYPE_INVALID)
			continue;

		const int numScalars	= glu::getDataTypeScalarSize(type);
		const int loc			= gl.getUniformLocation(program, (string() + "u_inc" + (char)('A'+ndx)).c_str());

		if (glu::isDataTypeFloatOrVec(type))
		{
			float buf[4];
			for (int i = 0; i < DE_LENGTH_OF_ARRAY(buf); i++)
				buf[i] = (float)ndx*0.01f + (float)i*0.001f; // Arbitrary small values.
			uniformNfv(gl, numScalars, loc, 1, &buf[0]);
		}
		else if (glu::isDataTypeIntOrIVec(type))
		{
			int buf[4];
			for (int i = 0; i < DE_LENGTH_OF_ARRAY(buf); i++)
				buf[i] = ndx*100 + i; // Arbitrary values.
			uniformNiv(gl, numScalars, loc, 1, &buf[0]);
		}
		else if (glu::isDataTypeBoolOrBVec(type))
		{
			int buf[4];
			for (int i = 0; i < DE_LENGTH_OF_ARRAY(buf); i++)
				buf[i] = (ndx >> i) & 1; // Arbitrary values.
			uniformNiv(gl, numScalars, loc, 1, &buf[0]);
		}
		else if (glu::isDataTypeMatrix(type))
		{
			// Only square matrices are expected here.
			const int matSize = glu::getDataTypeMatrixNumRows(type);
			DE_ASSERT(matSize == glu::getDataTypeMatrixNumColumns(type));
			float buf[4*4];
			for (int i = 0; i < DE_LENGTH_OF_ARRAY(buf); i++)
				buf[i] = (float)ndx*0.01f + (float)i*0.001f; // Arbitrary values.
			uniformMatrixNfv(gl, matSize, loc, 1, &buf[0]);
		}
		else
			DE_ASSERT(false);
	}
}

void FunctionCase::setWorkloadSizeUniform (deUint32 program, int numLoopIterations) const
{
	// Workload size for this case type means the shader's loop iteration count,
	// which is controlled by a single integer uniform.
	const glw::Functions& gl = m_renderCtx.getFunctions();

	gl.uniform1i(gl.getUniformLocation(program, "u_numLoopIterations"), numLoopIterations);
}

float FunctionCase::computeSingleOperationTime (const vector<float>& perProgramOperationCosts) const
{
	DE_ASSERT(perProgramOperationCosts.size() == PROGRAM_LAST);

	// The two measured programs differ only by FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS
	// calls to the tested function per loop iteration, so the cost of one call is the
	// cost difference divided by the number of calls.
	const float costWithCalls		= perProgramOperationCosts[PROGRAM_WITH_FUNCTION_CALLS];
	const float costWithoutCalls	= perProgramOperationCosts[PROGRAM_WITHOUT_FUNCTION_CALLS];

	return (costWithCalls - costWithoutCalls) / (float)FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;
}

void FunctionCase::logSingleOperationCalculationInfo (void) const
{
	const int numFunctionCalls = FUNCTION_CASE_NUM_INDEPENDENT_CALCULATIONS;

	m_testCtx.getLog() << TestLog::Message << "Note: program " << (int)PROGRAM_WITH_FUNCTION_CALLS << " contains "
										   << numFunctionCalls << " calls to '" << m_func << "' in one loop iteration; "
										   << "cost of one operation is calculated as "
										   << "(cost_of_workload_with_calls - cost_of_workload_without_calls) / " << numFunctionCalls << TestLog::EndMessage;
}

} // anonymous

// Root group for all shader operator performance tests; the child case groups are built in init().
ShaderOperatorTests::ShaderOperatorTests (Context& context)
	: TestCaseGroup(context, "operator", "Operator Performance Tests")
{
}

// Child cases are owned and destroyed by the TestCaseGroup base class.
ShaderOperatorTests::~ShaderOperatorTests (void)
{
}

// Builds the full case tree: binary operator cases for every (operator, shader stage,
// type, precision) combination, followed by built-in function cases driven by the
// functionCaseGroups table below.
void ShaderOperatorTests::init (void)
{
	// Binary operator cases

	static const DataType binaryOpTypes[] =
	{
		TYPE_FLOAT,
		TYPE_FLOAT_VEC2,
		TYPE_FLOAT_VEC3,
		TYPE_FLOAT_VEC4,
		TYPE_INT,
		TYPE_INT_VEC2,
		TYPE_INT_VEC3,
		TYPE_INT_VEC4,
	};
	static const Precision precisions[] =
	{
		PRECISION_LOWP,
		PRECISION_MEDIUMP,
		PRECISION_HIGHP
	};
	static const struct
	{
		const char*		name;
		const char*		op;
		bool			swizzle;
	} binaryOps[] =
	{
		{ "add",		"+",		false	},
		{ "sub",		"-",		true	},
		{ "mul",		"*",		false	},
		{ "div",		"/",		true	}
	};

	tcu::TestCaseGroup* const binaryOpsGroup = new tcu::TestCaseGroup(m_testCtx, "binary_operator", "Binary Operator Performance Tests");
	addChild(binaryOpsGroup);

	for (int opNdx = 0; opNdx < DE_LENGTH_OF_ARRAY(binaryOps); opNdx++)
	{
		tcu::TestCaseGroup* const opGroup = new tcu::TestCaseGroup(m_testCtx, binaryOps[opNdx].name, "");
		binaryOpsGroup->addChild(opGroup);

		// Generate both the vertex-shader and fragment-shader variant of each case.
		for (int isFrag = 0; isFrag <= 1; isFrag++)
		{
			// One shared calibration storage per shader group, so cases within the group can reuse calibration results.
			const BinaryOpCase::InitialCalibrationStorage	shaderGroupCalibrationStorage	(new BinaryOpCase::InitialCalibration);
			const bool										isVertex						= isFrag == 0;
			tcu::TestCaseGroup* const						shaderGroup						= new tcu::TestCaseGroup(m_testCtx, isVertex ? "vertex" : "fragment", "");
			opGroup->addChild(shaderGroup);

			for (int typeNdx = 0; typeNdx < DE_LENGTH_OF_ARRAY(binaryOpTypes); typeNdx++)
			{
				for (int precNdx = 0; precNdx < DE_LENGTH_OF_ARRAY(precisions); precNdx++)
				{
					const DataType		type			= binaryOpTypes[typeNdx];
					const Precision		precision		= precisions[precNdx];
					const char* const	op				= binaryOps[opNdx].op;
					const bool			useSwizzle		= binaryOps[opNdx].swizzle;
					std::ostringstream	name;

					name << getPrecisionName(precision) << "_" << getDataTypeName(type);

					shaderGroup->addChild(new BinaryOpCase(m_context, name.str().c_str(), "", op, type, precision, useSwizzle, isVertex, shaderGroupCalibrationStorage));
				}
			}
		}
	}

	// Built-in function cases.

	// Non-specific (i.e. includes gentypes) parameter types for the functions.
	enum ValueType
	{
		VALUE_NONE			= 0,
		VALUE_FLOAT			= (1<<0),	// float scalar
		VALUE_FLOAT_VEC		= (1<<1),	// float vector
		VALUE_FLOAT_VEC34	= (1<<2),	// float vector of size 3 or 4
		VALUE_FLOAT_GENTYPE	= (1<<3),	// float scalar/vector
		VALUE_VEC3			= (1<<4),	// vec3 only
		VALUE_VEC4			= (1<<5),	// vec4 only
		VALUE_MATRIX		= (1<<6),	// matrix
		VALUE_BOOL			= (1<<7),	// boolean scalar
		VALUE_BOOL_VEC		= (1<<8),	// boolean vector
		VALUE_BOOL_GENTYPE	= (1<<9),	// boolean scalar/vector
		VALUE_INT			= (1<<10),	// int scalar
		VALUE_INT_VEC		= (1<<11),	// int vector
		VALUE_INT_GENTYPE	= (1<<12),	// int scalar/vector

		// Shorthands.
		N				= VALUE_NONE,
		F				= VALUE_FLOAT,
		FV				= VALUE_FLOAT_VEC,
		VL				= VALUE_FLOAT_VEC34, // L for "large"
		GT				= VALUE_FLOAT_GENTYPE,
		V3				= VALUE_VEC3,
		V4				= VALUE_VEC4,
		M				= VALUE_MATRIX,
		B				= VALUE_BOOL,
		BV				= VALUE_BOOL_VEC,
		BGT				= VALUE_BOOL_GENTYPE,
		I				= VALUE_INT,
		IV				= VALUE_INT_VEC,
		IGT				= VALUE_INT_GENTYPE,

		// Union masks used when mapping a ValueType to a concrete DataType below.
		VALUE_ANY_FLOAT			= VALUE_FLOAT		|	VALUE_FLOAT_VEC		|	VALUE_FLOAT_GENTYPE	| VALUE_VEC3 | VALUE_VEC4 | VALUE_FLOAT_VEC34,
		VALUE_ANY_INT			= VALUE_INT			|	VALUE_INT_VEC		|	VALUE_INT_GENTYPE,
		VALUE_ANY_BOOL			= VALUE_BOOL		|	VALUE_BOOL_VEC		|	VALUE_BOOL_GENTYPE,

		VALUE_ANY_GENTYPE		= VALUE_FLOAT_VEC	|	VALUE_FLOAT_GENTYPE	|	VALUE_FLOAT_VEC34	|
								  VALUE_BOOL_VEC	|	VALUE_BOOL_GENTYPE	|
								  VALUE_INT_VEC		|	VALUE_INT_GENTYPE	|
								  VALUE_MATRIX
	};
	enum PrecisionMask
	{
		PRECMASK_NA				= 0,						//!< Precision not applicable (booleans)
		PRECMASK_LOWP			= (1<<PRECISION_LOWP),
		PRECMASK_MEDIUMP		= (1<<PRECISION_MEDIUMP),
		PRECMASK_HIGHP			= (1<<PRECISION_HIGHP),

		PRECMASK_MEDIUMP_HIGHP	= (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP),
		PRECMASK_ALL			= (1<<PRECISION_LOWP) | (1<<PRECISION_MEDIUMP) | (1<<PRECISION_HIGHP)
	};

	// Concrete type lookup tables, indexed by scalar/vector size (size-1, or size-2 for matrices).
	static const DataType floatTypes[] =
	{
		TYPE_FLOAT,
		TYPE_FLOAT_VEC2,
		TYPE_FLOAT_VEC3,
		TYPE_FLOAT_VEC4
	};
	static const DataType intTypes[] =
	{
		TYPE_INT,
		TYPE_INT_VEC2,
		TYPE_INT_VEC3,
		TYPE_INT_VEC4
	};
	static const DataType boolTypes[] =
	{
		TYPE_BOOL,
		TYPE_BOOL_VEC2,
		TYPE_BOOL_VEC3,
		TYPE_BOOL_VEC4
	};
	static const DataType matrixTypes[] =
	{
		TYPE_FLOAT_MAT2,
		TYPE_FLOAT_MAT3,
		TYPE_FLOAT_MAT4
	};

	tcu::TestCaseGroup* const angleAndTrigonometryGroup		= new tcu::TestCaseGroup(m_testCtx, "angle_and_trigonometry",	"Built-In Angle and Trigonometry Function Performance Tests");
	tcu::TestCaseGroup* const exponentialGroup				= new tcu::TestCaseGroup(m_testCtx, "exponential",				"Built-In Exponential Function Performance Tests");
	tcu::TestCaseGroup* const commonFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "common_functions",			"Built-In Common Function Performance Tests");
	tcu::TestCaseGroup* const geometricFunctionsGroup		= new tcu::TestCaseGroup(m_testCtx, "geometric",				"Built-In Geometric Function Performance Tests");
	tcu::TestCaseGroup* const matrixFunctionsGroup			= new tcu::TestCaseGroup(m_testCtx, "matrix",					"Built-In Matrix Function Performance Tests");
	tcu::TestCaseGroup* const floatCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "float_compare",			"Built-In Floating Point Comparison Function Performance Tests");
	tcu::TestCaseGroup* const intCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "int_compare",				"Built-In Integer Comparison Function Performance Tests");
	tcu::TestCaseGroup* const boolCompareGroup				= new tcu::TestCaseGroup(m_testCtx, "bool_compare",				"Built-In Boolean Comparison Function Performance Tests");

	addChild(angleAndTrigonometryGroup);
	addChild(exponentialGroup);
	addChild(commonFunctionsGroup);
	addChild(geometricFunctionsGroup);
	addChild(matrixFunctionsGroup);
	addChild(floatCompareGroup);
	addChild(intCompareGroup);
	addChild(boolCompareGroup);

	// Some attributes to be used as parameters for the functions.
	const Vec4 attrPos		= Vec4( 2.3f,  1.9f,  0.8f,  0.7f);
	const Vec4 attrNegPos	= Vec4(-1.3f,  2.5f, -3.5f,	 4.3f);
	const Vec4 attrSmall	= Vec4(-0.9f,  0.8f, -0.4f,	 0.2f);

	// Function name, return type and parameter type information; also, what attribute should be used in the test.
	// \note Different versions of the same function (i.e. with the same group name) can be defined by putting them successively in this array.
	// \note In order to reduce case count and thus total execution time, we don't test all input type combinations for every function.
	static const struct
	{
		tcu::TestCaseGroup*					parentGroup;
		const char*							groupName;
		const char*							func;
		const ValueType						types[FunctionCase::MAX_PARAMS + 1]; // Return type and parameter types, in that order.
		const Vec4&							attribute;
		int									modifyParamNdx;
		bool								useNearlyConstantInputs;
		bool								booleanCase;
		PrecisionMask						precMask;
	} functionCaseGroups[] =
	{
		{ angleAndTrigonometryGroup,	"radians",			"radians",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"degrees",			"degrees",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"sin",				"sin",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"cos",				"cos",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"tan",				"tan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"asin",				"asin",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"acos",				"acos",				{ F,  F,  N,  N  }, attrSmall,		-1, true,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"atan2",			"atan",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ angleAndTrigonometryGroup,	"atan",				"atan",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},

		{ exponentialGroup,				"pow",				"pow",				{ F,  F,  F,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"exp",				"exp",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"log",				"log",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"exp2",				"exp2",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"log2",				"log2",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"sqrt",				"sqrt",				{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},
		{ exponentialGroup,				"inversesqrt",		"inversesqrt",		{ F,  F,  N,  N  }, attrPos,		-1, false,	false,	PRECMASK_ALL			},

		{ commonFunctionsGroup,			"abs",				"abs",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"abs",				"abs",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"sign",				"sign",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"sign",				"sign",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"floor",			"floor",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"floor",			"floor",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"ceil",				"ceil",				{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"ceil",				"ceil",				{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"fract",			"fract",			{ F,  F,  N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"fract",			"fract",			{ V4, V4, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"mod",				"mod",				{ GT, GT, GT, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"min",				"min",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"min",				"min",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"max",				"max",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"max",				"max",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"clamp",			"clamp",			{ F,  F,  F,  F  }, attrSmall,		 2, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"clamp",			"clamp",			{ V4, V4, V4, V4 }, attrSmall,		 2, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"mix",				"mix",				{ F,  F,  F,  F  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"mix",				"mix",				{ V4, V4, V4, V4 }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"step",				"step",				{ F,  F,  F,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"step",				"step",				{ V4, V4, V4, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ F,  F,  F,  F  }, attrSmall,		 1, false,	false,	PRECMASK_MEDIUMP_HIGHP	},
		{ commonFunctionsGroup,			"smoothstep",		"smoothstep",		{ V4, V4, V4, V4 }, attrSmall,		 1, false,	false,	PRECMASK_ALL			},

		{ geometricFunctionsGroup,		"length",			"length",			{ F,  VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"distance",			"distance",			{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"dot",				"dot",				{ F,  VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"cross",			"cross",			{ V3, V3, V3, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"normalize",		"normalize",		{ VL, VL, N,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"faceforward",		"faceforward",		{ VL, VL, VL, VL }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"reflect",			"reflect",			{ VL, VL, VL, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ geometricFunctionsGroup,		"refract",			"refract",			{ VL, VL, VL, F  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},

		{ matrixFunctionsGroup,			"matrixCompMult",	"matrixCompMult",	{ M,  M,  M,  N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},

		{ floatCompareGroup,			"lessThan",			"lessThan",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ floatCompareGroup,			"lessThanEqual",	"lessThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ floatCompareGroup,			"greaterThan",		"greaterThan",		{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ floatCompareGroup,			"greaterThanEqual",	"greaterThanEqual",	{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ floatCompareGroup,			"equal",			"equal",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ floatCompareGroup,			"notEqual",			"notEqual",			{ BV, FV, FV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},

		{ intCompareGroup,				"lessThan",			"lessThan",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ intCompareGroup,				"lessThanEqual",	"lessThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ intCompareGroup,				"greaterThan",		"greaterThan",		{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ intCompareGroup,				"greaterThanEqual",	"greaterThanEqual",	{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ intCompareGroup,				"equal",			"equal",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},
		{ intCompareGroup,				"notEqual",			"notEqual",			{ BV, IV, IV, N  }, attrNegPos,		-1, false,	false,	PRECMASK_ALL			},

		{ boolCompareGroup,				"equal",			"equal",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
		{ boolCompareGroup,				"notEqual",			"notEqual",			{ BV, BV, BV, N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
		{ boolCompareGroup,				"any",				"any",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
		{ boolCompareGroup,				"all",				"all",				{ B,  BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		},
		{ boolCompareGroup,				"not",				"not",				{ BV, BV, N,  N  }, attrNegPos,		-1, false,	true,	PRECMASK_MEDIUMP		}
	};

	// vertexSubGroup and fragmentSubGroup are the groups where the various vertex/fragment cases of a single function are added.
	// \note These are defined here so that different versions (different entries in the functionCaseGroups array) of the same function can be put in the same group.
	tcu::TestCaseGroup*							vertexSubGroup		= DE_NULL;
	tcu::TestCaseGroup*							fragmentSubGroup	= DE_NULL;
	FunctionCase::InitialCalibrationStorage		vertexSubGroupCalibrationStorage;
	FunctionCase::InitialCalibrationStorage		fragmentSubGroupCalibrationStorage;
	for (int funcNdx = 0; funcNdx < DE_LENGTH_OF_ARRAY(functionCaseGroups); funcNdx++)
	{
		tcu::TestCaseGroup* const	parentGroup					= functionCaseGroups[funcNdx].parentGroup;
		const char* const			groupName					= functionCaseGroups[funcNdx].groupName;
		const char* const			groupFunc					= functionCaseGroups[funcNdx].func;
		const ValueType* const		funcTypes					= functionCaseGroups[funcNdx].types;
		const Vec4&					groupAttribute				= functionCaseGroups[funcNdx].attribute;
		const int					modifyParamNdx				= functionCaseGroups[funcNdx].modifyParamNdx;
		const bool					useNearlyConstantInputs		= functionCaseGroups[funcNdx].useNearlyConstantInputs;
		const bool					booleanCase					= functionCaseGroups[funcNdx].booleanCase;
		const PrecisionMask			precMask					= functionCaseGroups[funcNdx].precMask;

		// If this is a new function and not just a different version of the previously defined function, create a new group.
		if (funcNdx == 0 || parentGroup != functionCaseGroups[funcNdx-1].parentGroup || string(groupName) != functionCaseGroups[funcNdx-1].groupName)
		{
			tcu::TestCaseGroup* const funcGroup = new tcu::TestCaseGroup(m_testCtx, groupName, "");
			functionCaseGroups[funcNdx].parentGroup->addChild(funcGroup);

			vertexSubGroup		= new tcu::TestCaseGroup(m_testCtx, "vertex", "");
			fragmentSubGroup	= new tcu::TestCaseGroup(m_testCtx, "fragment", "");

			funcGroup->addChild(vertexSubGroup);
			funcGroup->addChild(fragmentSubGroup);

			// Fresh calibration storage for the new sub-groups; all versions of one function share it.
			vertexSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
			fragmentSubGroupCalibrationStorage	= FunctionCase::InitialCalibrationStorage(new FunctionCase::InitialCalibration);
		}

		DE_ASSERT(vertexSubGroup != DE_NULL);
		DE_ASSERT(fragmentSubGroup != DE_NULL);

		// Find the type size range of parameters (e.g. from 2 to 4 in case of vectors).
		int genTypeFirstSize	= 1;
		int genTypeLastSize		= 1;

		// Find the first return value or parameter with a gentype (if any) and set sizes accordingly.
		// \note Assumes only matching sizes gentypes are to be found, e.g. no "genType func (vec param)"
		for (int i = 0; i < FunctionCase::MAX_PARAMS + 1 && genTypeLastSize == 1; i++)
		{
			switch (funcTypes[i])
			{
				case VALUE_FLOAT_VEC:
				case VALUE_BOOL_VEC:
				case VALUE_INT_VEC:			// \note Fall-through.
					genTypeFirstSize = 2;
					genTypeLastSize = 4;
					break;
				case VALUE_FLOAT_VEC34:
					genTypeFirstSize = 3;
					genTypeLastSize = 4;
					break;
				case VALUE_FLOAT_GENTYPE:
				case VALUE_BOOL_GENTYPE:
				case VALUE_INT_GENTYPE:		// \note Fall-through.
					genTypeFirstSize = 1;
					genTypeLastSize = 4;
					break;
				case VALUE_MATRIX:
					genTypeFirstSize = 2;
					genTypeLastSize = 4;
					break;
				// If none of the above, keep looping.
				default:
					break;
			}
		}

		// Create a case for each possible size of the gentype.
		for (int curSize = genTypeFirstSize; curSize <= genTypeLastSize; curSize++)
		{
			// Determine specific types for return value and the parameters, according to curSize. Non-gentypes not affected by curSize.
			DataType types[FunctionCase::MAX_PARAMS + 1];
			for (int i = 0; i < FunctionCase::MAX_PARAMS + 1; i++)
			{
				if (funcTypes[i] == VALUE_NONE)
					types[i] = TYPE_INVALID;
				else
				{
					int isFloat	= funcTypes[i] & VALUE_ANY_FLOAT;
					int isBool	= funcTypes[i] & VALUE_ANY_BOOL;
					int isInt	= funcTypes[i] & VALUE_ANY_INT;
					int isMat	= funcTypes[i] == VALUE_MATRIX;
					int inSize	= (funcTypes[i] & VALUE_ANY_GENTYPE)	? curSize
								: funcTypes[i] == VALUE_VEC3			? 3
								: funcTypes[i] == VALUE_VEC4			? 4
								: 1;
					int			typeArrayNdx = isMat ? inSize - 2 : inSize - 1; // \note No matrices of size 1.

					types[i]	= isFloat	? floatTypes[typeArrayNdx]
								: isBool	? boolTypes[typeArrayNdx]
								: isInt		? intTypes[typeArrayNdx]
								: isMat		? matrixTypes[typeArrayNdx]
								: TYPE_LAST;
				}

				DE_ASSERT(types[i] != TYPE_LAST);
			}

			// Array for just the parameter types.
			DataType paramTypes[FunctionCase::MAX_PARAMS];
			for (int i = 0; i < FunctionCase::MAX_PARAMS; i++)
				paramTypes[i] = types[i+1];

			// Add a case for each precision allowed by the entry's precision mask.
			for (int prec = (int)PRECISION_LOWP; prec < (int)PRECISION_LAST; prec++)
			{
				if ((precMask & (1 << prec)) == 0)
					continue;

				const string		precisionPrefix = booleanCase ? "" : (string(getPrecisionName((Precision)prec)) + "_");
				std::ostringstream	caseName;

				caseName << precisionPrefix;

				// Write the name of each distinct parameter data type into the test case name.
				for (int i = 1; i < FunctionCase::MAX_PARAMS + 1 && types[i] != TYPE_INVALID; i++)
				{
					if (i == 1 || types[i] != types[i-1])
					{
						if (i > 1)
							caseName << "_";

						caseName << getDataTypeName(types[i]);
					}
				}

				// Add both the vertex-shader and fragment-shader variant of the case.
				for (int fragI = 0; fragI <= 1; fragI++)
				{
					const bool					vert	= fragI == 0;
					tcu::TestCaseGroup* const	group	= vert ? vertexSubGroup : fragmentSubGroup;
					group->addChild	(new FunctionCase(m_context,
													  caseName.str().c_str(), "",
													  groupFunc,
													  types[0], paramTypes,
													  groupAttribute, modifyParamNdx, useNearlyConstantInputs,
													  (Precision)prec, vert,
													  vert ? vertexSubGroupCalibrationStorage : fragmentSubGroupCalibrationStorage));
				}
			}
		}
	}
}

} // Performance
} // gles2
} // deqp