/*-------------------------------------------------------------------------
 * drawElements Quality Program Random Shader Generator
 * ----------------------------------------------------
 *
 * Copyright 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *//*!
 * \file
 * \brief Program Executor.
 *//*--------------------------------------------------------------------*/

#include "rsgProgramExecutor.hpp"
#include "rsgExecutionContext.hpp"
#include "rsgVariableValue.hpp"
#include "rsgUtils.hpp"
#include "tcuSurface.hpp"
#include "deMath.h"
#include "deString.h"

#include <set>
#include <string>
#include <map>

using std::set;
using std::string;
using std::vector;
using std::map;

namespace rsg
{

class VaryingStorage
{
public:
							VaryingStorage		(const VariableType& type, int numVertices);
							~VaryingStorage		(void) {}

	ValueAccess				getValue			(const VariableType& type, int vtxNdx);
	ConstValueAccess		getValue			(const VariableType& type, int vtxNdx) const;

private:
	std::vector<Scalar>		m_value;
};

VaryingStorage::VaryingStorage (const VariableType& type, int numVertices)
	: m_value(type.getScalarSize()*numVertices)
{
}

ValueAccess VaryingStorage::getValue (const VariableType& type, int vtxNdx)
{
	return ValueAccess(type, &m_value[type.getScalarSize()*vtxNdx]);
}

ConstValueAccess VaryingStorage::getValue (const VariableType& type, int vtxNdx) const
{
	return ConstValueAccess(type, &m_value[type.getScalarSize()*vtxNdx]);
}

class VaryingStore
{
public:
							VaryingStore		(int numVertices);
							~VaryingStore		(void);

	VaryingStorage*			getStorage			(const VariableType& type, const char* name);

private:
	int											m_numVertices;
	std::map<std::string, VaryingStorage*>		m_values;
};

VaryingStore::VaryingStore (int numVertices)
	: m_numVertices(numVertices)
{
}

VaryingStore::~VaryingStore (void)
{
	for (map<string, VaryingStorage*>::iterator i = m_values.begin(); i != m_values.end(); i++)
		delete i->second;
	m_values.clear();
}

VaryingStorage* VaryingStore::getStorage (const VariableType& type, const char* name)
{
	VaryingStorage* storage = m_values[name];

	if (!storage)
	{
		storage = new VaryingStorage(type, m_numVertices);
		m_values[name] = storage;
	}

	return storage;
}

inline float interpolateVertexQuad (const tcu::Vec4& quad, float x, float y)
{
	float w00 = (1.0f-x)*(1.0f-y);
	float w01 = (1.0f-x)*y;
	float w10 = x*(1.0f-y);
	float w11 = x*y;
	return quad.x()*w00 + quad.y()*w10 + quad.z()*w01 + quad.w()*w11;
}

inline float interpolateVertex (float x0y0, float x1y1, float x, float y)
{
	return interpolateVertexQuad(tcu::Vec4(x0y0, (x0y0+x1y1)*0.5f, (x0y0+x1y1)*0.5f, x1y1), x, y);
}

inline float interpolateTri (float v0, float v1, float v2, float x, float y)
{
	return v0 + (v1-v0)*x + (v2-v0)*y;
}

inline float interpolateFragment (const tcu::Vec4& quad, float x, float y)
{
	if (x + y < 1.0f)
		return interpolateTri(quad.x(), quad.y(), quad.z(), x, y);
	else
		return interpolateTri(quad.w(), quad.z(), quad.y(), 1.0f-x, 1.0f-y);
}

template <int Stride>
void interpolateVertexInput (StridedValueAccess<Stride> dst, int dstComp, const ConstValueRangeAccess valueRange, float x, float y)
{
	TCU_CHECK(valueRange.getType().getBaseType() == VariableType::TYPE_FLOAT);
	int numElements = valueRange.getType().getNumElements();
	for (int elementNdx = 0; elementNdx < numElements; elementNdx++)
	{
		float xd, yd;
		getVertexInterpolationCoords(xd, yd, x, y, elementNdx);
		dst.component(elementNdx).asFloat(dstComp) = interpolateVertex(valueRange.getMin().component(elementNdx).asFloat(), valueRange.getMax().component(elementNdx).asFloat(), xd, yd);
	}
}

template <int Stride>
void interpolateFragmentInput (StridedValueAccess<Stride> dst, int dstComp, ConstValueAccess vtx0, ConstValueAccess vtx1, ConstValueAccess vtx2, ConstValueAccess vtx3, float x, float y)
{
	TCU_CHECK(dst.getType().getBaseType() == VariableType::TYPE_FLOAT);
	int numElements = dst.getType().getNumElements();
	for (int ndx = 0; ndx < numElements; ndx++)
		dst.component(ndx).asFloat(dstComp) = interpolateFragment(tcu::Vec4(vtx0.component(ndx).asFloat(), vtx1.component(ndx).asFloat(), vtx2.component(ndx).asFloat(), vtx3.component(ndx).asFloat()), x, y);
}

template <int Stride>
void copyVarying (ValueAccess dst, ConstStridedValueAccess<Stride> src, int compNdx)
{
	TCU_CHECK(dst.getType().getBaseType() == VariableType::TYPE_FLOAT);
	for (int elemNdx = 0; elemNdx < dst.getType().getNumElements(); elemNdx++)
		dst.component(elemNdx).asFloat() = src.component(elemNdx).asFloat(compNdx);
}

ProgramExecutor::ProgramExecutor (const tcu::PixelBufferAccess& dst, int gridWidth, int gridHeight)
	: m_dst			(dst)
	, m_gridWidth	(gridWidth)
	, m_gridHeight	(gridHeight)
{
}

ProgramExecutor::~ProgramExecutor (void)
{
}

void ProgramExecutor::setTexture (int samplerNdx, const tcu::Texture2D* texture, const tcu::Sampler& sampler)
{
	m_samplers2D[samplerNdx] = Sampler2D(texture, sampler);
}

void ProgramExecutor::setTexture (int samplerNdx, const tcu::TextureCube* texture, const tcu::Sampler& sampler)
{
	m_samplersCube[samplerNdx] = SamplerCube(texture, sampler);
}

inline tcu::IVec4 computeVertexIndices (float cellWidth, float cellHeight, int gridVtxWidth, int gridVtxHeight, int x, int y)
{
	DE_UNREF(gridVtxHeight);
	int x0 = (int)deFloatFloor((float)x / cellWidth);
	int y0 = (int)deFloatFloor((float)y / cellHeight);
	return tcu::IVec4(y0*gridVtxWidth + x0, y0*gridVtxWidth + x0 + 1, (y0+1)*gridVtxWidth + x0, (y0+1)*gridVtxWidth + x0 + 1);
}

inline tcu::Vec2 computeGridCellWeights (float cellWidth, float cellHeight, int x, int y)
{
	float gx = ((float)x + 0.5f) / cellWidth;
	float gy = ((float)y + 0.5f) / cellHeight;
	return tcu::Vec2(deFloatFrac(gx), deFloatFrac(gy));
}

inline tcu::RGBA toColor (tcu::Vec4 rgba)
{
	return tcu::RGBA(deClamp32(deRoundFloatToInt32(rgba.x()*255), 0, 255),
					 deClamp32(deRoundFloatToInt32(rgba.y()*255), 0, 255),
					 deClamp32(deRoundFloatToInt32(rgba.z()*255), 0, 255),
					 deClamp32(deRoundFloatToInt32(rgba.w()*255), 0, 255));
}

void ProgramExecutor::execute (const Shader& vertexShader, const Shader& fragmentShader, const vector<VariableValue>& uniformValues)
{
	int	gridVtxWidth	= m_gridWidth+1;
	int gridVtxHeight	= m_gridHeight+1;
	int numVertices		= gridVtxWidth*gridVtxHeight;

	VaryingStore varyingStore(numVertices);

	// Execute vertex shader
	{
		ExecutionContext	execCtx(m_samplers2D, m_samplersCube);
		int					numPackets	= numVertices + ((numVertices%EXEC_VEC_WIDTH) ? 1 : 0);

		const vector<ShaderInput*>& inputs	= vertexShader.getInputs();
		vector<const Variable*>		outputs;
		vertexShader.getOutputs(outputs);

		// Set uniform values
		for (vector<VariableValue>::const_iterator uniformIter = uniformValues.begin(); uniformIter != uniformValues.end(); uniformIter++)
			execCtx.getValue(uniformIter->getVariable()) = uniformIter->getValue().value();

		for (int packetNdx = 0; packetNdx < numPackets; packetNdx++)
		{
			int packetStart	= packetNdx*EXEC_VEC_WIDTH;
			int packetEnd	= deMin32((packetNdx+1)*EXEC_VEC_WIDTH, numVertices);

			// Compute values for vertex shader inputs
			for (vector<ShaderInput*>::const_iterator i = inputs.begin(); i != inputs.end(); i++)
			{
				const ShaderInput*	input	= *i;
				ExecValueAccess		access	= execCtx.getValue(input->getVariable());

				for (int vtxNdx = packetStart; vtxNdx < packetEnd; vtxNdx++)
				{
					int		y	= (vtxNdx/gridVtxWidth);
					int		x	= vtxNdx - y*gridVtxWidth;
					float	xf	= (float)x / (float)(gridVtxWidth-1);
					float	yf	= (float)y / (float)(gridVtxHeight-1);

					interpolateVertexInput(access, vtxNdx-packetStart, input->getValueRange(), xf, yf);
				}
			}

			// Execute vertex shader for packet
			vertexShader.execute(execCtx);

			// Store output values
			for (vector<const Variable*>::const_iterator i = outputs.begin(); i != outputs.end(); i++)
			{
				const Variable*			output	= *i;

				if (deStringEqual(output->getName(), "gl_Position"))
					continue; // Do not store position

				ExecConstValueAccess	access	= execCtx.getValue(output);
				VaryingStorage*			dst		= varyingStore.getStorage(output->getType(), output->getName());

				for (int vtxNdx = packetStart; vtxNdx < packetEnd; vtxNdx++)
				{
					ValueAccess varyingAccess = dst->getValue(output->getType(), vtxNdx);
					copyVarying(varyingAccess, access, vtxNdx-packetStart);
				}
			}
		}
	}

	// Execute fragment shader
	{
		ExecutionContext execCtx(m_samplers2D, m_samplersCube);

		// Assign uniform values
		for (vector<VariableValue>::const_iterator i = uniformValues.begin(); i != uniformValues.end(); i++)
			execCtx.getValue(i->getVariable()) = i->getValue().value();

		const vector<ShaderInput*>& inputs			= fragmentShader.getInputs();
		const Variable*				fragColorVar	= DE_NULL;
		vector<const Variable*>		outputs;

		// Find fragment shader output assigned to location 0. This is fragment color.
		fragmentShader.getOutputs(outputs);
		for (vector<const Variable*>::const_iterator i = outputs.begin(); i != outputs.end(); i++)
		{
			if ((*i)->getLayoutLocation() == 0)
			{
				fragColorVar = *i;
				break;
			}
		}
		TCU_CHECK(fragColorVar);

		int	width		= m_dst.getWidth();
		int height		= m_dst.getHeight();
		int numPackets	= (width*height)/EXEC_VEC_WIDTH + (((width*height)%EXEC_VEC_WIDTH) ? 1 : 0);

		float cellWidth		= (float)width	/ (float)m_gridWidth;
		float cellHeight	= (float)height	/ (float)m_gridHeight;

		for (int packetNdx = 0; packetNdx < numPackets; packetNdx++)
		{
			int packetStart	= packetNdx*EXEC_VEC_WIDTH;
			int packetEnd	= deMin32((packetNdx+1)*EXEC_VEC_WIDTH, width*height);

			// Interpolate varyings
			for (vector<ShaderInput*>::const_iterator i = inputs.begin(); i != inputs.end(); i++)
			{
				const ShaderInput*		input	= *i;
				ExecValueAccess			access	= execCtx.getValue(input->getVariable());
				const VariableType&		type	= input->getVariable()->getType();
				const VaryingStorage*	src		= varyingStore.getStorage(type, input->getVariable()->getName());

				// \todo [2011-03-08 pyry] Part of this could be pre-computed...
				for (int fragNdx = packetStart; fragNdx < packetEnd; fragNdx++)
				{
					int y = fragNdx/width;
					int x = fragNdx - y*width;
					tcu::IVec4	vtxIndices	= computeVertexIndices(cellWidth, cellHeight, gridVtxWidth, gridVtxHeight, x, y);
					tcu::Vec2	weights		= computeGridCellWeights(cellWidth, cellHeight, x, y);

					interpolateFragmentInput(access, fragNdx-packetStart,
											 src->getValue(type, vtxIndices.x()),
											 src->getValue(type, vtxIndices.y()),
											 src->getValue(type, vtxIndices.z()),
											 src->getValue(type, vtxIndices.w()),
											 weights.x(), weights.y());
				}
			}

			// Execute fragment shader
			fragmentShader.execute(execCtx);

			// Write resulting color
			ExecConstValueAccess colorValue = execCtx.getValue(fragColorVar);
			for (int fragNdx = packetStart; fragNdx < packetEnd; fragNdx++)
			{
				int			y		= fragNdx/width;
				int			x		= fragNdx - y*width;
				int			cNdx	= fragNdx-packetStart;
				tcu::Vec4	c		= tcu::Vec4(colorValue.component(0).asFloat(cNdx),
												colorValue.component(1).asFloat(cNdx),
												colorValue.component(2).asFloat(cNdx),
												colorValue.component(3).asFloat(cNdx));

				// \todo [2012-11-13 pyry] Reverse order.
				m_dst.setPixel(c, x, m_dst.getHeight()-y-1);
			}
		}
	}
}

} // rsg