/*
 * Copyright (C) 2009 Nicolai Haehnle.
 *
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "radeon_program_pair.h"

#include <stdio.h>

#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_list.h"
#include "radeon_variable.h"

#include "util/u_debug.h"

#define VERBOSE 0

#define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)

struct schedule_instruction {
	struct rc_instruction * Instruction;

	/** Next instruction in the linked list of ready instructions. */
	struct schedule_instruction *NextReady;

	/** Values that this instruction reads and writes */
	struct reg_value * WriteValues[4];
	struct reg_value * ReadValues[12];
	unsigned int NumWriteValues:3;
	unsigned int NumReadValues:4;

	/**
	 * Number of (read and write) dependencies that must be resolved before
	 * this instruction can be scheduled.
	 */
	unsigned int NumDependencies:5;

	/** List of all readers (see rc_get_readers() for the definition of
	 * "all readers"), even those outside the basic block this instruction
	 * lives in. */
	struct rc_reader_data GlobalReaders;

	/** If the scheduler has paired an RGB and an Alpha instruction together,
	 * PairedInst references the alpha insturction's dependency information.
	 */
	struct schedule_instruction * PairedInst;

	/** This scheduler uses the value of Score to determine which
	 * instruction to schedule.  Instructions with a higher value of Score
	 * will be scheduled first. */
	int Score;

	/** The number of components that read from a TEX instruction. */
	unsigned TexReadCount;

	/** For TEX instructions a list of readers */
	struct rc_list * TexReaders;
};


/**
 * Used to keep track of which instructions read a value.
 */
struct reg_value_reader {
	struct schedule_instruction *Reader;
	struct reg_value_reader *Next;
};

/**
 * Used to keep track which values are stored in each component of a
 * RC_FILE_TEMPORARY.
 */
struct reg_value {
	struct schedule_instruction * Writer;

	/**
	 * Unordered linked list of instructions that read from this value.
	 * When this value becomes available, we increase all readers'
	 * dependency count.
	 */
	struct reg_value_reader *Readers;

	/**
	 * Number of readers of this value. This is decremented each time
	 * a reader of the value is committed.
	 * When the reader cound reaches zero, the dependency count
	 * of the instruction writing \ref Next is decremented.
	 */
	unsigned int NumReaders;

	struct reg_value *Next; /**< Pointer to the next value to be written to the same register */
};

struct register_state {
	struct reg_value * Values[4];
};

struct remap_reg {
	struct rc_instruciont * Inst;
	unsigned int OldIndex:(RC_REGISTER_INDEX_BITS+1);
	unsigned int OldSwizzle:3;
	unsigned int NewIndex:(RC_REGISTER_INDEX_BITS+1);
	unsigned int NewSwizzle:3;
	unsigned int OnlyTexReads:1;
	struct remap_reg * Next;
};

struct schedule_state {
	struct radeon_compiler * C;
	struct schedule_instruction * Current;
	/** Array of the previous writers of Current's destination register
	 * indexed by channel. */
	struct schedule_instruction * PrevWriter[4];

	struct register_state Temporary[RC_REGISTER_MAX_INDEX];

	/**
	 * Linked lists of instructions that can be scheduled right now,
	 * based on which ALU/TEX resources they require.
	 */
	/*@{*/
	struct schedule_instruction *ReadyFullALU;
	struct schedule_instruction *ReadyRGB;
	struct schedule_instruction *ReadyAlpha;
	struct schedule_instruction *ReadyTEX;
	/*@}*/
	struct rc_list *PendingTEX;

	void (*CalcScore)(struct schedule_instruction *);
	long max_tex_group;
	unsigned PrevBlockHasTex:1;
	unsigned TEXCount;
	unsigned Opt:1;
};

static struct reg_value ** get_reg_valuep(struct schedule_state * s,
		rc_register_file file, unsigned int index, unsigned int chan)
{
	if (file != RC_FILE_TEMPORARY)
		return 0;

	if (index >= RC_REGISTER_MAX_INDEX) {
		rc_error(s->C, "%s: index %i out of bounds\n", __FUNCTION__, index);
		return 0;
	}

	return &s->Temporary[index].Values[chan];
}

static unsigned get_tex_read_count(struct schedule_instruction * sinst)
{
	unsigned tex_read_count = sinst->TexReadCount;
	if (sinst->PairedInst) {
		tex_read_count += sinst->PairedInst->TexReadCount;
	}
	return tex_read_count;
}

#if VERBOSE
static void print_list(struct schedule_instruction * sinst)
{
	struct schedule_instruction * ptr;
	for (ptr = sinst; ptr; ptr=ptr->NextReady) {
		unsigned tex_read_count = get_tex_read_count(ptr);
		unsigned score = sinst->Score;
		fprintf(stderr,"%u (%d) [%u],", ptr->Instruction->IP, score,
						tex_read_count);
	}
	fprintf(stderr, "\n");
}
#endif

static void remove_inst_from_list(struct schedule_instruction ** list,
					struct schedule_instruction * inst)
{
	struct schedule_instruction * prev = NULL;
	struct schedule_instruction * list_ptr;
	for (list_ptr = *list; list_ptr; prev = list_ptr,
					list_ptr = list_ptr->NextReady) {
		if (list_ptr == inst) {
			if (prev) {
				prev->NextReady = inst->NextReady;
			} else {
				*list = inst->NextReady;
			}
			inst->NextReady = NULL;
			break;
		}
	}
}

static void add_inst_to_list(struct schedule_instruction ** list, struct schedule_instruction * inst)
{
	inst->NextReady = *list;
	*list = inst;
}

static void add_inst_to_list_score(struct schedule_instruction ** list,
					struct schedule_instruction * inst)
{
	struct schedule_instruction * temp;
	struct schedule_instruction * prev;
	if (!*list) {
		*list = inst;
		return;
	}
	temp = *list;
	prev = NULL;
	while(temp && inst->Score <= temp->Score) {
		prev = temp;
		temp = temp->NextReady;
	}

	if (!prev) {
		inst->NextReady = temp;
		*list = inst;
	} else {
		prev->NextReady = inst;
		inst->NextReady = temp;
	}
}

static void instruction_ready(struct schedule_state * s, struct schedule_instruction * sinst)
{
	DBG("%i is now ready\n", sinst->Instruction->IP);

	/* Adding Ready TEX instructions to the end of the "Ready List" helps
	 * us emit TEX instructions in blocks without losing our place. */
	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL)
		add_inst_to_list_score(&s->ReadyTEX, sinst);
	else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP)
		add_inst_to_list_score(&s->ReadyRGB, sinst);
	else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP)
		add_inst_to_list_score(&s->ReadyAlpha, sinst);
	else
		add_inst_to_list_score(&s->ReadyFullALU, sinst);
}

static void decrease_dependencies(struct schedule_state * s, struct schedule_instruction * sinst)
{
	assert(sinst->NumDependencies > 0);
	sinst->NumDependencies--;
	if (!sinst->NumDependencies)
		instruction_ready(s, sinst);
}

/* These functions provide different heuristics for scheduling instructions.
 * The default is calc_score_readers. */

#if 0

static void calc_score_zero(struct schedule_instruction * sinst)
{
	sinst->Score = 0;
}

static void calc_score_deps(struct schedule_instruction * sinst)
{
	int i;
	sinst->Score = 0;
	for (i = 0; i < sinst->NumWriteValues; i++) {
		struct reg_value * v = sinst->WriteValues[i];
		if (v->NumReaders) {
			struct reg_value_reader * r;
			for (r = v->Readers; r; r = r->Next) {
				if (r->Reader->NumDependencies == 1) {
					sinst->Score += 100;
				}
				sinst->Score += r->Reader->NumDependencies;
			}
		}
	}
}

#endif

#define NO_OUTPUT_SCORE (1 << 24)

static void score_no_output(struct schedule_instruction * sinst)
{
	assert(sinst->Instruction->Type != RC_INSTRUCTION_NORMAL);
	if (!sinst->Instruction->U.P.RGB.OutputWriteMask &&
			!sinst->Instruction->U.P.Alpha.OutputWriteMask) {
		if (sinst->PairedInst) {
			if (!sinst->PairedInst->Instruction->U.P.
							RGB.OutputWriteMask
					&& !sinst->PairedInst->Instruction->U.P.
							Alpha.OutputWriteMask) {
				sinst->Score |= NO_OUTPUT_SCORE;
			}

		} else {
			sinst->Score |= NO_OUTPUT_SCORE;
		}
	}
}

#define PAIRED_SCORE (1 << 16)

static void calc_score_r300(struct schedule_instruction * sinst)
{
	unsigned src_idx;

	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) {
		sinst->Score = 0;
		return;
	}

	score_no_output(sinst);

	if (sinst->PairedInst) {
		sinst->Score |= PAIRED_SCORE;
		return;
	}

	for (src_idx = 0; src_idx < 4; src_idx++) {
		sinst->Score += sinst->Instruction->U.P.RGB.Src[src_idx].Used +
				sinst->Instruction->U.P.Alpha.Src[src_idx].Used;
	}
}

#define NO_READ_TEX_SCORE (1 << 16)

static void calc_score_readers(struct schedule_instruction * sinst)
{
	if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) {
		sinst->Score = 0;
	} else {
		sinst->Score = sinst->NumReadValues;
		if (sinst->PairedInst) {
			sinst->Score += sinst->PairedInst->NumReadValues;
		}
		if (get_tex_read_count(sinst) == 0) {
			sinst->Score |= NO_READ_TEX_SCORE;
		}
		score_no_output(sinst);
	}
}

/**
 * This function decreases the dependencies of the next instruction that
 * wants to write to each of sinst's read values.
 */
static void commit_update_reads(struct schedule_state * s,
					struct schedule_instruction * sinst){
	unsigned int i;
	for(i = 0; i < sinst->NumReadValues; ++i) {
		struct reg_value * v = sinst->ReadValues[i];
		assert(v->NumReaders > 0);
		v->NumReaders--;
		if (!v->NumReaders) {
			if (v->Next) {
				decrease_dependencies(s, v->Next->Writer);
			}
		}
	}
	if (sinst->PairedInst) {
		commit_update_reads(s, sinst->PairedInst);
	}
}

static void commit_update_writes(struct schedule_state * s,
					struct schedule_instruction * sinst){
	unsigned int i;
	for(i = 0; i < sinst->NumWriteValues; ++i) {
		struct reg_value * v = sinst->WriteValues[i];
		if (v->NumReaders) {
			for(struct reg_value_reader * r = v->Readers; r; r = r->Next) {
				decrease_dependencies(s, r->Reader);
			}
		} else {
			/* This happens in instruction sequences of the type
			 *  OP r.x, ...;
			 *  OP r.x, r.x, ...;
			 * See also the subtlety in how instructions that both
			 * read and write the same register are scanned.
			 */
			if (v->Next)
				decrease_dependencies(s, v->Next->Writer);
		}
	}
	if (sinst->PairedInst) {
		commit_update_writes(s, sinst->PairedInst);
	}
}

static void notify_sem_wait(struct schedule_state *s)
{
	struct rc_list * pend_ptr;
	for (pend_ptr = s->PendingTEX; pend_ptr; pend_ptr = pend_ptr->Next) {
		struct rc_list * read_ptr;
		struct schedule_instruction * pending = pend_ptr->Item;
		for (read_ptr = pending->TexReaders; read_ptr;
						read_ptr = read_ptr->Next) {
			struct schedule_instruction * reader = read_ptr->Item;
			reader->TexReadCount--;
		}
	}
	s->PendingTEX = NULL;
}

static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst)
{
	DBG("%i: commit score = %d\n", sinst->Instruction->IP, sinst->Score);

	commit_update_reads(s, sinst);

	commit_update_writes(s, sinst);

	if (get_tex_read_count(sinst) > 0) {
		sinst->Instruction->U.P.SemWait = 1;
		notify_sem_wait(s);
	}
}

/**
 * Emit all ready texture instructions in a single block.
 *
 * Emit as a single block to (hopefully) sample many textures in parallel,
 * and to avoid hardware indirections on R300.
 */
static void emit_all_tex(struct schedule_state * s, struct rc_instruction * before)
{
	struct schedule_instruction *readytex;
	struct rc_instruction * inst_begin;

	assert(s->ReadyTEX);
	notify_sem_wait(s);

	/* Node marker for R300 */
	inst_begin = rc_insert_new_instruction(s->C, before->Prev);
	inst_begin->U.I.Opcode = RC_OPCODE_BEGIN_TEX;

	/* Link texture instructions back in */
	readytex = s->ReadyTEX;
	while(readytex) {
		rc_insert_instruction(before->Prev, readytex->Instruction);
		DBG("%i: commit TEX reads\n", readytex->Instruction->IP);

		/* All of the TEX instructions in the same TEX block have
		 * their source registers read from before any of the
		 * instructions in that block write to their destination
		 * registers.  This means that when we commit a TEX
		 * instruction, any other TEX instruction that wants to write
		 * to one of the committed instruction's source register can be
		 * marked as ready and should be emitted in the same TEX
		 * block. This prevents the following sequence from being
		 * emitted in two different TEX blocks:
		 * 0: TEX temp[0].xyz, temp[1].xy__, 2D[0];
		 * 1: TEX temp[1].xyz, temp[2].xy__, 2D[0];
		 */
		commit_update_reads(s, readytex);
		readytex = readytex->NextReady;
	}
	readytex = s->ReadyTEX;
	s->ReadyTEX = 0;
	while(readytex){
		DBG("%i: commit TEX writes\n", readytex->Instruction->IP);
		commit_update_writes(s, readytex);
		/* Set semaphore bits for last TEX instruction in the block */
		if (!readytex->NextReady) {
			readytex->Instruction->U.I.TexSemAcquire = 1;
			readytex->Instruction->U.I.TexSemWait = 1;
		}
		rc_list_add(&s->PendingTEX, rc_list(&s->C->Pool, readytex));
		readytex = readytex->NextReady;
	}
}

/* This is a helper function for destructive_merge_instructions().  It helps
 * merge presubtract sources from two instructions and makes sure the
 * presubtract sources end up in the correct spot.  This function assumes that
 * dst_full is an rgb instruction, meaning that it has a vector instruction(rgb)
 * but no scalar instruction (alpha).
 * @return 0 if merging the presubtract sources fails.
 * @retrun 1 if merging the presubtract sources succeeds.
 */
static int merge_presub_sources(
	struct rc_pair_instruction * dst_full,
	struct rc_pair_sub_instruction src,
	unsigned int type)
{
	unsigned int srcp_src, srcp_regs, is_rgb, is_alpha;
	struct rc_pair_sub_instruction * dst_sub;
	const struct rc_opcode_info * info;

	assert(dst_full->Alpha.Opcode == RC_OPCODE_NOP);

	switch(type) {
	case RC_SOURCE_RGB:
		is_rgb = 1;
		is_alpha = 0;
		dst_sub = &dst_full->RGB;
		break;
	case RC_SOURCE_ALPHA:
		is_rgb = 0;
		is_alpha = 1;
		dst_sub = &dst_full->Alpha;
		break;
	default:
		assert(0);
		return 0;
	}

	info = rc_get_opcode_info(dst_full->RGB.Opcode);

	if (dst_sub->Src[RC_PAIR_PRESUB_SRC].Used)
		return 0;

	srcp_regs = rc_presubtract_src_reg_count(
					src.Src[RC_PAIR_PRESUB_SRC].Index);
	for(srcp_src = 0; srcp_src < srcp_regs; srcp_src++) {
		unsigned int arg;
		int free_source;
		unsigned int one_way = 0;
		struct rc_pair_instruction_source srcp = src.Src[srcp_src];
		struct rc_pair_instruction_source temp;

		free_source = rc_pair_alloc_source(dst_full, is_rgb, is_alpha,
							srcp.File, srcp.Index);

		/* If free_source < 0 then there are no free source
		 * slots. */
		if (free_source < 0)
			return 0;

		temp = dst_sub->Src[srcp_src];
		dst_sub->Src[srcp_src] = dst_sub->Src[free_source];

		/* srcp needs src0 and src1 to be the same */
		if (free_source < srcp_src) {
			if (!temp.Used)
				continue;
			free_source = rc_pair_alloc_source(dst_full, is_rgb,
					is_alpha, temp.File, temp.Index);
			if (free_source < 0)
				return 0;
			one_way = 1;
		} else {
			dst_sub->Src[free_source] = temp;
		}

		/* If free_source == srcp_src, then the presubtract
		 * source is already in the correct place. */
		if (free_source == srcp_src)
			continue;

		/* Shuffle the sources, so we can put the
		 * presubtract source in the correct place. */
		for(arg = 0; arg < info->NumSrcRegs; arg++) {
			/*If this arg does not read from an rgb source,
			 * do nothing. */
			if (!(rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle)
								& type)) {
				continue;
			}

			if (dst_full->RGB.Arg[arg].Source == srcp_src)
				dst_full->RGB.Arg[arg].Source = free_source;
			/* We need to do this just in case register
			 * is one of the sources already, but in the
			 * wrong spot. */
			else if(dst_full->RGB.Arg[arg].Source == free_source
							&& !one_way) {
				dst_full->RGB.Arg[arg].Source = srcp_src;
			}
		}
	}
	return 1;
}


/* This function assumes that rgb.Alpha and alpha.RGB are unused */
static int destructive_merge_instructions(
		struct rc_pair_instruction * rgb,
		struct rc_pair_instruction * alpha)
{
	const struct rc_opcode_info * opcode;

	assert(rgb->Alpha.Opcode == RC_OPCODE_NOP);
	assert(alpha->RGB.Opcode == RC_OPCODE_NOP);

	/* Presubtract registers need to be merged first so that registers
	 * needed by the presubtract operation can be placed in src0 and/or
	 * src1. */

	/* Merge the rgb presubtract registers. */
	if (alpha->RGB.Src[RC_PAIR_PRESUB_SRC].Used) {
		if (!merge_presub_sources(rgb, alpha->RGB, RC_SOURCE_RGB)) {
			return 0;
		}
	}
	/* Merge the alpha presubtract registers */
	if (alpha->Alpha.Src[RC_PAIR_PRESUB_SRC].Used) {
		if(!merge_presub_sources(rgb,  alpha->Alpha, RC_SOURCE_ALPHA)){
			return 0;
		}
	}

	/* Copy alpha args into rgb */
	opcode = rc_get_opcode_info(alpha->Alpha.Opcode);

	for(unsigned int arg = 0; arg < opcode->NumSrcRegs; ++arg) {
		unsigned int srcrgb = 0;
		unsigned int srcalpha = 0;
		unsigned int oldsrc = alpha->Alpha.Arg[arg].Source;
		rc_register_file file = 0;
		unsigned int index = 0;
		int source;

		if (GET_SWZ(alpha->Alpha.Arg[arg].Swizzle, 0) < 3) {
			srcrgb = 1;
			file = alpha->RGB.Src[oldsrc].File;
			index = alpha->RGB.Src[oldsrc].Index;
		} else if (GET_SWZ(alpha->Alpha.Arg[arg].Swizzle, 0) < 4) {
			srcalpha = 1;
			file = alpha->Alpha.Src[oldsrc].File;
			index = alpha->Alpha.Src[oldsrc].Index;
		}

		source = rc_pair_alloc_source(rgb, srcrgb, srcalpha, file, index);
		if (source < 0)
			return 0;

		rgb->Alpha.Arg[arg].Source = source;
		rgb->Alpha.Arg[arg].Swizzle = alpha->Alpha.Arg[arg].Swizzle;
		rgb->Alpha.Arg[arg].Abs = alpha->Alpha.Arg[arg].Abs;
		rgb->Alpha.Arg[arg].Negate = alpha->Alpha.Arg[arg].Negate;
	}

	/* Copy alpha opcode into rgb */
	rgb->Alpha.Opcode = alpha->Alpha.Opcode;
	rgb->Alpha.DestIndex = alpha->Alpha.DestIndex;
	rgb->Alpha.WriteMask = alpha->Alpha.WriteMask;
	rgb->Alpha.OutputWriteMask = alpha->Alpha.OutputWriteMask;
	rgb->Alpha.DepthWriteMask = alpha->Alpha.DepthWriteMask;
	rgb->Alpha.Saturate = alpha->Alpha.Saturate;
	rgb->Alpha.Omod = alpha->Alpha.Omod;

	/* Merge ALU result writing */
	if (alpha->WriteALUResult) {
		if (rgb->WriteALUResult)
			return 0;

		rgb->WriteALUResult = alpha->WriteALUResult;
		rgb->ALUResultCompare = alpha->ALUResultCompare;
	}

	/* Copy SemWait */
	rgb->SemWait |= alpha->SemWait;

	return 1;
}

/**
 * Try to merge the given instructions into the rgb instructions.
 *
 * Return true on success; on failure, return false, and keep
 * the instructions untouched.
 */
static int merge_instructions(struct rc_pair_instruction * rgb, struct rc_pair_instruction * alpha)
{
	struct rc_pair_instruction backup;

	/*Instructions can't write output registers and ALU result at the
	 * same time. */
	if ((rgb->WriteALUResult && alpha->Alpha.OutputWriteMask)
		|| (rgb->RGB.OutputWriteMask && alpha->WriteALUResult)) {
		return 0;
	}

	/* Writing output registers in the middle of shaders is slow, so
	 * we don't want to pair output writes with temp writes. */
	if ((rgb->RGB.OutputWriteMask && !alpha->Alpha.OutputWriteMask)
		|| (!rgb->RGB.OutputWriteMask && alpha->Alpha.OutputWriteMask)) {
		return 0;
	}

	memcpy(&backup, rgb, sizeof(struct rc_pair_instruction));

	if (destructive_merge_instructions(rgb, alpha))
		return 1;

	memcpy(rgb, &backup, sizeof(struct rc_pair_instruction));
	return 0;
}

static void presub_nop(struct rc_instruction * emitted) {
	int prev_rgb_index, prev_alpha_index, i, num_src;

	/* We don't need a nop if the previous instruction is a TEX. */
	if (emitted->Prev->Type != RC_INSTRUCTION_PAIR) {
		return;
	}
	if (emitted->Prev->U.P.RGB.WriteMask)
		prev_rgb_index = emitted->Prev->U.P.RGB.DestIndex;
	else
		prev_rgb_index = -1;
	if (emitted->Prev->U.P.Alpha.WriteMask)
		prev_alpha_index = emitted->Prev->U.P.Alpha.DestIndex;
	else
		prev_alpha_index = 1;

	/* Check the previous rgb instruction */
	if (emitted->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used) {
		num_src = rc_presubtract_src_reg_count(
				emitted->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Index);
		for (i = 0; i < num_src; i++) {
			unsigned int index = emitted->U.P.RGB.Src[i].Index;
			if (emitted->U.P.RGB.Src[i].File == RC_FILE_TEMPORARY
			    && (index  == prev_rgb_index
				|| index == prev_alpha_index)) {
				emitted->Prev->U.P.Nop = 1;
				return;
			}
		}
	}

	/* Check the previous alpha instruction. */
	if (!emitted->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)
		return;

	num_src = rc_presubtract_src_reg_count(
				emitted->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Index);
	for (i = 0; i < num_src; i++) {
		unsigned int index = emitted->U.P.Alpha.Src[i].Index;
		if(emitted->U.P.Alpha.Src[i].File == RC_FILE_TEMPORARY
		   && (index == prev_rgb_index || index == prev_alpha_index)) {
			emitted->Prev->U.P.Nop = 1;
			return;
		}
	}
}

static void rgb_to_alpha_remap (
	struct rc_instruction * inst,
	struct rc_pair_instruction_arg * arg,
	rc_register_file old_file,
	rc_swizzle old_swz,
	unsigned int new_index)
{
	int new_src_index;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		if (get_swz(arg->Swizzle, i) == old_swz) {
			SET_SWZ(arg->Swizzle, i, RC_SWIZZLE_W);
		}
	}
	new_src_index = rc_pair_alloc_source(&inst->U.P, 0, 1,
							old_file, new_index);
	/* This conversion is not possible, we must have made a mistake in
	 * is_rgb_to_alpha_possible. */
	if (new_src_index < 0) {
		assert(0);
		return;
	}

	arg->Source = new_src_index;
}

static int can_remap(unsigned int opcode)
{
	switch(opcode) {
	case RC_OPCODE_DDX:
	case RC_OPCODE_DDY:
		return 0;
	default:
		return 1;
	}
}

static int can_convert_opcode_to_alpha(unsigned int opcode)
{
	switch(opcode) {
	case RC_OPCODE_DDX:
	case RC_OPCODE_DDY:
	case RC_OPCODE_DP2:
	case RC_OPCODE_DP3:
	case RC_OPCODE_DP4:
	case RC_OPCODE_DPH:
		return 0;
	default:
		return 1;
	}
}

static void is_rgb_to_alpha_possible(
	void * userdata,
	struct rc_instruction * inst,
	struct rc_pair_instruction_arg * arg,
	struct rc_pair_instruction_source * src)
{
	unsigned int read_chan = RC_SWIZZLE_UNUSED;
	unsigned int alpha_sources = 0;
	unsigned int i;
	struct rc_reader_data * reader_data = userdata;

	if (!can_remap(inst->U.P.RGB.Opcode)
	    || !can_remap(inst->U.P.Alpha.Opcode)) {
		reader_data->Abort = 1;
		return;
	}

	if (!src)
		return;

	/* XXX There are some cases where we can still do the conversion if
	 * a reader reads from a presubtract source, but for now we'll prevent
	 * it. */
	if (arg->Source == RC_PAIR_PRESUB_SRC) {
		reader_data->Abort = 1;
		return;
	}

	/* Make sure the source only reads the register component that we
	 * are going to be convering from.  It is OK if the instruction uses
	 * this component more than once.
	 * XXX If the index we will be converting to is the same as the
	 * current index, then it is OK to read from more than one component.
	 */
	for (i = 0; i < 3; i++) {
		rc_swizzle swz = get_swz(arg->Swizzle, i);
		switch(swz) {
		case RC_SWIZZLE_X:
		case RC_SWIZZLE_Y:
		case RC_SWIZZLE_Z:
		case RC_SWIZZLE_W:
			if (read_chan == RC_SWIZZLE_UNUSED) {
				read_chan = swz;
			} else if (read_chan != swz) {
				reader_data->Abort = 1;
				return;
			}
			break;
		default:
			break;
		}
	}

	/* Make sure there are enough alpha sources.
	 * XXX If we know what register all the readers are going
	 * to be remapped to, then in some situations we can still do
	 * the subsitution, even if all 3 alpha sources are being used.*/
	for (i = 0; i < 3; i++) {
		if (inst->U.P.Alpha.Src[i].Used) {
			alpha_sources++;
		}
	}
	if (alpha_sources > 2) {
		reader_data->Abort = 1;
		return;
	}
}

static int convert_rgb_to_alpha(
	struct schedule_state * s,
	struct schedule_instruction * sched_inst)
{
	struct rc_pair_instruction * pair_inst = &sched_inst->Instruction->U.P;
	unsigned int old_mask = pair_inst->RGB.WriteMask;
	unsigned int old_swz = rc_mask_to_swizzle(old_mask);
	const struct rc_opcode_info * info =
				rc_get_opcode_info(pair_inst->RGB.Opcode);
	int new_index = -1;
	unsigned int i;

	if (sched_inst->GlobalReaders.Abort)
		return 0;

	if (!pair_inst->RGB.WriteMask)
		return 0;

	if (!can_convert_opcode_to_alpha(pair_inst->RGB.Opcode)
	    || !can_convert_opcode_to_alpha(pair_inst->Alpha.Opcode)) {
		return 0;
	}

	assert(sched_inst->NumWriteValues == 1);

	if (!sched_inst->WriteValues[0]) {
		assert(0);
		return 0;
	}

	/* We start at the old index, because if we can reuse the same
	 * register and just change the swizzle then it is more likely we
	 * will be able to convert all the readers. */
	for (i = pair_inst->RGB.DestIndex; i < RC_REGISTER_MAX_INDEX; i++) {
		struct reg_value ** new_regvalp = get_reg_valuep(
						s, RC_FILE_TEMPORARY, i, 3);
		if (!*new_regvalp) {
			struct reg_value ** old_regvalp =
				get_reg_valuep(s,
					RC_FILE_TEMPORARY,
					pair_inst->RGB.DestIndex,
					rc_mask_to_swizzle(old_mask));
			new_index = i;
			*new_regvalp = *old_regvalp;
			*old_regvalp = NULL;
			new_regvalp = get_reg_valuep(s, RC_FILE_TEMPORARY, i, 3);
			break;
		}
	}
	if (new_index < 0) {
		return 0;
	}

	/* If we are converting a full instruction with RC_OPCODE_REPL_ALPHA
	 * as the RGB opcode, then the Alpha instruction will already contain
	 * the correct opcode and instruction args, so we do not want to
	 * overwrite them.
	 */
	if (pair_inst->RGB.Opcode != RC_OPCODE_REPL_ALPHA) {
		pair_inst->Alpha.Opcode = pair_inst->RGB.Opcode;
		memcpy(pair_inst->Alpha.Arg, pair_inst->RGB.Arg,
						sizeof(pair_inst->Alpha.Arg));
	}
	pair_inst->Alpha.DestIndex = new_index;
	pair_inst->Alpha.WriteMask = RC_MASK_W;
	pair_inst->Alpha.Target = pair_inst->RGB.Target;
	pair_inst->Alpha.OutputWriteMask = pair_inst->RGB.OutputWriteMask;
	pair_inst->Alpha.DepthWriteMask = pair_inst->RGB.DepthWriteMask;
	pair_inst->Alpha.Saturate = pair_inst->RGB.Saturate;
	pair_inst->Alpha.Omod = pair_inst->RGB.Omod;
	/* Move the swizzles into the first chan */
	for (i = 0; i < info->NumSrcRegs; i++) {
		unsigned int j;
		for (j = 0; j < 3; j++) {
			unsigned int swz = get_swz(pair_inst->Alpha.Arg[i].Swizzle, j);
			if (swz != RC_SWIZZLE_UNUSED) {
				pair_inst->Alpha.Arg[i].Swizzle =
							rc_init_swizzle(swz, 1);
				break;
			}
		}
	}
	pair_inst->RGB.Opcode = RC_OPCODE_NOP;
	pair_inst->RGB.DestIndex = 0;
	pair_inst->RGB.WriteMask = 0;
	pair_inst->RGB.Target = 0;
	pair_inst->RGB.OutputWriteMask = 0;
	pair_inst->RGB.DepthWriteMask = 0;
	pair_inst->RGB.Saturate = 0;
	memset(pair_inst->RGB.Arg, 0, sizeof(pair_inst->RGB.Arg));

	for(i = 0; i < sched_inst->GlobalReaders.ReaderCount; i++) {
		struct rc_reader reader = sched_inst->GlobalReaders.Readers[i];
		rgb_to_alpha_remap(reader.Inst, reader.U.P.Arg,
					RC_FILE_TEMPORARY, old_swz, new_index);
	}
	return 1;
}

static void try_convert_and_pair(
	struct schedule_state *s,
	struct schedule_instruction ** inst_list)
{
	struct schedule_instruction * list_ptr = *inst_list;
	while (list_ptr && *inst_list && (*inst_list)->NextReady) {
		int paired = 0;
		if (list_ptr->Instruction->U.P.Alpha.Opcode != RC_OPCODE_NOP
			&& list_ptr->Instruction->U.P.RGB.Opcode
						!= RC_OPCODE_REPL_ALPHA) {
				goto next;
		}
		if (list_ptr->NumWriteValues == 1
					&& convert_rgb_to_alpha(s, list_ptr)) {

			struct schedule_instruction * pair_ptr;
			remove_inst_from_list(inst_list, list_ptr);
			add_inst_to_list_score(&s->ReadyAlpha, list_ptr);

			for (pair_ptr = s->ReadyRGB; pair_ptr;
					pair_ptr = pair_ptr->NextReady) {
				if (merge_instructions(&pair_ptr->Instruction->U.P,
						&list_ptr->Instruction->U.P)) {
					remove_inst_from_list(&s->ReadyAlpha, list_ptr);
					remove_inst_from_list(&s->ReadyRGB, pair_ptr);
					pair_ptr->PairedInst = list_ptr;

					add_inst_to_list(&s->ReadyFullALU, pair_ptr);
					list_ptr = *inst_list;
					paired = 1;
					break;
				}

			}
		}
		if (!paired) {
next:
			list_ptr = list_ptr->NextReady;
		}
	}
}

/**
 * This function attempts to merge RGB and Alpha instructions together.
 */
static void pair_instructions(struct schedule_state * s)
{
	struct schedule_instruction *rgb_ptr;
	struct schedule_instruction *alpha_ptr;

	/* Some pairings might fail because they require too
	 * many source slots; try all possible pairings if necessary */
	rgb_ptr = s->ReadyRGB;
	while(rgb_ptr) {
		struct schedule_instruction * rgb_next = rgb_ptr->NextReady;
		alpha_ptr = s->ReadyAlpha;
		while(alpha_ptr) {
			struct schedule_instruction * alpha_next = alpha_ptr->NextReady;
			if (merge_instructions(&rgb_ptr->Instruction->U.P, &alpha_ptr->Instruction->U.P)) {
				/* Remove RGB and Alpha from their ready lists.
				 */
				remove_inst_from_list(&s->ReadyRGB, rgb_ptr);
				remove_inst_from_list(&s->ReadyAlpha, alpha_ptr);
				rgb_ptr->PairedInst = alpha_ptr;
				add_inst_to_list(&s->ReadyFullALU, rgb_ptr);
				break;
			}
			alpha_ptr = alpha_next;
		}
		rgb_ptr = rgb_next;
	}

	if (!s->Opt) {
		return;
	}

	/* Full instructions that have RC_OPCODE_REPL_ALPHA in the RGB
	 * slot can be converted into Alpha instructions. */
	try_convert_and_pair(s, &s->ReadyFullALU);

	/* Try to convert some of the RGB instructions to Alpha and
	 * try to pair it with another RGB. */
	try_convert_and_pair(s, &s->ReadyRGB);
}

static void update_max_score(
	struct schedule_state * s,
	struct schedule_instruction ** list,
	int * max_score,
	struct schedule_instruction ** max_inst_out,
	struct schedule_instruction *** list_out)
{
	struct schedule_instruction * list_ptr;
	for (list_ptr = *list; list_ptr; list_ptr = list_ptr->NextReady) {
		int score;
		s->CalcScore(list_ptr);
		score = list_ptr->Score;
		if (!*max_inst_out || score > *max_score) {
			*max_score = score;
			*max_inst_out = list_ptr;
			*list_out = list;
		}
	}
}

static void emit_instruction(
	struct schedule_state * s,
	struct rc_instruction * before)
{
	int max_score = -1;
	struct schedule_instruction * max_inst = NULL;
	struct schedule_instruction ** max_list = NULL;
	unsigned tex_count = 0;
	struct schedule_instruction * tex_ptr;

	pair_instructions(s);
#if VERBOSE
	fprintf(stderr, "Full:\n");
	print_list(s->ReadyFullALU);
	fprintf(stderr, "RGB:\n");
	print_list(s->ReadyRGB);
	fprintf(stderr, "Alpha:\n");
	print_list(s->ReadyAlpha);
	fprintf(stderr, "TEX:\n");
	print_list(s->ReadyTEX);
#endif

	for (tex_ptr = s->ReadyTEX; tex_ptr; tex_ptr = tex_ptr->NextReady) {
		if (tex_ptr->Instruction->U.I.Opcode == RC_OPCODE_KIL) {
			emit_all_tex(s, before);
			return;
		}
		tex_count++;
	}
	update_max_score(s, &s->ReadyFullALU, &max_score, &max_inst, &max_list);
	update_max_score(s, &s->ReadyRGB, &max_score, &max_inst, &max_list);
	update_max_score(s, &s->ReadyAlpha, &max_score, &max_inst, &max_list);

	if (tex_count >= s->max_tex_group || max_score == -1
		|| (s->TEXCount > 0 && tex_count == s->TEXCount)
		|| (!s->C->is_r500 && tex_count > 0 && max_score == -1)) {
		emit_all_tex(s, before);
	} else {


		remove_inst_from_list(max_list, max_inst);
		rc_insert_instruction(before->Prev, max_inst->Instruction);
		commit_alu_instruction(s, max_inst);

		presub_nop(before->Prev);
	}
}

static void add_tex_reader(
	struct schedule_state * s,
	struct schedule_instruction * writer,
	struct schedule_instruction * reader)
{
	if (!writer || writer->Instruction->Type != RC_INSTRUCTION_NORMAL) {
		/*Not a TEX instructions */
		return;
	}
	reader->TexReadCount++;
	rc_list_add(&writer->TexReaders, rc_list(&s->C->Pool, reader));
}

static void scan_read(void * data, struct rc_instruction * inst,
		rc_register_file file, unsigned int index, unsigned int chan)
{
	struct schedule_state * s = data;
	struct reg_value ** v = get_reg_valuep(s, file, index, chan);
	struct reg_value_reader * reader;

	if (!v)
		return;

	if (*v && (*v)->Writer == s->Current) {
		/* The instruction reads and writes to a register component.
		 * In this case, we only want to increment dependencies by one.
		 * Why?
		 * Because each instruction depends on the writers of its source
		 * registers _and_ the most recent writer of its destination
		 * register.  In this case, the current instruction (s->Current)
		 * has a dependency that both writes to one of its source
		 * registers and was the most recent writer to its destination
		 * register.  We have already marked this dependency in
		 * scan_write(), so we don't need to do it again.
		 */

		/* We need to make sure we are adding s->Current to the
		 * previous writer's list of TexReaders, if the previous writer
		 * was a TEX instruction.
		 */
		add_tex_reader(s, s->PrevWriter[chan], s->Current);

		return;
	}

	DBG("%i: read %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);

	reader = memory_pool_malloc(&s->C->Pool, sizeof(*reader));
	reader->Reader = s->Current;
	if (!*v) {
		/* In this situation, the instruction reads from a register
		 * that hasn't been written to or read from in the current
		 * block. */
		*v = memory_pool_malloc(&s->C->Pool, sizeof(struct reg_value));
		memset(*v, 0, sizeof(struct reg_value));
		(*v)->Readers = reader;
	} else {
		reader->Next = (*v)->Readers;
		(*v)->Readers = reader;
		/* Only update the current instruction's dependencies if the
		 * register it reads from has been written to in this block. */
		if ((*v)->Writer) {
			add_tex_reader(s, (*v)->Writer, s->Current);
			s->Current->NumDependencies++;
		}
	}
	(*v)->NumReaders++;

	if (s->Current->NumReadValues >= 12) {
		rc_error(s->C, "%s: NumReadValues overflow\n", __FUNCTION__);
	} else {
		s->Current->ReadValues[s->Current->NumReadValues++] = *v;
	}
}

static void scan_write(void * data, struct rc_instruction * inst,
		rc_register_file file, unsigned int index, unsigned int chan)
{
	struct schedule_state * s = data;
	struct reg_value ** pv = get_reg_valuep(s, file, index, chan);
	struct reg_value * newv;

	if (!pv)
		return;

	DBG("%i: write %i[%i] chan %i\n", s->Current->Instruction->IP, file, index, chan);

	newv = memory_pool_malloc(&s->C->Pool, sizeof(*newv));
	memset(newv, 0, sizeof(*newv));

	newv->Writer = s->Current;

	if (*pv) {
		(*pv)->Next = newv;
		s->Current->NumDependencies++;
		/* Keep track of the previous writer to s->Current's destination
		 * register */
		s->PrevWriter[chan] = (*pv)->Writer;
	}

	*pv = newv;

	if (s->Current->NumWriteValues >= 4) {
		rc_error(s->C, "%s: NumWriteValues overflow\n", __FUNCTION__);
	} else {
		s->Current->WriteValues[s->Current->NumWriteValues++] = newv;
	}
}

static void is_rgb_to_alpha_possible_normal(
	void * userdata,
	struct rc_instruction * inst,
	struct rc_src_register * src)
{
	struct rc_reader_data * reader_data = userdata;
	reader_data->Abort = 1;

}

static void schedule_block(struct schedule_state * s,
		struct rc_instruction * begin, struct rc_instruction * end)
{
	unsigned int ip;

	/* Scan instructions for data dependencies */
	ip = 0;
	for(struct rc_instruction * inst = begin; inst != end; inst = inst->Next) {
		s->Current = memory_pool_malloc(&s->C->Pool, sizeof(*s->Current));
		memset(s->Current, 0, sizeof(struct schedule_instruction));

		if (inst->Type == RC_INSTRUCTION_NORMAL) {
			const struct rc_opcode_info * info =
					rc_get_opcode_info(inst->U.I.Opcode);
			if (info->HasTexture) {
				s->TEXCount++;
			}
		}

		/* XXX: This causes SemWait to be set for all instructions in
		 * a block if the previous block contained a TEX instruction.
		 * We can do better here, but it will take a lot of work. */
		if (s->PrevBlockHasTex) {
			s->Current->TexReadCount = 1;
		}

		s->Current->Instruction = inst;
		inst->IP = ip++;

		DBG("%i: Scanning\n", inst->IP);

		/* The order of things here is subtle and maybe slightly
		 * counter-intuitive, to account for the case where an
		 * instruction writes to the same register as it reads
		 * from. */
		rc_for_all_writes_chan(inst, &scan_write, s);
		rc_for_all_reads_chan(inst, &scan_read, s);

		DBG("%i: Has %i dependencies\n", inst->IP, s->Current->NumDependencies);

		if (!s->Current->NumDependencies) {
			instruction_ready(s, s->Current);
		}

		/* Get global readers for possible RGB->Alpha conversion. */
		s->Current->GlobalReaders.ExitOnAbort = 1;
		rc_get_readers(s->C, inst, &s->Current->GlobalReaders,
				is_rgb_to_alpha_possible_normal,
				is_rgb_to_alpha_possible, NULL);
	}

	/* Temporarily unlink all instructions */
	begin->Prev->Next = end;
	end->Prev = begin->Prev;

	/* Schedule instructions back */
	while(!s->C->Error &&
	      (s->ReadyTEX || s->ReadyRGB || s->ReadyAlpha || s->ReadyFullALU)) {
		emit_instruction(s, end);
	}
}

static int is_controlflow(struct rc_instruction * inst)
{
	if (inst->Type == RC_INSTRUCTION_NORMAL) {
		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
		return opcode->IsFlowControl;
	}
	return 0;
}

void rc_pair_schedule(struct radeon_compiler *cc, void *user)
{
	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;
	struct schedule_state s;
	struct rc_instruction * inst = c->Base.Program.Instructions.Next;
	unsigned int * opt = user;

	memset(&s, 0, sizeof(s));
	s.Opt = *opt;
	s.C = &c->Base;
	if (s.C->is_r500) {
		s.CalcScore = calc_score_readers;
	} else {
		s.CalcScore = calc_score_r300;
	}
	s.max_tex_group = debug_get_num_option("RADEON_TEX_GROUP", 8);
	while(inst != &c->Base.Program.Instructions) {
		struct rc_instruction * first;

		if (is_controlflow(inst)) {
			inst = inst->Next;
			continue;
		}

		first = inst;

		while(inst != &c->Base.Program.Instructions && !is_controlflow(inst))
			inst = inst->Next;

		DBG("Schedule one block\n");
		memset(s.Temporary, 0, sizeof(s.Temporary));
		s.TEXCount = 0;
		schedule_block(&s, first, inst);
		if (s.PendingTEX) {
			s.PrevBlockHasTex = 1;
		}
	}
}