/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define GCM_DEBUG 0

#if GCM_DEBUG
#define GCM_DUMP(a) do { a } while(0);
#else
#define GCM_DUMP(a)
#endif

#include <map>

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "eg_sq.h" // V_SQ_CF_INDEX_NONE

namespace r600_sb {

int gcm::run() {

	GCM_DUMP( sblog << "==== GCM ==== \n"; sh.dump_ir(); );

	collect_instructions(sh.root, true);

	init_def_count(uses, pending);

	for (node_iterator N, I = pending.begin(), E = pending.end();
			I != E; I = N) {
		N = I;
		++N;
		node *o = *I;

		GCM_DUMP(
			sblog << "pending : ";
			dump::dump_op(o);
			sblog << "\n";
		);

		if (td_is_ready(o)) {

			GCM_DUMP(
				sblog << "  ready: ";
				dump::dump_op(o);
				sblog << "\n";
			);
			pending.remove_node(o);
			ready.push_back(o);
		} else {
		}
	}

	sched_early(sh.root);

	if (!pending.empty()) {
		sblog << "##### gcm_sched_early_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(pending.empty());

	GCM_DUMP( sh.dump_ir(); );

	GCM_DUMP( sblog << "\n\n ############## gcm late\n\n"; );

	collect_instructions(sh.root, false);

	init_use_count(uses, pending);

	sched_late(sh.root);
	if (!pending.empty()) {
		sblog << "##### gcm_sched_late_pass: unscheduled ops:\n";
		dump::dump_op(pending.front());
	}

	assert(ucs_level == 0);
	assert(pending.empty());

	return 0;
}


void gcm::collect_instructions(container_node *c, bool early_pass) {
	if (c->is_bb()) {

		if (early_pass) {
			for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
				node *n = *I;
				if (n->flags & NF_DONT_MOVE) {
					op_info &o = op_map[n];
					o.top_bb = o.bottom_bb = static_cast<bb_node*>(c);
				}
			}
		}

		pending.append_from(c);
		return;
	}

	for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
		if (I->is_container()) {
			collect_instructions(static_cast<container_node*>(*I), early_pass);
		}
	}
}

void gcm::sched_early(container_node *n) {

	region_node *r =
			(n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL;

	if (r && r->loop_phi) {
		sched_early(r->loop_phi);
	}

	for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) {
		if (I->type == NT_OP) {
			node *op = *I;
			if (op->subtype == NST_PHI) {
				td_release_uses(op->dst);
			}
		} else if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				td_sched_bb(bb);
			} else {
				sched_early(static_cast<container_node*>(*I));
			}
		}
	}

	if (r && r->phi) {
		sched_early(r->phi);
	}
}

void gcm::td_schedule(bb_node *bb, node *n) {
	GCM_DUMP(
		sblog << "scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);
	td_release_uses(n->dst);

	bb->push_back(n);

	op_map[n].top_bb = bb;

}

void gcm::td_sched_bb(bb_node* bb) {
	GCM_DUMP(
	sblog << "td scheduling BB_" << bb->id << "\n";
	);

	while (!ready.empty()) {
		for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E;
				I = N) {
			N = I; ++N;
			td_schedule(bb, *I);
			ready.erase(I);
		}
	}
}

bool gcm::td_is_ready(node* n) {
	return uses[n] == 0;
}

void gcm::td_release_val(value *v) {

	GCM_DUMP(
		sblog << "td checking uses: ";
		dump::dump_val(v);
		sblog << "\n";
	);

	for (uselist::iterator I = v->uses.begin(), E = v->uses.end(); I != E; ++I) {
		node *op = *I;
		if (op->parent != &pending) {
			continue;
		}

		GCM_DUMP(
			sblog << "td    used in ";
			dump::dump_op(op);
			sblog << "\n";
		);

		assert(uses[op] > 0);
		if (--uses[op] == 0) {
			GCM_DUMP(
				sblog << "td        released : ";
				dump::dump_op(op);
				sblog << "\n";
			);

			pending.remove_node(op);
			ready.push_back(op);
		}
	}

}

void gcm::td_release_uses(vvec& v) {
	for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel())
			td_release_uses(v->mdef);
		else
			td_release_val(v);
	}
}

void gcm::sched_late(container_node *n) {

	bool stack_pushed = false;

	if (n->is_depart()) {
		depart_node *d = static_cast<depart_node*>(n);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(d->target->phi, d->dep_id);
	} else if (n->is_repeat()) {
		repeat_node *r = static_cast<repeat_node*>(n);
		assert(r->target->loop_phi);
		push_uc_stack();
		stack_pushed = true;
		bu_release_phi_defs(r->target->loop_phi, r->rep_id);
	}

	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
		if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				bu_sched_bb(bb);
			} else {
				sched_late(static_cast<container_node*>(*I));
			}
		}
	}

	if (n->type == NT_IF) {
		if_node *f = static_cast<if_node*>(n);
		if (f->cond)
			pending_defs.push_back(f->cond);
	} else if (n->type == NT_REGION) {
		region_node *r = static_cast<region_node*>(n);
		if (r->loop_phi)
			bu_release_phi_defs(r->loop_phi, 0);
	}

	if (stack_pushed)
		pop_uc_stack();

}

void gcm::bu_sched_bb(bb_node* bb) {
	GCM_DUMP(
	sblog << "bu scheduling BB_" << bb->id << "\n";
	);

	bu_bb = bb;

	if (!pending_nodes.empty()) {
		GCM_DUMP(
				sblog << "pending nodes:\n";
		);

		// TODO consider sorting the exports by array_base,
		// possibly it can improve performance

		for (node_list::iterator I = pending_nodes.begin(),
				E = pending_nodes.end(); I != E; ++I) {
			bu_release_op(*I);
		}
		pending_nodes.clear();
		GCM_DUMP(
			sblog << "pending nodes processed...\n";
		);
	}


	if (!pending_defs.empty()) {
		for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end();
				I != E; ++I) {
			bu_release_val(*I);
		}
		pending_defs.clear();
	}

	for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end();
			I != E;	I = N) {
		N = I;
		++N;
		node *n = *I;
		if (op_map[n].bottom_bb == bb) {
			add_ready(*I);
			ready_above.erase(I);
		}
	}

	unsigned cnt_ready[SQ_NUM];

	container_node *clause = NULL;
	unsigned last_inst_type = ~0;
	unsigned last_count = 0;

	bool s = true;
	while (s) {
		node *n;

		s = false;

		unsigned ready_mask = 0;

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
			if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty())
				ready_mask |= (1 << sq);
		}

		if (!ready_mask) {
			for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
				if (!bu_ready_early[sq].empty()) {
					node *n = bu_ready_early[sq].front();
					bu_ready_early[sq].pop_front();
					bu_ready[sq].push_back(n);
					break;
				}
			}
		}

		for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {

			if (sq == SQ_CF && pending_exec_mask_update) {
				pending_exec_mask_update = false;
				sq = SQ_ALU;
				--sq;
				continue;
			}

			if (sq != SQ_ALU && outstanding_lds_oq)
				continue;

			if (!bu_ready_next[sq].empty())
				bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]);

			cnt_ready[sq] = bu_ready[sq].size();

			if ((sq == SQ_TEX || sq == SQ_VTX) && live_count <= rp_threshold &&
					cnt_ready[sq] < ctx.max_fetch/2	&&
					!bu_ready_next[SQ_ALU].empty()) {
				sq = SQ_ALU;
				--sq;
				continue;
			}

			while (!bu_ready[sq].empty()) {

				if (last_inst_type != sq) {
					clause = NULL;
					last_count = 0;
					last_inst_type = sq;
				}

				// simple heuristic to limit register pressure,
				if (sq == SQ_ALU && live_count > rp_threshold && !outstanding_lds_oq &&
						(!bu_ready[SQ_TEX].empty() ||
						 !bu_ready[SQ_VTX].empty() ||
						 !bu_ready_next[SQ_TEX].empty() ||
						 !bu_ready_next[SQ_VTX].empty())) {
					GCM_DUMP( sblog << "switching to fetch (regpressure)\n"; );
					break;
				}

				n = bu_ready[sq].front();

				// real count (e.g. SAMPLE_G will be expanded to 3 instructions,
				// 2 SET_GRAD_ + 1 SAMPLE_G
				unsigned ncnt = 1;
				if (n->is_fetch_inst() && n->src.size() == 12) {
					ncnt = 3;
				}

				bool sampler_indexing = false;
				if (n->is_fetch_inst() &&
					static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE)
				{
					sampler_indexing = true; // Give sampler indexed ops get their own clause
					ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
				}

				if ((sq == SQ_TEX || sq == SQ_VTX) &&
						((last_count >= ctx.max_fetch/2 &&
						check_alu_ready_count(24)) ||
								last_count + ncnt > ctx.max_fetch))
					break;
				else if (sq == SQ_CF && last_count > 4 &&
						check_alu_ready_count(24))
					break;


				if (sq == SQ_ALU && n->consumes_lds_oq() &&
				    (bu_ready[SQ_TEX].size() || bu_ready[SQ_VTX].size() || bu_ready[SQ_GDS].size())) {
					GCM_DUMP( sblog << "switching scheduling due to lds op\n"; );
					break;
				}
				bu_ready[sq].pop_front();

				if (sq != SQ_CF) {
					if (!clause || sampler_indexing) {
						node_subtype nst;
						switch (sq) {
						case SQ_ALU:
							nst = NST_ALU_CLAUSE;
							break;
						case SQ_TEX:
							nst = NST_TEX_CLAUSE;
							break;
						case SQ_GDS:
							nst = NST_GDS_CLAUSE;
							break;
						default:
							nst = NST_VTX_CLAUSE;
							break;
						}
						clause = sh.create_clause(nst);
						bb->push_front(clause);
					}
				} else {
					clause = bb;
				}

				bu_schedule(clause, n);
				s = true;
				last_count += ncnt;
			}
		}
	}

	bu_bb = NULL;

	GCM_DUMP(
		sblog << "bu finished scheduling BB_" << bb->id << "\n";
	);
}

void gcm::bu_release_defs(vvec& v, bool src) {
	for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			if (!v->rel->is_readonly())
				bu_release_val(v->rel);
			bu_release_defs(v->muse, true);
		} else if (src)
			bu_release_val(v);
		else {
			if (live.remove_val(v)) {
				--live_count;
			}
		}
	}
}

void gcm::push_uc_stack() {
	GCM_DUMP(
		sblog << "pushing use count stack prev_level " << ucs_level
			<< "   new level " << (ucs_level + 1) << "\n";
	);
	++ucs_level;
	if (ucs_level == nuc_stk.size()) {
		nuc_stk.resize(ucs_level + 1);
	}
	else {
		nuc_stk[ucs_level].clear();
	}
}

bool gcm::bu_is_ready(node* n) {
	nuc_map &cm = nuc_stk[ucs_level];
	nuc_map::iterator F = cm.find(n);
	unsigned uc = (F == cm.end() ? 0 : F->second);
	return uc == uses[n];
}

void gcm::bu_schedule(container_node* c, node* n) {
	GCM_DUMP(
		sblog << "bu scheduling : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	assert(op_map[n].bottom_bb == bu_bb);

	if (n->produces_lds_oq())
		outstanding_lds_oq--;
	if (n->consumes_lds_oq())
		outstanding_lds_oq++;
	bu_release_defs(n->src, true);
	bu_release_defs(n->dst, false);

	c->push_front(n);
}

void gcm::dump_uc_stack() {
	sblog << "##### uc_stk start ####\n";
	for (unsigned l = 0; l <= ucs_level; ++l) {
		nuc_map &m = nuc_stk[l];

		sblog << "nuc_stk[" << l << "] :   @" << &m << "\n";

		for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) {
			sblog << "    uc " << I->second << " for ";
			dump::dump_op(I->first);
			sblog << "\n";
		}
	}
	sblog << "##### uc_stk end ####\n";
}

void gcm::pop_uc_stack() {
	nuc_map &pm = nuc_stk[ucs_level];
	--ucs_level;
	nuc_map &cm = nuc_stk[ucs_level];

	GCM_DUMP(
		sblog << "merging use stack from level " << (ucs_level+1)
			<< " to " << ucs_level << "\n";
	);

	for (nuc_map::iterator N, I = pm.begin(), E = pm.end(); I != E; ++I) {
		node *n = I->first;

		GCM_DUMP(
			sblog << "      " << cm[n] << " += " << I->second << "  for ";
			dump::dump_op(n);
			sblog << "\n";
		);

		unsigned uc = cm[n] += I->second;

		if (n->parent == &pending && uc == uses[n]) {
			cm.erase(n);
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node due to stack pop ";
				dump::dump_op(n);
				sblog << "\n";
			);
		}
	}
}

void gcm::bu_find_best_bb(node *n, op_info &oi) {

	GCM_DUMP(
		sblog << "  find best bb : ";
		dump::dump_op(n);
		sblog << "\n";
	);

	if (oi.bottom_bb)
		return;

	// don't hoist generated copies
	if (n->flags & NF_DONT_HOIST) {
		oi.bottom_bb = bu_bb;
		return;
	}

	bb_node* best_bb = bu_bb;
	bb_node* top_bb = oi.top_bb;
	assert(oi.top_bb && !oi.bottom_bb);

	node *c = best_bb;

	// FIXME top_bb may be located inside the loop so we'll never enter it
	// in the loop below, and the instruction will be incorrectly placed at the
	// beginning of the shader.
	// For now just check if top_bb's loop_level is higher than of
	// current bb and abort the search for better bb in such case,
	// but this problem may require more complete (and more expensive) fix
	if (top_bb->loop_level <= best_bb->loop_level) {
		while (c && c != top_bb) {

			if (c->prev) {
				c = c->prev;
			} else {
				c = c->parent;
				if (!c)
					break;
				continue;
			}

			if (c->subtype == NST_BB) {
				bb_node *bb = static_cast<bb_node*>(c);
				if (bb->loop_level < best_bb->loop_level)
					best_bb = bb;
			}
		}
	}

	oi.bottom_bb = best_bb;
}

void gcm::add_ready(node *n) {
	sched_queue_id sq = sh.get_queue_id(n);
	if (n->flags & NF_SCHEDULE_EARLY)
		bu_ready_early[sq].push_back(n);
	else if (sq == SQ_ALU && n->is_copy_mov())
		bu_ready[sq].push_front(n);
	else if (n->is_alu_inst()) {
		alu_node *a = static_cast<alu_node*>(n);
		if (a->bc.op_ptr->flags & AF_PRED && a->dst[2]) {
			// PRED_SET instruction that updates exec mask
			pending_exec_mask_update = true;
		}
		bu_ready_next[sq].push_back(n);
	} else
		bu_ready_next[sq].push_back(n);
}

void gcm::bu_release_op(node * n) {
	op_info &oi = op_map[n];

	GCM_DUMP(
	sblog << "  bu release op  ";
	dump::dump_op(n);
	);

	nuc_stk[ucs_level].erase(n);
	pending.remove_node(n);

	bu_find_best_bb(n, oi);

	if (oi.bottom_bb == bu_bb) {
		GCM_DUMP( sblog << "   ready\n";);
		add_ready(n);
	} else {
		GCM_DUMP( sblog << "   ready_above\n";);
		ready_above.push_back(n);
	}
}

void gcm::bu_release_phi_defs(container_node* p, unsigned op)
{
	for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) {
		node *o = *I;
		value *v = o->src[op];
		if (v && !v->is_readonly())
			pending_defs.push_back(o->src[op]);

	}
}

unsigned gcm::get_uc_vec(vvec &vv) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel())
			c += get_uc_vec(v->mdef);
		else
			c += v->use_count();
	}
	return c;
}

void gcm::init_use_count(nuc_map& m, container_node &s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned uc = get_uc_vec(n->dst);
		GCM_DUMP(
			sblog << "uc " << uc << "  ";
			dump::dump_op(n);
			sblog << "\n";
		);
		if (!uc) {
			pending_nodes.push_back(n);
			GCM_DUMP(
				sblog << "pushed pending_node in init ";
				dump::dump_op(n);
				sblog << "\n";
			);

		} else
			m[n] = uc;
	}
}

void gcm::bu_release_val(value* v) {
	node *n = v->any_def();

	if (n && n->parent == &pending) {
		nuc_map &m = nuc_stk[ucs_level];
		unsigned uc = ++m[n];
		unsigned uc2 = uses[n];

		if (live.add_val(v)) {
			++live_count;
			GCM_DUMP ( sblog << "live_count: " << live_count << "\n"; );
		}

		GCM_DUMP(
			sblog << "release val ";
			dump::dump_val(v);
			sblog << "  for node ";
			dump::dump_op(n);
			sblog << "    new uc=" << uc << ", total " << uc2 << "\n";
		);

		if (uc == uc2)
			bu_release_op(n);
	}

}

void gcm::init_def_count(nuc_map& m, container_node& s) {
	m.clear();
	for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
		node *n = *I;
		unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false);
		m[n] = dc;

		GCM_DUMP(
			sblog << "dc " << dc << "  ";
			dump::dump_op(n);
			sblog << "\n";
		);
	}
}

unsigned gcm::get_dc_vec(vvec& vv, bool src) {
	unsigned c = 0;
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			c += v->rel->def != NULL;
			c += get_dc_vec(v->muse, true);
		}
		else if (src) {
			c += v->def != NULL;
			c += v->adef != NULL;
		}
	}
	return c;
}

unsigned gcm::real_alu_count(sched_queue& q, unsigned max) {
	sq_iterator I(q.begin()), E(q.end());
	unsigned c = 0;

	while (I != E && c < max) {
		node *n = *I;
		if (n->is_alu_inst()) {
			if (!n->is_copy_mov() || !n->src[0]->is_any_gpr())
				++c;
		} else if (n->is_alu_packed()) {
			c += static_cast<container_node*>(n)->count();
		}
		++I;
	}

	return c;
}

bool gcm::check_alu_ready_count(unsigned threshold) {
	unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold);
	if (r >= threshold)
		return true;
	r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r);
	return r >= threshold;
}

} // namespace r600_sb