/*
* Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Vadim Girlin
*/
#define GCM_DEBUG 0
#if GCM_DEBUG
#define GCM_DUMP(a) do { a } while(0);
#else
#define GCM_DUMP(a)
#endif
#include <map>
#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "eg_sq.h" // V_SQ_CF_INDEX_NONE
namespace r600_sb {
int gcm::run() {
GCM_DUMP( sblog << "==== GCM ==== \n"; sh.dump_ir(); );
collect_instructions(sh.root, true);
init_def_count(uses, pending);
for (node_iterator N, I = pending.begin(), E = pending.end();
I != E; I = N) {
N = I;
++N;
node *o = *I;
GCM_DUMP(
sblog << "pending : ";
dump::dump_op(o);
sblog << "\n";
);
if (td_is_ready(o)) {
GCM_DUMP(
sblog << " ready: ";
dump::dump_op(o);
sblog << "\n";
);
pending.remove_node(o);
ready.push_back(o);
} else {
}
}
sched_early(sh.root);
if (!pending.empty()) {
sblog << "##### gcm_sched_early_pass: unscheduled ops:\n";
dump::dump_op(pending.front());
}
assert(pending.empty());
GCM_DUMP( sh.dump_ir(); );
GCM_DUMP( sblog << "\n\n ############## gcm late\n\n"; );
collect_instructions(sh.root, false);
init_use_count(uses, pending);
sched_late(sh.root);
if (!pending.empty()) {
sblog << "##### gcm_sched_late_pass: unscheduled ops:\n";
dump::dump_op(pending.front());
}
assert(ucs_level == 0);
assert(pending.empty());
return 0;
}
void gcm::collect_instructions(container_node *c, bool early_pass) {
if (c->is_bb()) {
if (early_pass) {
for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
node *n = *I;
if (n->flags & NF_DONT_MOVE) {
op_info &o = op_map[n];
o.top_bb = o.bottom_bb = static_cast<bb_node*>(c);
}
}
}
pending.append_from(c);
return;
}
for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
if (I->is_container()) {
collect_instructions(static_cast<container_node*>(*I), early_pass);
}
}
}
void gcm::sched_early(container_node *n) {
region_node *r =
(n->type == NT_REGION) ? static_cast<region_node*>(n) : NULL;
if (r && r->loop_phi) {
sched_early(r->loop_phi);
}
for (node_iterator I = n->begin(), E = n->end(); I != E; ++I) {
if (I->type == NT_OP) {
node *op = *I;
if (op->subtype == NST_PHI) {
td_release_uses(op->dst);
}
} else if (I->is_container()) {
if (I->subtype == NST_BB) {
bb_node* bb = static_cast<bb_node*>(*I);
td_sched_bb(bb);
} else {
sched_early(static_cast<container_node*>(*I));
}
}
}
if (r && r->phi) {
sched_early(r->phi);
}
}
void gcm::td_schedule(bb_node *bb, node *n) {
GCM_DUMP(
sblog << "scheduling : ";
dump::dump_op(n);
sblog << "\n";
);
td_release_uses(n->dst);
bb->push_back(n);
op_map[n].top_bb = bb;
}
void gcm::td_sched_bb(bb_node* bb) {
GCM_DUMP(
sblog << "td scheduling BB_" << bb->id << "\n";
);
while (!ready.empty()) {
for (sq_iterator N, I = ready.begin(), E = ready.end(); I != E;
I = N) {
N = I; ++N;
td_schedule(bb, *I);
ready.erase(I);
}
}
}
bool gcm::td_is_ready(node* n) {
return uses[n] == 0;
}
void gcm::td_release_val(value *v) {
GCM_DUMP(
sblog << "td checking uses: ";
dump::dump_val(v);
sblog << "\n";
);
for (uselist::iterator I = v->uses.begin(), E = v->uses.end(); I != E; ++I) {
node *op = *I;
if (op->parent != &pending) {
continue;
}
GCM_DUMP(
sblog << "td used in ";
dump::dump_op(op);
sblog << "\n";
);
assert(uses[op] > 0);
if (--uses[op] == 0) {
GCM_DUMP(
sblog << "td released : ";
dump::dump_op(op);
sblog << "\n";
);
pending.remove_node(op);
ready.push_back(op);
}
}
}
void gcm::td_release_uses(vvec& v) {
for (vvec::iterator I = v.begin(), E = v.end(); I != E; ++I) {
value *v = *I;
if (!v)
continue;
if (v->is_rel())
td_release_uses(v->mdef);
else
td_release_val(v);
}
}
void gcm::sched_late(container_node *n) {
bool stack_pushed = false;
if (n->is_depart()) {
depart_node *d = static_cast<depart_node*>(n);
push_uc_stack();
stack_pushed = true;
bu_release_phi_defs(d->target->phi, d->dep_id);
} else if (n->is_repeat()) {
repeat_node *r = static_cast<repeat_node*>(n);
assert(r->target->loop_phi);
push_uc_stack();
stack_pushed = true;
bu_release_phi_defs(r->target->loop_phi, r->rep_id);
}
for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
if (I->is_container()) {
if (I->subtype == NST_BB) {
bb_node* bb = static_cast<bb_node*>(*I);
bu_sched_bb(bb);
} else {
sched_late(static_cast<container_node*>(*I));
}
}
}
if (n->type == NT_IF) {
if_node *f = static_cast<if_node*>(n);
if (f->cond)
pending_defs.push_back(f->cond);
} else if (n->type == NT_REGION) {
region_node *r = static_cast<region_node*>(n);
if (r->loop_phi)
bu_release_phi_defs(r->loop_phi, 0);
}
if (stack_pushed)
pop_uc_stack();
}
void gcm::bu_sched_bb(bb_node* bb) {
GCM_DUMP(
sblog << "bu scheduling BB_" << bb->id << "\n";
);
bu_bb = bb;
if (!pending_nodes.empty()) {
GCM_DUMP(
sblog << "pending nodes:\n";
);
// TODO consider sorting the exports by array_base,
// possibly it can improve performance
for (node_list::iterator I = pending_nodes.begin(),
E = pending_nodes.end(); I != E; ++I) {
bu_release_op(*I);
}
pending_nodes.clear();
GCM_DUMP(
sblog << "pending nodes processed...\n";
);
}
if (!pending_defs.empty()) {
for (vvec::iterator I = pending_defs.begin(), E = pending_defs.end();
I != E; ++I) {
bu_release_val(*I);
}
pending_defs.clear();
}
for (sched_queue::iterator N, I = ready_above.begin(), E = ready_above.end();
I != E; I = N) {
N = I;
++N;
node *n = *I;
if (op_map[n].bottom_bb == bb) {
add_ready(*I);
ready_above.erase(I);
}
}
unsigned cnt_ready[SQ_NUM];
container_node *clause = NULL;
unsigned last_inst_type = ~0;
unsigned last_count = 0;
bool s = true;
while (s) {
node *n;
s = false;
unsigned ready_mask = 0;
for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
if (!bu_ready[sq].empty() || !bu_ready_next[sq].empty())
ready_mask |= (1 << sq);
}
if (!ready_mask) {
for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
if (!bu_ready_early[sq].empty()) {
node *n = bu_ready_early[sq].front();
bu_ready_early[sq].pop_front();
bu_ready[sq].push_back(n);
break;
}
}
}
for (unsigned sq = SQ_CF; sq < SQ_NUM; ++sq) {
if (sq == SQ_CF && pending_exec_mask_update) {
pending_exec_mask_update = false;
sq = SQ_ALU;
--sq;
continue;
}
if (sq != SQ_ALU && outstanding_lds_oq)
continue;
if (!bu_ready_next[sq].empty())
bu_ready[sq].splice(bu_ready[sq].end(), bu_ready_next[sq]);
cnt_ready[sq] = bu_ready[sq].size();
if ((sq == SQ_TEX || sq == SQ_VTX) && live_count <= rp_threshold &&
cnt_ready[sq] < ctx.max_fetch/2 &&
!bu_ready_next[SQ_ALU].empty()) {
sq = SQ_ALU;
--sq;
continue;
}
while (!bu_ready[sq].empty()) {
if (last_inst_type != sq) {
clause = NULL;
last_count = 0;
last_inst_type = sq;
}
// simple heuristic to limit register pressure,
if (sq == SQ_ALU && live_count > rp_threshold && !outstanding_lds_oq &&
(!bu_ready[SQ_TEX].empty() ||
!bu_ready[SQ_VTX].empty() ||
!bu_ready_next[SQ_TEX].empty() ||
!bu_ready_next[SQ_VTX].empty())) {
GCM_DUMP( sblog << "switching to fetch (regpressure)\n"; );
break;
}
n = bu_ready[sq].front();
// real count (e.g. SAMPLE_G will be expanded to 3 instructions,
// 2 SET_GRAD_ + 1 SAMPLE_G
unsigned ncnt = 1;
if (n->is_fetch_inst() && n->src.size() == 12) {
ncnt = 3;
}
bool sampler_indexing = false;
if (n->is_fetch_inst() &&
static_cast<fetch_node *>(n)->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE)
{
sampler_indexing = true; // Give sampler indexed ops get their own clause
ncnt = sh.get_ctx().is_cayman() ? 2 : 3; // MOVA + SET_CF_IDX0/1
}
if ((sq == SQ_TEX || sq == SQ_VTX) &&
((last_count >= ctx.max_fetch/2 &&
check_alu_ready_count(24)) ||
last_count + ncnt > ctx.max_fetch))
break;
else if (sq == SQ_CF && last_count > 4 &&
check_alu_ready_count(24))
break;
if (sq == SQ_ALU && n->consumes_lds_oq() &&
(bu_ready[SQ_TEX].size() || bu_ready[SQ_VTX].size() || bu_ready[SQ_GDS].size())) {
GCM_DUMP( sblog << "switching scheduling due to lds op\n"; );
break;
}
bu_ready[sq].pop_front();
if (sq != SQ_CF) {
if (!clause || sampler_indexing) {
node_subtype nst;
switch (sq) {
case SQ_ALU:
nst = NST_ALU_CLAUSE;
break;
case SQ_TEX:
nst = NST_TEX_CLAUSE;
break;
case SQ_GDS:
nst = NST_GDS_CLAUSE;
break;
default:
nst = NST_VTX_CLAUSE;
break;
}
clause = sh.create_clause(nst);
bb->push_front(clause);
}
} else {
clause = bb;
}
bu_schedule(clause, n);
s = true;
last_count += ncnt;
}
}
}
bu_bb = NULL;
GCM_DUMP(
sblog << "bu finished scheduling BB_" << bb->id << "\n";
);
}
void gcm::bu_release_defs(vvec& v, bool src) {
for (vvec::reverse_iterator I = v.rbegin(), E = v.rend(); I != E; ++I) {
value *v = *I;
if (!v || v->is_readonly())
continue;
if (v->is_rel()) {
if (!v->rel->is_readonly())
bu_release_val(v->rel);
bu_release_defs(v->muse, true);
} else if (src)
bu_release_val(v);
else {
if (live.remove_val(v)) {
--live_count;
}
}
}
}
void gcm::push_uc_stack() {
GCM_DUMP(
sblog << "pushing use count stack prev_level " << ucs_level
<< " new level " << (ucs_level + 1) << "\n";
);
++ucs_level;
if (ucs_level == nuc_stk.size()) {
nuc_stk.resize(ucs_level + 1);
}
else {
nuc_stk[ucs_level].clear();
}
}
bool gcm::bu_is_ready(node* n) {
nuc_map &cm = nuc_stk[ucs_level];
nuc_map::iterator F = cm.find(n);
unsigned uc = (F == cm.end() ? 0 : F->second);
return uc == uses[n];
}
void gcm::bu_schedule(container_node* c, node* n) {
GCM_DUMP(
sblog << "bu scheduling : ";
dump::dump_op(n);
sblog << "\n";
);
assert(op_map[n].bottom_bb == bu_bb);
if (n->produces_lds_oq())
outstanding_lds_oq--;
if (n->consumes_lds_oq())
outstanding_lds_oq++;
bu_release_defs(n->src, true);
bu_release_defs(n->dst, false);
c->push_front(n);
}
void gcm::dump_uc_stack() {
sblog << "##### uc_stk start ####\n";
for (unsigned l = 0; l <= ucs_level; ++l) {
nuc_map &m = nuc_stk[l];
sblog << "nuc_stk[" << l << "] : @" << &m << "\n";
for (nuc_map::iterator I = m.begin(), E = m.end(); I != E; ++I) {
sblog << " uc " << I->second << " for ";
dump::dump_op(I->first);
sblog << "\n";
}
}
sblog << "##### uc_stk end ####\n";
}
void gcm::pop_uc_stack() {
nuc_map &pm = nuc_stk[ucs_level];
--ucs_level;
nuc_map &cm = nuc_stk[ucs_level];
GCM_DUMP(
sblog << "merging use stack from level " << (ucs_level+1)
<< " to " << ucs_level << "\n";
);
for (nuc_map::iterator N, I = pm.begin(), E = pm.end(); I != E; ++I) {
node *n = I->first;
GCM_DUMP(
sblog << " " << cm[n] << " += " << I->second << " for ";
dump::dump_op(n);
sblog << "\n";
);
unsigned uc = cm[n] += I->second;
if (n->parent == &pending && uc == uses[n]) {
cm.erase(n);
pending_nodes.push_back(n);
GCM_DUMP(
sblog << "pushed pending_node due to stack pop ";
dump::dump_op(n);
sblog << "\n";
);
}
}
}
void gcm::bu_find_best_bb(node *n, op_info &oi) {
GCM_DUMP(
sblog << " find best bb : ";
dump::dump_op(n);
sblog << "\n";
);
if (oi.bottom_bb)
return;
// don't hoist generated copies
if (n->flags & NF_DONT_HOIST) {
oi.bottom_bb = bu_bb;
return;
}
bb_node* best_bb = bu_bb;
bb_node* top_bb = oi.top_bb;
assert(oi.top_bb && !oi.bottom_bb);
node *c = best_bb;
// FIXME top_bb may be located inside the loop so we'll never enter it
// in the loop below, and the instruction will be incorrectly placed at the
// beginning of the shader.
// For now just check if top_bb's loop_level is higher than of
// current bb and abort the search for better bb in such case,
// but this problem may require more complete (and more expensive) fix
if (top_bb->loop_level <= best_bb->loop_level) {
while (c && c != top_bb) {
if (c->prev) {
c = c->prev;
} else {
c = c->parent;
if (!c)
break;
continue;
}
if (c->subtype == NST_BB) {
bb_node *bb = static_cast<bb_node*>(c);
if (bb->loop_level < best_bb->loop_level)
best_bb = bb;
}
}
}
oi.bottom_bb = best_bb;
}
void gcm::add_ready(node *n) {
sched_queue_id sq = sh.get_queue_id(n);
if (n->flags & NF_SCHEDULE_EARLY)
bu_ready_early[sq].push_back(n);
else if (sq == SQ_ALU && n->is_copy_mov())
bu_ready[sq].push_front(n);
else if (n->is_alu_inst()) {
alu_node *a = static_cast<alu_node*>(n);
if (a->bc.op_ptr->flags & AF_PRED && a->dst[2]) {
// PRED_SET instruction that updates exec mask
pending_exec_mask_update = true;
}
bu_ready_next[sq].push_back(n);
} else
bu_ready_next[sq].push_back(n);
}
void gcm::bu_release_op(node * n) {
op_info &oi = op_map[n];
GCM_DUMP(
sblog << " bu release op ";
dump::dump_op(n);
);
nuc_stk[ucs_level].erase(n);
pending.remove_node(n);
bu_find_best_bb(n, oi);
if (oi.bottom_bb == bu_bb) {
GCM_DUMP( sblog << " ready\n";);
add_ready(n);
} else {
GCM_DUMP( sblog << " ready_above\n";);
ready_above.push_back(n);
}
}
void gcm::bu_release_phi_defs(container_node* p, unsigned op)
{
for (node_riterator I = p->rbegin(), E = p->rend(); I != E; ++I) {
node *o = *I;
value *v = o->src[op];
if (v && !v->is_readonly())
pending_defs.push_back(o->src[op]);
}
}
unsigned gcm::get_uc_vec(vvec &vv) {
unsigned c = 0;
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
value *v = *I;
if (!v)
continue;
if (v->is_rel())
c += get_uc_vec(v->mdef);
else
c += v->use_count();
}
return c;
}
void gcm::init_use_count(nuc_map& m, container_node &s) {
m.clear();
for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
node *n = *I;
unsigned uc = get_uc_vec(n->dst);
GCM_DUMP(
sblog << "uc " << uc << " ";
dump::dump_op(n);
sblog << "\n";
);
if (!uc) {
pending_nodes.push_back(n);
GCM_DUMP(
sblog << "pushed pending_node in init ";
dump::dump_op(n);
sblog << "\n";
);
} else
m[n] = uc;
}
}
void gcm::bu_release_val(value* v) {
node *n = v->any_def();
if (n && n->parent == &pending) {
nuc_map &m = nuc_stk[ucs_level];
unsigned uc = ++m[n];
unsigned uc2 = uses[n];
if (live.add_val(v)) {
++live_count;
GCM_DUMP ( sblog << "live_count: " << live_count << "\n"; );
}
GCM_DUMP(
sblog << "release val ";
dump::dump_val(v);
sblog << " for node ";
dump::dump_op(n);
sblog << " new uc=" << uc << ", total " << uc2 << "\n";
);
if (uc == uc2)
bu_release_op(n);
}
}
void gcm::init_def_count(nuc_map& m, container_node& s) {
m.clear();
for (node_iterator I = s.begin(), E = s.end(); I != E; ++I) {
node *n = *I;
unsigned dc = get_dc_vec(n->src, true) + get_dc_vec(n->dst, false);
m[n] = dc;
GCM_DUMP(
sblog << "dc " << dc << " ";
dump::dump_op(n);
sblog << "\n";
);
}
}
unsigned gcm::get_dc_vec(vvec& vv, bool src) {
unsigned c = 0;
for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
value *v = *I;
if (!v || v->is_readonly())
continue;
if (v->is_rel()) {
c += v->rel->def != NULL;
c += get_dc_vec(v->muse, true);
}
else if (src) {
c += v->def != NULL;
c += v->adef != NULL;
}
}
return c;
}
unsigned gcm::real_alu_count(sched_queue& q, unsigned max) {
sq_iterator I(q.begin()), E(q.end());
unsigned c = 0;
while (I != E && c < max) {
node *n = *I;
if (n->is_alu_inst()) {
if (!n->is_copy_mov() || !n->src[0]->is_any_gpr())
++c;
} else if (n->is_alu_packed()) {
c += static_cast<container_node*>(n)->count();
}
++I;
}
return c;
}
bool gcm::check_alu_ready_count(unsigned threshold) {
unsigned r = real_alu_count(bu_ready[SQ_ALU], threshold);
if (r >= threshold)
return true;
r += real_alu_count(bu_ready_next[SQ_ALU], threshold - r);
return r >= threshold;
}
} // namespace r600_sb