/*
 * Copyright (C) 2009 Nicolai Haehnle.
 * Copyright 2011 Tom Stellard <tstellar@gmail.com>
 *
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "radeon_program_pair.h"

#include <stdio.h>

#include "main/glheader.h"
#include "program/register_allocate.h"
#include "ralloc.h"

#include "r300_fragprog_swizzle.h"
#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_list.h"
#include "radeon_variable.h"

/* Set to 1 to get a trace of the live-interval overlap tests on stderr. */
#define VERBOSE 0

#define DBG(...) \
	do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)

/** Per-register bookkeeping used for both inputs and temporaries. */
struct register_info {
	/** One live interval per channel. */
	struct live_intervals Live[4];

	unsigned int Used:1;
	unsigned int Allocated:1;
	unsigned int File:3;
	unsigned int Index:RC_REGISTER_INDEX_BITS;
	/** Mask of the channels whose live interval is in use. */
	unsigned int Writemask;
};

/** State shared by the allocation passes and their callbacks. */
struct regalloc_state {
	struct radeon_compiler * C;

	struct register_info * Input;
	unsigned int NumInputs;

	struct register_info * Temporary;
	unsigned int NumTemporaries;

	/** Non-zero when only the inputs-only allocation was performed, in
	 * which case remap_register() also rewrites temporaries. */
	unsigned int Simple;
	/** Largest IP of any ENDLOOP seen so far while scanning the program. */
	int LoopEnd;
};

/* NOTE: variable_get_class() returns one of these values and
 * do_advanced_regalloc() uses it directly as an index into its class table,
 * so the table must list the classes in exactly this order. */
enum rc_reg_class {
	RC_REG_CLASS_SINGLE,
	RC_REG_CLASS_DOUBLE,
	RC_REG_CLASS_TRIPLE,
	RC_REG_CLASS_ALPHA,
	RC_REG_CLASS_SINGLE_PLUS_ALPHA,
	RC_REG_CLASS_DOUBLE_PLUS_ALPHA,
	RC_REG_CLASS_TRIPLE_PLUS_ALPHA,
	RC_REG_CLASS_X,
	RC_REG_CLASS_Y,
	RC_REG_CLASS_Z,
	RC_REG_CLASS_XY,
	RC_REG_CLASS_YZ,
	RC_REG_CLASS_XZ,
	RC_REG_CLASS_XW,
	RC_REG_CLASS_YW,
	RC_REG_CLASS_ZW,
	RC_REG_CLASS_XYW,
	RC_REG_CLASS_YZW,
	RC_REG_CLASS_XZW,
	RC_REG_CLASS_COUNT
};

struct rc_class {
	enum rc_reg_class Class;

	/** Number of valid entries in Writemasks. */
	unsigned int WritemaskCount;

	/** This is 1 if this class is being used by the register allocator
	 * and 0 otherwise */
	unsigned int Used;

	/** This is the ID number assigned to this class by ra. */
	unsigned int Id;

	/** List of writemasks that belong to this class */
	unsigned int Writemasks[3];
};

/** Debug helper: print one live interval through DBG(). */
static void print_live_intervals(struct live_intervals * src)
{
	if (!src || !src->Used) {
		DBG("(null)");
		return;
	}

	DBG("(%i,%i)", src->Start, src->End);
}

/**
 * Test whether two live intervals overlap.
 *
 * Unused intervals never overlap.  When the intervals start at different
 * points, they overlap if the later one starts before the earlier one ends.
 * When they start at the same point, they overlap unless one of them has
 * Start == End.
 *
 * @return 1 if the intervals overlap, 0 otherwise.
 */
static int overlap_live_intervals(struct live_intervals * a, struct live_intervals * b)
{
	if (VERBOSE) {
		DBG("overlap_live_intervals: ");
		print_live_intervals(a);
		DBG(" to ");
		print_live_intervals(b);
		DBG("\n");
	}

	if (!a->Used || !b->Used) {
		DBG(" unused interval\n");
		return 0;
	}

	if (a->Start > b->Start) {
		if (a->Start < b->End) {
			DBG(" overlap\n");
			return 1;
		}
	} else if (b->Start > a->Start) {
		if (b->Start < a->End) {
			DBG(" overlap\n");
			return 1;
		}
	} else { /* a->Start == b->Start */
		if (a->Start != a->End && b->Start != b->End) {
			DBG(" overlap\n");
			return 1;
		}
	}

	DBG(" no overlap\n");

	return 0;
}

/**
 * Read callback for rc_for_all_reads_mask(): records the live interval of
 * every input channel that is read.
 *
 * Reads from files other than RC_FILE_INPUT are ignored.  For each channel in
 * mask the interval is set to start at 0 and to end at the later of the
 * reading instruction's IP and s->LoopEnd.
 *
 * @param data The struct regalloc_state of the current pass.
 */
static void scan_read_callback(void * data, struct rc_instruction * inst,
		rc_register_file file, unsigned int index, unsigned int mask)
{
	struct regalloc_state * s = data;
	struct register_info * reg;
	unsigned int i;

	if (file != RC_FILE_INPUT)
		return;

	s->Input[index].Used = 1;
	reg = &s->Input[index];

	for (i = 0; i < 4; i++) {
		if (!((mask >> i) & 0x1)) {
			continue;
		}
		reg->Live[i].Used = 1;
		reg->Live[i].Start = 0;
		reg->Live[i].End =
			s->LoopEnd > inst->IP ? s->LoopEnd : inst->IP;
	}
}

/**
 * Callback for rc_remap_registers(): replaces the index of an input register
 * (and of a temporary register when s->Simple is set) with the index that was
 * allocated for it.  Registers that were not allocated are left untouched.
 *
 * @param data The struct regalloc_state of the current pass.
 */
static void remap_register(void * data, struct rc_instruction * inst,
		rc_register_file * file, unsigned int * index)
{
	struct regalloc_state * s = data;
	const struct register_info * reg;

	if (*file == RC_FILE_TEMPORARY && s->Simple)
		reg = &s->Temporary[*index];
	else if (*file == RC_FILE_INPUT)
		reg = &s->Input[*index];
	else
		return;

	if (reg->Allocated) {
		*index = reg->Index;
	}
}

/**
 * Callback handed to AllocateHwInputs(): binds input number `input` to the
 * hardware register `hwreg`.  Out-of-range inputs are silently ignored.
 *
 * @param data The struct regalloc_state of the current pass.
 */
static void alloc_input_simple(void * data, unsigned int input,
							unsigned int hwreg)
{
	struct regalloc_state * s = data;

	if (input >= s->NumInputs)
		return;

	s->Input[input].Allocated = 1;
	s->Input[input].File = RC_FILE_TEMPORARY;
	s->Input[input].Index = hwreg;
}

/* This function offsets the temporary register indices by the number
 * of input registers, because input registers are actually temporaries and
 * should not occupy the same space.
 *
 * This pass is supposed to be used to maintain correct allocation of inputs
 * if the standard register allocation is disabled. */
static void do_regalloc_inputs_only(struct regalloc_state * s)
{
	for (unsigned i = 0; i < s->NumTemporaries; i++) {
		s->Temporary[i].Allocated = 1;
		s->Temporary[i].File = RC_FILE_TEMPORARY;
		s->Temporary[i].Index = i + s->NumInputs;
	}
}

/** @return non-zero if op is RC_OPCODE_DDX or RC_OPCODE_DDY. */
static unsigned int is_derivative(rc_opcode op)
{
	return (op == RC_OPCODE_DDX || op == RC_OPCODE_DDY);
}

/**
 * Look up the first class that contains `writemask`.
 *
 * Classes with more than `max_writemask_count` writemasks are skipped, so
 * passing 1 restricts the search to classes that pin the writemask.
 *
 * @param classes Table of RC_REG_CLASS_COUNT classes.
 * @return Index into `classes`, or -1 if no class matches.
 */
static int find_class(
	struct rc_class * classes,
	unsigned int writemask,
	unsigned int max_writemask_count)
{
	unsigned int i;
	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
		unsigned int j;
		if (classes[i].WritemaskCount > max_writemask_count) {
			continue;
		}
		for (j = 0; j < 3; j++) {
			if (classes[i].Writemasks[j] == writemask) {
				return i;
			}
		}
	}
	return -1;
}

/** Userdata passed to variable_get_class_read_cb(). */
struct variable_get_class_cb_data {
	/** Cleared by the callback when a non-native swizzle would result. */
	unsigned int * can_change_writemask;
	/** Swizzle that maps the old writemask onto the candidate one. */
	unsigned int conversion_swizzle;
};

/**
 * Callback for rc_pair_for_all_reads_arg(): clears *can_change_writemask if
 * applying the conversion swizzle to this argument's swizzle produces one
 * that r300_swizzle_is_native_basic() rejects.
 */
static void variable_get_class_read_cb(
	void * userdata,
	struct rc_instruction * inst,
	struct rc_pair_instruction_arg * arg,
	struct rc_pair_instruction_source * src)
{
	struct variable_get_class_cb_data * d = userdata;
	unsigned int new_swizzle = rc_adjust_channels(arg->Swizzle,
							d->conversion_swizzle);
	if (!r300_swizzle_is_native_basic(new_swizzle)) {
		*d->can_change_writemask = 0;
	}
}

/**
 * Pick the register class for a variable (and its friends).
 *
 * If the writemask may be changed by the allocator, the class may hold up to
 * three interchangeable writemasks; otherwise only classes with exactly one
 * writemask are considered.
 *
 * @param classes Table of RC_REG_CLASS_COUNT classes.
 * @return The class of the variable.  If no class matches, an error is
 * reported through rc_error() and 0 is returned.
 */
static enum rc_reg_class variable_get_class(
	struct rc_variable * variable,
	struct rc_class * classes)
{
	unsigned int i;
	unsigned int can_change_writemask= 1;
	unsigned int writemask = rc_variable_writemask_sum(variable);
	struct rc_list * readers = rc_variable_readers_union(variable);
	int class_index;

	if (!variable->C->is_r500) {
		struct rc_class c;
		struct rc_variable * var_ptr;
		/* The assumption here is that if an instruction has type
		 * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
		 * r300 and r400 can't swizzle the result of a TEX lookup. */
		for (var_ptr = variable; var_ptr; var_ptr = var_ptr->Friend) {
			if (var_ptr->Inst->Type == RC_INSTRUCTION_NORMAL) {
				writemask = RC_MASK_XYZW;
			}
		}

		/* Check if it is possible to do swizzle packing for r300/r400
		 * without creating non-native swizzles. */
		class_index = find_class(classes, writemask, 3);
		if (class_index < 0) {
			goto error;
		}
		c = classes[class_index];
		if (c.WritemaskCount == 1) {
			goto done;
		}
		for (i = 0; i < c.WritemaskCount; i++) {
			for (var_ptr = variable; var_ptr;
						var_ptr = var_ptr->Friend) {
				int j;
				unsigned int conversion_swizzle =
						rc_make_conversion_swizzle(
						writemask, c.Writemasks[i]);
				struct variable_get_class_cb_data d;
				d.can_change_writemask = &can_change_writemask;
				d.conversion_swizzle = conversion_swizzle;
				/* If we get this far var_ptr->Inst has to
				 * be a pair instruction.  If variable or any
				 * of its friends are normal instructions,
				 * then the writemask will be set to RC_MASK_XYZW
				 * and the function will return before it gets
				 * here. */
				rc_pair_for_all_reads_arg(var_ptr->Inst,
					variable_get_class_read_cb, &d);

				for (j = 0; j < var_ptr->ReaderCount; j++) {
					unsigned int old_swizzle;
					unsigned int new_swizzle;
					struct rc_reader r = var_ptr->Readers[j];
					if (r.Inst->Type ==
							RC_INSTRUCTION_PAIR ) {
						old_swizzle = r.U.P.Arg->Swizzle;
					} else {
						old_swizzle = r.U.I.Src->Swizzle;
					}
					new_swizzle = rc_adjust_channels(
						old_swizzle, conversion_swizzle);
					if (!r300_swizzle_is_native_basic(
								new_swizzle)) {
						can_change_writemask = 0;
						break;
					}
				}
				if (!can_change_writemask) {
					break;
				}
			}
			if (!can_change_writemask) {
				break;
			}
		}
	}

	if (variable->Inst->Type == RC_INSTRUCTION_PAIR) {
		/* DDX/DDY seem to always fail when their writemasks are
		 * changed.*/
		if (is_derivative(variable->Inst->U.P.RGB.Opcode)
		    || is_derivative(variable->Inst->U.P.Alpha.Opcode)) {
			can_change_writemask = 0;
		}
	}
	for ( ; readers; readers = readers->Next) {
		struct rc_reader * r = readers->Item;
		if (r->Inst->Type == RC_INSTRUCTION_PAIR) {
			if (r->U.P.Arg->Source == RC_PAIR_PRESUB_SRC) {
				can_change_writemask = 0;
				break;
			}
			/* DDX/DDY also fail when their swizzles are changed. */
			if (is_derivative(r->Inst->U.P.RGB.Opcode)
			    || is_derivative(r->Inst->U.P.Alpha.Opcode)) {
				can_change_writemask = 0;
				break;
			}
		}
	}

	class_index = find_class(classes, writemask,
						can_change_writemask ? 3 : 1);
done:
	if (class_index > -1) {
		return classes[class_index].Class;
	}
error:
	rc_error(variable->C,
			"Could not find class for index=%u mask=%u\n",
			variable->Dst.Index, writemask);
	return 0;
}

/**
 * Test two 4-channel live-interval arrays against each other.
 *
 * Every channel of `a` is compared with every channel of `b`, not only with
 * the same channel.
 *
 * @return 1 if any pair of channels overlaps, 0 otherwise.
 */
static unsigned int overlap_live_intervals_array(
	struct live_intervals * a,
	struct live_intervals * b)
{
	unsigned int a_chan, b_chan;
	for (a_chan = 0; a_chan < 4; a_chan++) {
		for (b_chan = 0; b_chan < 4; b_chan++) {
			if (overlap_live_intervals(&a[a_chan], &b[b_chan])) {
					return 1;
			}
		}
	}
	return 0;
}

/** Inverse of get_reg_id(): extract the register index from an ra reg id. */
static unsigned int reg_get_index(int reg)
{
	return reg / RC_MASK_XYZW;
}

/** Inverse of get_reg_id(): extract the writemask from an ra reg id. */
static unsigned int reg_get_writemask(int reg)
{
	return (reg % RC_MASK_XYZW) + 1;
}

/**
 * Encode a (register index, writemask) pair as an ra register id:
 * index * RC_MASK_XYZW + (writemask - 1).
 *
 * An empty writemask has no id; it trips the assert, and 0 is returned when
 * asserts are compiled out.
 */
static int get_reg_id(unsigned int index, unsigned int writemask)
{
	assert(writemask);
	if (writemask == 0) {
		return 0;
	}
	return (index * RC_MASK_XYZW) + (writemask - 1);
}

#if VERBOSE
/** Debug helper: print an ra register id as "Temp[index].xyzw". */
static void print_reg(int reg)
{
	unsigned int index = reg_get_index(reg);
	unsigned int mask = reg_get_writemask(reg);
	fprintf(stderr, "Temp[%u].%c%c%c%c", index,
		mask & RC_MASK_X ? 'x' : '_',
		mask & RC_MASK_Y ? 'y' : '_',
		mask & RC_MASK_Z ? 'z' : '_',
		mask & RC_MASK_W ? 'w' : '_');
}
#endif

/**
 * For every register index below max_temp_regs, declare a conflict between
 * each pair of distinct writemasks that share at least one channel.
 */
static void add_register_conflicts(
	struct ra_regs * regs,
	unsigned int max_temp_regs)
{
	unsigned int index, a_mask, b_mask;
	for (index = 0; index < max_temp_regs; index++) {
		for(a_mask = 1; a_mask <= RC_MASK_XYZW; a_mask++) {
			for (b_mask = a_mask + 1; b_mask <= RC_MASK_XYZW;
								b_mask++) {
				if (a_mask & b_mask) {
					ra_add_reg_conflict(regs,
						get_reg_id(index, a_mask),
						get_reg_id(index, b_mask));
				}
			}
		}
	}
}

/**
 * Full register allocation.
 *
 * Builds a register set with one ra register per (index, writemask) pair,
 * assigns every program variable to a register class, builds the
 * interference graph from the live intervals of variables and inputs, runs
 * the allocator and rewrites the destination of every variable.
 *
 * Inputs get a private class containing only the register already chosen for
 * them, so they stay where AllocateHwInputs() put them.
 *
 * If the allocator fails, an error is reported through rc_error() and the
 * program is left unmodified by this function.
 */
static void do_advanced_regalloc(struct regalloc_state * s)
{
	/* Must be listed in enum rc_reg_class order: the value returned by
	 * variable_get_class() is used as an index into this table. */
	struct rc_class rc_class_list [] = {
		{RC_REG_CLASS_SINGLE, 3, 0, 0,
			{RC_MASK_X,
			 RC_MASK_Y,
			 RC_MASK_Z}},
		{RC_REG_CLASS_DOUBLE, 3, 0, 0,
			{RC_MASK_X | RC_MASK_Y,
			 RC_MASK_X | RC_MASK_Z,
			 RC_MASK_Y | RC_MASK_Z}},
		{RC_REG_CLASS_TRIPLE, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Y | RC_MASK_Z,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_ALPHA, 1, 0, 0,
			{RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_SINGLE_PLUS_ALPHA, 3, 0, 0,
			{RC_MASK_X | RC_MASK_W,
			 RC_MASK_Y | RC_MASK_W,
			 RC_MASK_Z | RC_MASK_W}},
		{RC_REG_CLASS_DOUBLE_PLUS_ALPHA, 3, 0, 0,
			{RC_MASK_X | RC_MASK_Y | RC_MASK_W,
			 RC_MASK_X | RC_MASK_Z | RC_MASK_W,
			 RC_MASK_Y | RC_MASK_Z | RC_MASK_W}},
		{RC_REG_CLASS_TRIPLE_PLUS_ALPHA, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_X, 1, 0, 0,
			{RC_MASK_X,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_Y, 1, 0, 0,
			{RC_MASK_Y,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_Z, 1, 0, 0,
			{RC_MASK_Z,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_XY, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Y,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_YZ, 1, 0, 0,
			{RC_MASK_Y | RC_MASK_Z,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_XZ, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Z,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_XW, 1, 0, 0,
			{RC_MASK_X | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_YW, 1, 0, 0,
			{RC_MASK_Y | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_ZW, 1, 0, 0,
			{RC_MASK_Z | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_XYW, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Y | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_YZW, 1, 0, 0,
			{RC_MASK_Y | RC_MASK_Z | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}},
		{RC_REG_CLASS_XZW, 1, 0, 0,
			{RC_MASK_X | RC_MASK_Z | RC_MASK_W,
			 RC_MASK_NONE,
			 RC_MASK_NONE}}
	};

	unsigned int i, j, index, input_node, node_count, node_index;
	unsigned int * node_classes;
	unsigned int * input_classes;
	struct rc_instruction * inst;
	struct rc_list * var_ptr;
	struct rc_list * variables;
	struct ra_regs * regs;
	struct ra_graph * graph;

	/* Allocate the main ra data structure */
	regs = ra_alloc_reg_set(NULL, s->C->max_temp_regs * RC_MASK_XYZW);

	/* Get list of program variables */
	variables = rc_get_variables(s->C);
	node_count = rc_list_count(variables);
	node_classes = memory_pool_malloc(&s->C->Pool,
			node_count * sizeof(unsigned int));
	input_classes = memory_pool_malloc(&s->C->Pool,
			s->NumInputs * sizeof(unsigned int));

	for (var_ptr = variables, node_index = 0; var_ptr;
					var_ptr = var_ptr->Next, node_index++) {
		unsigned int class_index;
		/* Compute the live intervals */
		rc_variable_compute_live_intervals(var_ptr->Item);

		class_index = variable_get_class(var_ptr->Item, rc_class_list);

		/* If we haven't used this register class yet, mark it
		 * as used and allocate space for it. */
		if (!rc_class_list[class_index].Used) {
			rc_class_list[class_index].Used = 1;
			rc_class_list[class_index].Id = ra_alloc_reg_class(regs);
		}

		node_classes[node_index] = rc_class_list[class_index].Id;
	}

	/* Assign registers to the classes */
	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
		struct rc_class class = rc_class_list[i];
		if (!class.Used) {
			continue;
		}

		for (index = 0; index < s->C->max_temp_regs; index++) {
			for (j = 0; j < class.WritemaskCount; j++) {
				int reg_id = get_reg_id(index,
							class.Writemasks[j]);
				ra_class_add_reg(regs, class.Id, reg_id);
			}
		}
	}

	/* Add register conflicts */
	add_register_conflicts(regs, s->C->max_temp_regs);

	/* Calculate live intervals for input registers */
	for (inst = s->C->Program.Instructions.Next;
					inst != &s->C->Program.Instructions;
					inst = inst->Next) {
		rc_opcode op = rc_get_flow_control_inst(inst);
		if (op == RC_OPCODE_BGNLOOP) {
			struct rc_instruction * endloop =
							rc_match_bgnloop(inst);
			if (endloop->IP > s->LoopEnd) {
				s->LoopEnd = endloop->IP;
			}
		}
		rc_for_all_reads_mask(inst, scan_read_callback, s);
	}

	/* Create classes for input registers */
	for (i = 0; i < s->NumInputs; i++) {
		unsigned int chan, class_id, writemask = 0;
		for (chan = 0; chan < 4; chan++) {
			if (s->Input[i].Live[chan].Used) {
				writemask |= (1 << chan);
			}
		}
		s->Input[i].Writemask = writemask;
		if (!writemask) {
			continue;
		}

		class_id = ra_alloc_reg_class(regs);
		input_classes[i] = class_id;
		ra_class_add_reg(regs, class_id,
				get_reg_id(s->Input[i].Index, writemask));
	}

	ra_set_finalize(regs);

	graph = ra_alloc_interference_graph(regs, node_count + s->NumInputs);

	/* Build the interference graph */
	for (var_ptr = variables, node_index = 0; var_ptr;
					var_ptr = var_ptr->Next,node_index++) {
		struct rc_list * a, * b;
		unsigned int b_index;

		ra_set_node_class(graph, node_index, node_classes[node_index]);

		for (a = var_ptr, b = var_ptr->Next, b_index = node_index + 1;
						b; b = b->Next, b_index++) {
			struct rc_variable * var_a = a->Item;
			while (var_a) {
				struct rc_variable * var_b = b->Item;
				while (var_b) {
					if (overlap_live_intervals_array(var_a->Live, var_b->Live)) {
						ra_add_node_interference(graph,
							node_index, b_index);
					}
					var_b = var_b->Friend;
				}
				var_a = var_a->Friend;
			}
		}
	}

	/* Add input registers to the interference graph */
	for (i = 0, input_node = 0; i< s->NumInputs; i++) {
		if (!s->Input[i].Writemask) {
			continue;
		}
		ra_set_node_class(graph, node_count + input_node,
							input_classes[i]);
		for (var_ptr = variables, node_index = 0;
				var_ptr; var_ptr = var_ptr->Next, node_index++) {
			struct rc_variable * var = var_ptr->Item;
			if (overlap_live_intervals_array(s->Input[i].Live,
								var->Live)) {
				ra_add_node_interference(graph, node_index,
						node_count + input_node);
			}
		}
		/* Manually allocate a register for this input */
		ra_set_node_reg(graph, node_count + input_node, get_reg_id(
				s->Input[i].Index, s->Input[i].Writemask));
		input_node++;
	}

	if (!ra_allocate_no_spills(graph)) {
		rc_error(s->C, "Ran out of hardware temporaries\n");
		/* Still release the graph and the register set below. */
		goto cleanup;
	}

	/* Rewrite the registers */
	for (var_ptr = variables, node_index = 0; var_ptr;
				var_ptr = var_ptr->Next, node_index++) {
		int reg = ra_get_node_reg(graph, node_index);
		unsigned int writemask = reg_get_writemask(reg);
		unsigned int index = reg_get_index(reg);
		struct rc_variable * var = var_ptr->Item;

		/* Normal (non-pair) instructions keep their own writemask on
		 * pre-r500 hardware; only the register index is taken from
		 * the allocator. */
		if (!s->C->is_r500 && var->Inst->Type == RC_INSTRUCTION_NORMAL) {
			writemask = rc_variable_writemask_sum(var);
		}

		if (var->Dst.File == RC_FILE_INPUT) {
			continue;
		}
		rc_variable_change_dst(var, index, writemask);
	}

cleanup:
	ralloc_free(graph);
	ralloc_free(regs);
}

/**
 * @param user This parameter should be a pointer to an integer value.  If this
 * integer value is zero, then a simple register allocator will be used that
 * only allocates space for input registers (\sa do_regalloc_inputs_only).  If
 * user is non-zero, then the regular register allocator will be used
 * (\sa do_advanced_regalloc).
 */
void rc_pair_regalloc(struct radeon_compiler *cc, void *user)
{
	struct r300_fragment_program_compiler *c =
				(struct r300_fragment_program_compiler*)cc;
	struct regalloc_state s;
	int * do_full_regalloc = (int*)user;

	memset(&s, 0, sizeof(s));
	s.C = cc;
	s.NumInputs = rc_get_max_index(cc, RC_FILE_INPUT) + 1;
	s.Input = memory_pool_malloc(&cc->Pool,
			s.NumInputs * sizeof(struct register_info));
	memset(s.Input, 0, s.NumInputs * sizeof(struct register_info));

	s.NumTemporaries = rc_get_max_index(cc, RC_FILE_TEMPORARY) + 1;
	s.Temporary = memory_pool_malloc(&cc->Pool,
			s.NumTemporaries * sizeof(struct register_info));
	memset(s.Temporary, 0, s.NumTemporaries * sizeof(struct register_info));

	rc_recompute_ips(s.C);

	c->AllocateHwInputs(c, &alloc_input_simple, &s);
	if (*do_full_regalloc) {
		do_advanced_regalloc(&s);
	} else {
		s.Simple = 1;
		do_regalloc_inputs_only(&s);
	}

	/* Rewrite inputs and if we are doing the simple allocation, rewrite
	 * temporaries too. */
	for (struct rc_instruction *inst = s.C->Program.Instructions.Next;
					inst != &s.C->Program.Instructions;
					inst = inst->Next) {
		rc_remap_registers(inst, &remap_register, &s);
	}
}