/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdbool.h>
#include "util/ralloc.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"

/* Encodes a source mux selector into the mux field named by "muxfield".
 *
 * QPU_MUX_SMALL_IMM is not written to the field as-is: it is encoded as
 * QPU_MUX_B instead.  "mux" is parenthesized so that compound expressions
 * (e.g. a ternary) are compared as a whole; note that it is still evaluated
 * twice, so it must not have side effects.  "muxfield" is a field name that
 * is handed to QPU_SET_FIELD untouched.
 */
#define QPU_MUX(mux, muxfield)                                          \
        QPU_SET_FIELD((mux) != QPU_MUX_SMALL_IMM ? (mux) : QPU_MUX_B,   \
                      muxfield)

/**
 * Records the read address needed by source operand "src" in "inst".
 *
 * - QPU_MUX_A / QPU_MUX_B: the matching raddr field is set to src.addr.  The
 *   field must currently be QPU_R_NOP or already hold that address, and a
 *   regfile B read may not be combined with the small-immediate signal.
 * - QPU_MUX_SMALL_IMM: the instruction is switched to QPU_SIG_SMALL_IMM (if
 *   it does not carry that signal yet) and src.addr is stored in raddr B.
 * - Any other mux leaves the instruction untouched.
 *
 * Returns the updated instruction.
 */
static uint64_t
set_src_raddr(uint64_t inst, struct qpu_reg src)
{
        switch (src.mux) {
        case QPU_MUX_A:
                assert(QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_NOP ||
                       QPU_GET_FIELD(inst, QPU_RADDR_A) == src.addr);
                inst = QPU_UPDATE_FIELD(inst, src.addr, QPU_RADDR_A);
                break;

        case QPU_MUX_B:
                assert((QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP ||
                        QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr) &&
                       QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM);
                inst = QPU_UPDATE_FIELD(inst, src.addr, QPU_RADDR_B);
                break;

        case QPU_MUX_SMALL_IMM:
                if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) {
                        /* First small immediate on this instruction: claim
                         * the signal, and raddr B must still be free.
                         */
                        inst = qpu_set_sig(inst, QPU_SIG_SMALL_IMM);
                        assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP);
                } else {
                        /* A small immediate is already present; it has to be
                         * the same one.
                         */
                        assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr);
                }
                inst = QPU_UPDATE_FIELD(inst, src.addr, QPU_RADDR_B);
                break;

        default:
                /* No raddr needed for the remaining mux values. */
                break;
        }

        return inst;
}

/**
 * Builds an instruction whose ADD and MUL ops are both NOP, with both write
 * addresses, both read addresses and the signal field set to their
 * NOP/NONE encodings.
 *
 * Takes no arguments; declared with (void) so that the definition is a real
 * prototype rather than an old-style unspecified parameter list.
 */
uint64_t
qpu_NOP(void)
{
        uint64_t inst = 0;

        inst |= QPU_SET_FIELD(QPU_A_NOP, QPU_OP_ADD);
        inst |= QPU_SET_FIELD(QPU_M_NOP, QPU_OP_MUL);

        /* Note: These field values are actually non-zero */
        inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD);
        inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL);
        inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A);
        inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B);
        inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG);

        return inst;
}

/**
 * Returns the destination bits (QPU_WADDR_ADD, plus QPU_WS when needed) for
 * an ADD-unit write to "dst".
 */
static uint64_t
qpu_a_dst(struct qpu_reg dst)
{
        /* Muxes up to R5 are translated to the ACCn write addresses, which
         * start at 32.
         */
        if (dst.mux <= QPU_MUX_R5)
                return QPU_SET_FIELD(32 + dst.mux, QPU_WADDR_ADD);

        uint64_t bits = QPU_SET_FIELD(dst.addr, QPU_WADDR_ADD);

        /* The ADD unit needs the WS bit to target regfile B. */
        return dst.mux == QPU_MUX_B ? (bits | QPU_WS) : bits;
}

/**
 * Returns the destination bits (QPU_WADDR_MUL, plus QPU_WS when needed) for
 * a MUL-unit write to "dst".
 */
static uint64_t
qpu_m_dst(struct qpu_reg dst)
{
        /* Muxes up to R5 are translated to the ACCn write addresses, which
         * start at 32.
         */
        if (dst.mux <= QPU_MUX_R5)
                return QPU_SET_FIELD(32 + dst.mux, QPU_WADDR_MUL);

        uint64_t bits = QPU_SET_FIELD(dst.addr, QPU_WADDR_MUL);

        /* Mirror image of the ADD case: the MUL unit needs the WS bit to
         * target regfile A.
         */
        return dst.mux == QPU_MUX_A ? (bits | QPU_WS) : bits;
}

/**
 * Builds an ADD-unit move of "src" into "dst".
 *
 * The move is encoded as QPU_A_OR with the same source on both inputs,
 * executed unconditionally; the MUL unit's write address is set to NOP.
 */
uint64_t
qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src)
{
        /* Idle parts of the instruction: no signal, no reads yet, and no MUL
         * write.
         */
        uint64_t inst = (QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B) |
                         QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));

        /* The ADD operation itself and where it writes. */
        inst |= (QPU_SET_FIELD(QPU_A_OR, QPU_OP_ADD) |
                 QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD));
        inst |= qpu_a_dst(dst);

        /* Both ALU inputs read the same source. */
        inst |= (QPU_MUX(src.mux, QPU_ADD_A) |
                 QPU_MUX(src.mux, QPU_ADD_B));

        return set_src_raddr(inst, src);
}

/**
 * Builds a MUL-unit move of "src" into "dst".
 *
 * The move is encoded as QPU_M_V8MIN with the same source on both inputs,
 * executed unconditionally; the ADD unit's write address is set to NOP.
 */
uint64_t
qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src)
{
        /* Idle parts of the instruction: no signal, no reads yet, and no ADD
         * write.
         */
        uint64_t inst = (QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B) |
                         QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));

        /* The MUL operation itself and where it writes. */
        inst |= (QPU_SET_FIELD(QPU_M_V8MIN, QPU_OP_MUL) |
                 QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL));
        inst |= qpu_m_dst(dst);

        /* Both ALU inputs read the same source. */
        inst |= (QPU_MUX(src.mux, QPU_MUL_A) |
                 QPU_MUX(src.mux, QPU_MUL_B));

        return set_src_raddr(inst, src);
}

/**
 * Builds a load-immediate instruction that writes "val" to "dst".
 *
 * The 32-bit value occupies the low bits of the instruction word.  The
 * destination is encoded through the ADD write address, the MUL write
 * address is NOP, and both condition fields are QPU_COND_ALWAYS.
 */
uint64_t
qpu_load_imm_ui(struct qpu_reg dst, uint32_t val)
{
        /* Zero-extended immediate in the low 32 bits. */
        uint64_t inst = val;

        inst |= QPU_SET_FIELD(QPU_SIG_LOAD_IMM, QPU_SIG);
        inst |= (QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD) |
                 QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL));
        inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL);
        inst |= qpu_a_dst(dst);

        return inst;
}

/**
 * Same as qpu_load_imm_ui(), with the load-immediate mode field set to
 * QPU_LOAD_IMM_MODE_U2.
 */
uint64_t
qpu_load_imm_u2(struct qpu_reg dst, uint32_t val)
{
        uint64_t inst = qpu_load_imm_ui(dst, val);

        inst |= QPU_SET_FIELD(QPU_LOAD_IMM_MODE_U2, QPU_LOAD_IMM_MODE);

        return inst;
}

/**
 * Same as qpu_load_imm_ui(), with the load-immediate mode field set to
 * QPU_LOAD_IMM_MODE_I2.
 */
uint64_t
qpu_load_imm_i2(struct qpu_reg dst, uint32_t val)
{
        uint64_t inst = qpu_load_imm_ui(dst, val);

        inst |= QPU_SET_FIELD(QPU_LOAD_IMM_MODE_I2, QPU_LOAD_IMM_MODE);

        return inst;
}

/**
 * Builds a branch instruction with branch condition "cond" and the given
 * "target" field value.
 *
 * Both write destinations are encoded from QPU_W_NOP registers, so the
 * branch itself stores nothing.
 */
uint64_t
qpu_branch(uint32_t cond, uint32_t target)
{
        /* Destination bits for both units, built from NOP registers. */
        const uint64_t dst_bits = (qpu_a_dst(qpu_ra(QPU_W_NOP)) |
                                   qpu_m_dst(qpu_rb(QPU_W_NOP)));

        uint64_t inst = dst_bits;

        inst |= QPU_SET_FIELD(QPU_SIG_BRANCH, QPU_SIG);
        inst |= QPU_SET_FIELD(cond, QPU_BRANCH_COND);
        inst |= QPU_SET_FIELD(target, QPU_BRANCH_TARGET);

        return inst;
}

/**
 * Builds a two-source ADD-unit instruction: dst = op(src0, src1).
 *
 * The instruction executes unconditionally, carries no signal (unless a
 * small-immediate source adds one) and leaves the MUL write address at NOP.
 */
uint64_t
qpu_a_alu2(enum qpu_op_add op,
           struct qpu_reg dst, struct qpu_reg src0, struct qpu_reg src1)
{
        /* Idle parts of the instruction: no signal, no reads yet, and no MUL
         * write.
         */
        uint64_t inst = (QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B) |
                         QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));

        /* The ADD operation itself and where it writes. */
        inst |= (QPU_SET_FIELD(op, QPU_OP_ADD) |
                 QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD));
        inst |= qpu_a_dst(dst);

        /* Input selection, then the read addresses backing each input. */
        inst |= (QPU_MUX(src0.mux, QPU_ADD_A) |
                 QPU_MUX(src1.mux, QPU_ADD_B));
        inst = set_src_raddr(inst, src0);
        inst = set_src_raddr(inst, src1);

        return inst;
}

/**
 * Builds a two-source MUL-unit instruction: dst = op(src0, src1).
 *
 * The instruction executes unconditionally, carries no signal (unless a
 * small-immediate source adds one) and leaves the ADD write address at NOP.
 */
uint64_t
qpu_m_alu2(enum qpu_op_mul op,
           struct qpu_reg dst, struct qpu_reg src0, struct qpu_reg src1)
{
        /* Idle parts of the instruction: no signal, no reads yet, and no ADD
         * write.
         */
        uint64_t inst = (QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A) |
                         QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B) |
                         QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));

        /* The MUL operation itself and where it writes. */
        inst |= (QPU_SET_FIELD(op, QPU_OP_MUL) |
                 QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL));
        inst |= qpu_m_dst(dst);

        /* Input selection, then the read addresses backing each input. */
        inst |= (QPU_MUX(src0.mux, QPU_MUL_A) |
                 QPU_MUX(src1.mux, QPU_MUL_B));
        inst = set_src_raddr(inst, src0);
        inst = set_src_raddr(inst, src1);

        return inst;
}

/**
 * Builds a MUL-unit move of "src0" into "dst" that carries a rotate request.
 *
 * The move is a QPU_M_V8MIN of src0 with itself; the signal is then forced
 * to QPU_SIG_SMALL_IMM and the small-immediate field is set to
 * QPU_SMALL_IMM_MUL_ROT + rot.
 */
uint64_t
qpu_m_rot(struct qpu_reg dst, struct qpu_reg src0, int rot)
{
        uint64_t inst = qpu_m_alu2(QPU_M_V8MIN, dst, src0, src0);

        inst = QPU_UPDATE_FIELD(inst, QPU_SIG_SMALL_IMM, QPU_SIG);

        return QPU_UPDATE_FIELD(inst, QPU_SMALL_IMM_MUL_ROT + rot,
                                QPU_SMALL_IMM);
}

/**
 * Merges one bitfield of instructions "a" and "b" into "*merge".
 *
 * "mask" selects the field and "ignore" is the (already shifted) field value
 * that means "don't care".  If either side holds the ignore value, the other
 * side's field is written into *merge.  Otherwise the two fields must be
 * equal, and *merge is left as it was.
 *
 * Returns false only when both sides hold different, non-ignored values.
 */
static bool
merge_fields(uint64_t *merge,
             uint64_t a, uint64_t b,
             uint64_t mask, uint64_t ignore)
{
        const uint64_t field_a = a & mask;
        const uint64_t field_b = b & mask;
        uint64_t winner;

        if (field_a == ignore)
                winner = field_b;
        else if (field_b == ignore)
                winner = field_a;
        else
                return field_a == field_b;

        *merge = (*merge & ~mask) | winner;

        return true;
}

/**
 * Counts the accesses in "inst" that this file treats as special:
 *
 * - each of the two write addresses that matches one of the TLB, TMU or SFU
 *   write addresses in the table below,
 * - a read of QPU_R_MUTEX_ACQUIRE through raddr A, or through raddr B when
 *   the instruction is not using the small-immediate signal,
 * - a color-load or TMU-load signal.
 *
 * Returns the total number of such accesses.
 */
int
qpu_num_sf_accesses(uint64_t inst)
{
        static const uint32_t specials[] = {
                QPU_W_TLB_COLOR_MS,
                QPU_W_TLB_COLOR_ALL,
                QPU_W_TLB_Z,
                QPU_W_TMU0_S,
                QPU_W_TMU0_T,
                QPU_W_TMU0_R,
                QPU_W_TMU0_B,
                QPU_W_TMU1_S,
                QPU_W_TMU1_T,
                QPU_W_TMU1_R,
                QPU_W_TMU1_B,
                QPU_W_SFU_RECIP,
                QPU_W_SFU_RECIPSQRT,
                QPU_W_SFU_EXP,
                QPU_W_SFU_LOG,
        };
        const uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
        const uint32_t add_dst = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
        const uint32_t mul_dst = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
        int count = 0;

        /* Writes: each unit's destination is checked independently. */
        for (unsigned k = 0; k < ARRAY_SIZE(specials); k++) {
                count += (add_dst == specials[k]);
                count += (mul_dst == specials[k]);
        }

        /* Reads: raddr B only names a register when the instruction is not
         * using the small-immediate signal.
         */
        count += (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_MUTEX_ACQUIRE);
        if (sig != QPU_SIG_SMALL_IMM &&
            QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_MUTEX_ACQUIRE)
                count++;

        /* XXX: semaphore, combined color read/write? */
        if (sig == QPU_SIG_COLOR_LOAD ||
            sig == QPU_SIG_COLOR_LOAD_END ||
            sig == QPU_SIG_LOAD_TMU0 ||
            sig == QPU_SIG_LOAD_TMU1)
                count++;

        return count;
}

/**
 * Returns true if "waddr" is one of the write addresses for which this file
 * treats the WS bit as irrelevant: accumulators 0-3, NOP, the TLB, VPM, SFU
 * and TMU write addresses listed below.  Every other address returns false.
 */
static bool
qpu_waddr_ignores_ws(uint32_t waddr)
{
        switch (waddr) {
        /* Accumulators and the NOP address. */
        case QPU_W_ACC0:
        case QPU_W_ACC1:
        case QPU_W_ACC2:
        case QPU_W_ACC3:
        case QPU_W_NOP:
        /* TLB and VPM. */
        case QPU_W_TLB_Z:
        case QPU_W_TLB_COLOR_MS:
        case QPU_W_TLB_COLOR_ALL:
        case QPU_W_TLB_ALPHA_MASK:
        case QPU_W_VPM:
        /* SFU. */
        case QPU_W_SFU_RECIP:
        case QPU_W_SFU_RECIPSQRT:
        case QPU_W_SFU_EXP:
        case QPU_W_SFU_LOG:
        /* TMU0 and TMU1. */
        case QPU_W_TMU0_S:
        case QPU_W_TMU0_T:
        case QPU_W_TMU0_R:
        case QPU_W_TMU0_B:
        case QPU_W_TMU1_S:
        case QPU_W_TMU1_T:
        case QPU_W_TMU1_R:
        case QPU_W_TMU1_B:
                return true;
        default:
                return false;
        }
}

/**
 * If the 3-bit mux field at "mux_shift" in "*a" selects QPU_MUX_A, rewrites
 * that field to QPU_MUX_B in both "*a" and "*merge".  Otherwise does nothing.
 */
static void
swap_ra_file_mux_helper(uint64_t *merge, uint64_t *a, uint32_t mux_shift)
{
        const uint64_t field = (uint64_t)0x7 << mux_shift;
        const uint64_t reads_file_a = (uint64_t)QPU_MUX_A << mux_shift;
        const uint64_t reads_file_b = (uint64_t)QPU_MUX_B << mux_shift;

        if ((*a & field) != reads_file_a)
                return;

        *a = (*a & ~field) | reads_file_b;
        *merge = (*merge & ~field) | reads_file_b;
}

/**
 * Tries to resolve a raddr A conflict between "a" and "b" by moving a's
 * regfile A read over to raddr B.
 *
 * The swap is only attempted when a's raddr A is QPU_R_UNIF or QPU_R_VARY,
 * a's raddr B is unused, and b's raddr B is either unused or already reads
 * the same address.  It is refused when the merged instruction has PM clear
 * and a non-NOP unpack.
 *
 * On success "*a" is rewritten to read through raddr B, "*merge" gets b's
 * raddr A and a's former raddr A as raddr B, and every mux in "*a" that
 * selected regfile A is switched to regfile B in both "*a" and "*merge".
 *
 * Returns true if the swap was performed; on false nothing is modified.
 */
static bool
try_swap_ra_file(uint64_t *merge, uint64_t *a, uint64_t *b)
{
        const uint32_t a_file_a = QPU_GET_FIELD(*a, QPU_RADDR_A);
        const uint32_t a_file_b = QPU_GET_FIELD(*a, QPU_RADDR_B);
        const uint32_t b_file_a = QPU_GET_FIELD(*b, QPU_RADDR_A);
        const uint32_t b_file_b = QPU_GET_FIELD(*b, QPU_RADDR_B);

        /* a's raddr B slot has to be free to receive the moved read. */
        if (a_file_b != QPU_R_NOP)
                return false;

        /* Only uniform and varying reads are relocated. */
        if (a_file_a != QPU_R_UNIF && a_file_a != QPU_R_VARY)
                return false;

        /* NOTE(review): presumably a non-PM unpack applies to the regfile A
         * read, so moving the read would change what gets unpacked -- confirm
         * against the hardware documentation.
         */
        if (!(*merge & QPU_PM) &&
            QPU_GET_FIELD(*merge, QPU_UNPACK) != QPU_UNPACK_NOP)
                return false;

        /* b's raddr B must not conflict with the read being moved there. */
        if (b_file_b != QPU_R_NOP && b_file_b != a_file_a)
                return false;

        /* Move raddr A to B in instruction a. */
        *a = QPU_UPDATE_FIELD(*a, QPU_R_NOP, QPU_RADDR_A);
        *a = QPU_UPDATE_FIELD(*a, a_file_a, QPU_RADDR_B);

        /* The merged instruction reads b's value through A and a's through
         * B.
         */
        *merge = QPU_UPDATE_FIELD(*merge, b_file_a, QPU_RADDR_A);
        *merge = QPU_UPDATE_FIELD(*merge, a_file_a, QPU_RADDR_B);

        /* Redirect every input of a that was reading regfile A. */
        swap_ra_file_mux_helper(merge, a, QPU_ADD_A_SHIFT);
        swap_ra_file_mux_helper(merge, a, QPU_ADD_B_SHIFT);
        swap_ra_file_mux_helper(merge, a, QPU_MUL_A_SHIFT);
        swap_ra_file_mux_helper(merge, a, QPU_MUL_B_SHIFT);

        return true;
}

/**
 * Tries to rewrite an ADD-unit MOV in "*inst" as the equivalent MUL-unit MOV.
 *
 * The instruction qualifies when its ADD op is QPU_A_OR with the same mux on
 * both inputs, it carries no signal, and the PM bit is clear.  On success
 * the ADD side becomes a NOP (op NOP, inputs R0, write address NOP,
 * condition NEVER), the MUL side takes over the source mux, write address
 * and condition as a QPU_M_V8MIN, and WS is flipped unless the destination
 * ignores it.
 *
 * Returns true if "*inst" was converted; on false it is left unchanged.
 */
static bool
convert_mov(uint64_t *inst)
{
        uint64_t mov = *inst;
        const uint32_t src_mux = QPU_GET_FIELD(mov, QPU_ADD_A);
        const uint32_t dst_waddr = QPU_GET_FIELD(mov, QPU_WADDR_ADD);
        const uint32_t cond = QPU_GET_FIELD(mov, QPU_COND_ADD);

        /* Is it a MOV? */
        if (QPU_GET_FIELD(mov, QPU_OP_ADD) != QPU_A_OR)
                return false;
        if (src_mux != QPU_GET_FIELD(mov, QPU_ADD_B))
                return false;

        if (QPU_GET_FIELD(mov, QPU_SIG) != QPU_SIG_NONE)
                return false;

        /* We could maybe support this in the .8888 and .8a-.8d cases. */
        if (mov & QPU_PM)
                return false;

        /* Operation: ADD goes idle, MUL performs the move. */
        mov = QPU_UPDATE_FIELD(mov, QPU_A_NOP, QPU_OP_ADD);
        mov = QPU_UPDATE_FIELD(mov, QPU_M_V8MIN, QPU_OP_MUL);

        /* Inputs: MUL reads the old ADD source on both sides. */
        mov = QPU_UPDATE_FIELD(mov, src_mux, QPU_MUL_A);
        mov = QPU_UPDATE_FIELD(mov, src_mux, QPU_MUL_B);
        mov = QPU_UPDATE_FIELD(mov, QPU_MUX_R0, QPU_ADD_A);
        mov = QPU_UPDATE_FIELD(mov, QPU_MUX_R0, QPU_ADD_B);

        /* Destination and condition follow the operation to the MUL unit. */
        mov = QPU_UPDATE_FIELD(mov, dst_waddr, QPU_WADDR_MUL);
        mov = QPU_UPDATE_FIELD(mov, QPU_W_NOP, QPU_WADDR_ADD);
        mov = QPU_UPDATE_FIELD(mov, cond, QPU_COND_MUL);
        mov = QPU_UPDATE_FIELD(mov, QPU_COND_NEVER, QPU_COND_ADD);

        /* The write now comes from the other unit, so the WS bit has to be
         * inverted for destinations that depend on it.
         */
        if (!qpu_waddr_ignores_ws(dst_waddr))
                mov ^= QPU_WS;

        *inst = mov;

        return true;
}

/**
 * Returns true if the write address selected by the WS bit is below 32: the
 * ADD write address when WS is clear, the MUL write address when WS is set.
 */
static bool
writes_a_file(uint64_t inst)
{
        const uint32_t waddr = (inst & QPU_WS) ?
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
                QPU_GET_FIELD(inst, QPU_WADDR_ADD);

        return waddr < 32;
}

/**
 * Returns true if any of the four ALU input muxes of "inst" (ADD A/B,
 * MUL A/B) selects QPU_MUX_R4.
 */
static bool
reads_r4(uint64_t inst)
{
        if (QPU_GET_FIELD(inst, QPU_ADD_A) == QPU_MUX_R4)
                return true;
        if (QPU_GET_FIELD(inst, QPU_ADD_B) == QPU_MUX_R4)
                return true;
        if (QPU_GET_FIELD(inst, QPU_MUL_A) == QPU_MUX_R4)
                return true;

        return QPU_GET_FIELD(inst, QPU_MUL_B) == QPU_MUX_R4;
}

/**
 * Tries to combine instructions "a" and "b" into a single instruction.
 *
 * The merge starts from the bitwise OR of the two encodings and then
 * reconciles each field in turn, bailing out as soon as the two
 * instructions are found to need the same resource in incompatible ways.
 *
 * Returns the merged instruction, or 0 if the two cannot be merged.
 */
uint64_t
qpu_merge_inst(uint64_t a, uint64_t b)
{
        uint64_t merge = a | b;
        bool ok = true;
        uint32_t a_sig = QPU_GET_FIELD(a, QPU_SIG);
        uint32_t b_sig = QPU_GET_FIELD(b, QPU_SIG);

        /* Both instructions want the ADD unit.  That only works if neither
         * uses the MUL unit and one of them is a MOV that convert_mov() can
         * move over to the MUL unit.
         */
        if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP &&
            QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP) {
                if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP ||
                    QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP ||
                    !(convert_mov(&a) || convert_mov(&b))) {
                        return 0;
                } else {
                        /* One input was rewritten, so redo the initial OR. */
                        merge = a | b;
                }
        }

        /* Both instructions want the MUL unit. */
        if (QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP &&
            QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
                return 0;

        /* Refuse to merge when both sides have at least one access counted
         * by qpu_num_sf_accesses().
         */
        if (qpu_num_sf_accesses(a) && qpu_num_sf_accesses(b))
                return 0;

        /* Load-immediate, small-immediate and branch instructions are never
         * merged with anything.
         */
        if (a_sig == QPU_SIG_LOAD_IMM ||
            b_sig == QPU_SIG_LOAD_IMM ||
            a_sig == QPU_SIG_SMALL_IMM ||
            b_sig == QPU_SIG_SMALL_IMM ||
            a_sig == QPU_SIG_BRANCH ||
            b_sig == QPU_SIG_BRANCH) {
                return 0;
        }

        /* From here on, failures of the simple field merges are accumulated
         * in "ok" and reported at the end; the more involved checks return 0
         * directly.
         */
        ok = ok && merge_fields(&merge, a, b, QPU_SIG_MASK,
                                QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG));

        /* Misc fields that have to match exactly. */
        ok = ok && merge_fields(&merge, a, b, QPU_SF, ~0);

        if (!merge_fields(&merge, a, b, QPU_RADDR_A_MASK,
                          QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A))) {
                /* Since we tend to use regfile A by default both for register
                 * allocation and for our special values (uniforms and
                 * varyings), try swapping uniforms and varyings to regfile B
                 * to resolve raddr A conflicts.
                 */
                if (!try_swap_ra_file(&merge, &a, &b) &&
                    !try_swap_ra_file(&merge, &b, &a)) {
                        return 0;
                }
        }

        ok = ok && merge_fields(&merge, a, b, QPU_RADDR_B_MASK,
                                QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B));

        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_ADD_MASK,
                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD));
        ok = ok && merge_fields(&merge, a, b, QPU_WADDR_MUL_MASK,
                                QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL));

        /* Allow disagreement on WS (swapping A vs B physical reg file as the
         * destination for ADD/MUL) if one of the original instructions
         * ignores it (probably because it's just writing to accumulators).
         */
        if (qpu_waddr_ignores_ws(QPU_GET_FIELD(a, QPU_WADDR_ADD)) &&
            qpu_waddr_ignores_ws(QPU_GET_FIELD(a, QPU_WADDR_MUL))) {
                merge = (merge & ~QPU_WS) | (b & QPU_WS);
        } else if (qpu_waddr_ignores_ws(QPU_GET_FIELD(b, QPU_WADDR_ADD)) &&
                   qpu_waddr_ignores_ws(QPU_GET_FIELD(b, QPU_WADDR_MUL))) {
                merge = (merge & ~QPU_WS) | (a & QPU_WS);
        } else {
                if ((a & QPU_WS) != (b & QPU_WS))
                        return 0;
        }

        if (!merge_fields(&merge, a, b, QPU_PM, ~0)) {
                /* If one instruction has the PM bit set and the other does
                 * not, the one without PM must not do any packing/unpacking,
                 * and we have to make sure that the non-NOP packing/unpacking
                 * of the PM instruction is not applied to it.
                 */
                uint64_t temp;

                /* Let a be the one with PM bit */
                if (!(a & QPU_PM)) {
                        temp = a;
                        a = b;
                        b = temp;
                }

                /* The non-PM instruction must have NOP pack and unpack. */
                if ((b & (QPU_PACK_MASK | QPU_UNPACK_MASK)) != 0)
                        return 0;

                /* a's pack must not land on a MUL operation in b. */
                if ((a & QPU_PACK_MASK) != 0 &&
                    QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
                        return 0;

                /* a's unpack must not land on an r4 read in b. */
                if ((a & QPU_UNPACK_MASK) != 0 && reads_r4(b))
                        return 0;
        } else {
                /* packing: Make sure that non-NOP packs agree, then deal with
                 * special-case failing of adding a non-NOP pack to something
                 * with a NOP pack.
                 */
                if (!merge_fields(&merge, a, b, QPU_PACK_MASK, 0))
                        return 0;
                bool new_a_pack = (QPU_GET_FIELD(a, QPU_PACK) !=
                                QPU_GET_FIELD(merge, QPU_PACK));
                bool new_b_pack = (QPU_GET_FIELD(b, QPU_PACK) !=
                                QPU_GET_FIELD(merge, QPU_PACK));
                if (!(merge & QPU_PM)) {
                        /* Make sure we're not going to be putting a new
                         * a-file packing on either half.
                         */
                        if (new_a_pack && writes_a_file(a))
                                return 0;

                        if (new_b_pack && writes_a_file(b))
                                return 0;
                } else {
                        /* Make sure we're not going to be putting new MUL
                         * packing on either half.
                         */
                        if (new_a_pack &&
                            QPU_GET_FIELD(a, QPU_OP_MUL) != QPU_M_NOP)
                                return 0;

                        if (new_b_pack &&
                            QPU_GET_FIELD(b, QPU_OP_MUL) != QPU_M_NOP)
                                return 0;
                }

                /* unpacking: Make sure that non-NOP unpacks agree, then deal
                 * with special-case failing of adding a non-NOP unpack to
                 * something with a NOP unpack.
                 */
                if (!merge_fields(&merge, a, b, QPU_UNPACK_MASK, 0))
                        return 0;
                bool new_a_unpack = (QPU_GET_FIELD(a, QPU_UNPACK) !=
                                QPU_GET_FIELD(merge, QPU_UNPACK));
                bool new_b_unpack = (QPU_GET_FIELD(b, QPU_UNPACK) !=
                                QPU_GET_FIELD(merge, QPU_UNPACK));
                if (!(merge & QPU_PM)) {
                        /* Make sure we're not going to be putting a new
                         * a-file unpacking on either half.
                         */
                        if (new_a_unpack &&
                            QPU_GET_FIELD(a, QPU_RADDR_A) != QPU_R_NOP)
                                return 0;

                        if (new_b_unpack &&
                            QPU_GET_FIELD(b, QPU_RADDR_A) != QPU_R_NOP)
                                return 0;
                } else {
                        /* Make sure we're not going to be putting new r4
                         * unpack on either half.
                         */
                        if (new_a_unpack && reads_r4(a))
                                return 0;

                        if (new_b_unpack && reads_r4(b))
                                return 0;
                }
        }

        if (ok)
                return merge;
        else
                return 0;
}

/**
 * Returns "inst" with its signal field set to "sig".
 *
 * The instruction must not carry a signal yet (asserted).
 */
uint64_t
qpu_set_sig(uint64_t inst, uint32_t sig)
{
        assert(QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_NONE);

        inst = QPU_UPDATE_FIELD(inst, sig, QPU_SIG);

        return inst;
}

/**
 * Returns "inst" with the ADD-unit condition field set to "cond".
 *
 * The current ADD condition must still be QPU_COND_ALWAYS (asserted).
 */
uint64_t
qpu_set_cond_add(uint64_t inst, uint32_t cond)
{
        assert(QPU_GET_FIELD(inst, QPU_COND_ADD) == QPU_COND_ALWAYS);

        inst = QPU_UPDATE_FIELD(inst, cond, QPU_COND_ADD);

        return inst;
}

/**
 * Returns "inst" with the MUL-unit condition field set to "cond".
 *
 * The current MUL condition must still be QPU_COND_ALWAYS (asserted).
 */
uint64_t
qpu_set_cond_mul(uint64_t inst, uint32_t cond)
{
        assert(QPU_GET_FIELD(inst, QPU_COND_MUL) == QPU_COND_ALWAYS);

        inst = QPU_UPDATE_FIELD(inst, cond, QPU_COND_MUL);

        return inst;
}

/**
 * Returns true if "waddr" is one of the TLB color or Z write addresses
 * (QPU_W_TLB_COLOR_ALL, QPU_W_TLB_COLOR_MS, QPU_W_TLB_Z).
 */
bool
qpu_waddr_is_tlb(uint32_t waddr)
{
        return (waddr == QPU_W_TLB_COLOR_ALL ||
                waddr == QPU_W_TLB_COLOR_MS ||
                waddr == QPU_W_TLB_Z);
}

/**
 * Returns true if "inst" touches the TLB: either of its write addresses is
 * a TLB address according to qpu_waddr_is_tlb(), or its signal is
 * QPU_SIG_COLOR_LOAD or QPU_SIG_WAIT_FOR_SCOREBOARD.
 */
bool
qpu_inst_is_tlb(uint64_t inst)
{
        const uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

        /* TLB writes from either unit. */
        if (qpu_waddr_is_tlb(QPU_GET_FIELD(inst, QPU_WADDR_ADD)))
                return true;
        if (qpu_waddr_is_tlb(QPU_GET_FIELD(inst, QPU_WADDR_MUL)))
                return true;

        /* TLB-related signals. */
        return (sig == QPU_SIG_COLOR_LOAD ||
                sig == QPU_SIG_WAIT_FOR_SCOREBOARD);
}

/**
 * Maps a 32-bit value onto the small-immediate code that goes in the raddr b
 * field, or returns ~0 if the value has no small-immediate encoding.
 *
 * Encodable values:
 *  - integers 0..15          -> codes 0..15
 *  - integers -16..-1        -> codes 16..31
 *  - floats 1.0 .. 128.0     -> codes 32..39 (powers of two, as bit patterns)
 *  - floats 1/256 .. 1/2     -> codes 40..47 (powers of two, as bit patterns)
 */
uint32_t
qpu_encode_small_immediate(uint32_t i)
{
        /* IEEE-754 single-precision bit patterns, indexed by (code - 32). */
        static const uint32_t float_bits[] = {
                0x3f800000, /* 1.0 */
                0x40000000, /* 2.0 */
                0x40800000, /* 4.0 */
                0x41000000, /* 8.0 */
                0x41800000, /* 16.0 */
                0x42000000, /* 32.0 */
                0x42800000, /* 64.0 */
                0x43000000, /* 128.0 */
                0x3b800000, /* 1/256 */
                0x3c000000, /* 1/128 */
                0x3c800000, /* 1/64 */
                0x3d000000, /* 1/32 */
                0x3d800000, /* 1/16 */
                0x3e000000, /* 1/8 */
                0x3e800000, /* 1/4 */
                0x3f000000, /* 1/2 */
        };
        const uint32_t num_floats = sizeof(float_bits) / sizeof(float_bits[0]);

        if (i <= 15)
                return i;

        /* -16..-1 in two's complement; the addition wraps around to
         * 16..31.
         */
        if (i >= 0xfffffff0u)
                return i + 32;

        for (uint32_t k = 0; k < num_floats; k++) {
                if (float_bits[k] == i)
                        return 32 + k;
        }

        return ~0;
}

/**
 * Appends "inst" to the compile's QPU instruction array.
 *
 * When the array is full its capacity is doubled (with a minimum of 16
 * entries) through reralloc(), using "c" as the allocation context.
 *
 * NOTE(review): the result of reralloc() is stored without being checked.
 */
void
qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst)
{
        /* Grow the storage before writing past its end. */
        if (c->qpu_inst_size <= c->qpu_inst_count) {
                c->qpu_inst_size = MAX2(16, c->qpu_inst_size * 2);
                c->qpu_insts = reralloc(c, c->qpu_insts,
                                        uint64_t, c->qpu_inst_size);
        }

        c->qpu_insts[c->qpu_inst_count] = inst;
        c->qpu_inst_count++;
}