/*
 * This file was generated automatically by gen-template.py for 'mips'.
 *
 * --> DO NOT EDIT <--
 */

/* File: mips/header.S */
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(WITH_JIT)

/*
 * This is a #include, not a %include, because we want the C pre-processor
 * to expand the macros into assembler assignment statements.
 */
#include "../../../mterp/common/asm-constants.h"
#include "../../../mterp/common/mips-defines.h"
#include "../../../mterp/common/jit-config.h"
#include <asm/regdef.h>
#include <asm/fpregdef.h>

#ifdef	__mips_hard_float
#define		HARD_FLOAT
#else
#define		SOFT_FLOAT
#endif

/* MIPS definitions and declarations

   reg	nick		purpose
   s0	rPC		interpreted program counter, used for fetching instructions
   s1	rFP		interpreted frame pointer, used for accessing locals and args
   s2	rSELF		pointer to thread
   s3	rIBASE		interpreted instruction base pointer, used for computed goto
   s4	rINST		first 16-bit code unit of current instruction
   s5	rOBJ		scratch (typically holds a saved a0 or an object pointer)
   s6	rBIX		scratch
   s7	rTEMP		scratch (JAL/JALR use it to preserve ra across calls)
*/

/* register offsets */
#define r_ZERO      0
#define r_AT        1
#define r_V0        2
#define r_V1        3
#define r_A0        4
#define r_A1        5
#define r_A2        6
#define r_A3        7
#define r_T0        8
#define r_T1        9
#define r_T2        10
#define r_T3        11
#define r_T4        12
#define r_T5        13
#define r_T6        14
#define r_T7        15
#define r_S0        16
#define r_S1        17
#define r_S2        18
#define r_S3        19
#define r_S4        20
#define r_S5        21
#define r_S6        22
#define r_S7        23
#define r_T8        24
#define r_T9        25
#define r_K0        26
#define r_K1        27
#define r_GP        28
#define r_SP        29
#define r_FP        30
#define r_RA        31
#define r_F0        32
#define r_F1        33
#define r_F2        34
#define r_F3        35
#define r_F4        36
#define r_F5        37
#define r_F6        38
#define r_F7        39
#define r_F8        40
#define r_F9        41
#define r_F10       42
#define r_F11       43
#define r_F12       44
#define r_F13       45
#define r_F14       46
#define r_F15       47
#define r_F16       48
#define r_F17       49
#define r_F18       50
#define r_F19       51
#define r_F20       52
#define r_F21       53
#define r_F22       54
#define r_F23       55
#define r_F24       56
#define r_F25       57
#define r_F26       58
#define r_F27       59
#define r_F28       60
#define r_F29       61
#define r_F30       62
#define r_F31       63

/* single-purpose registers, given names for clarity */
#define rPC	s0
#define rFP	s1
#define rSELF	s2
#define rIBASE	s3
#define rINST	s4
#define rOBJ	s5
#define rBIX	s6
#define rTEMP	s7

/* The two halves of a 64-bit (long) argument or result occupy opposite
registers on big-endian and little-endian targets: a long passed as a0 (LSW),
a1 (MSW) in little-endian mode is passed as a1 (LSW), a0 (MSW) in big-endian
mode. The rARG and rRESULT aliases below hide that swap; see the illustrative
example after these definitions. */

#ifdef HAVE_LITTLE_ENDIAN
#define rARG0     a0
#define rARG1     a1
#define rARG2     a2
#define rARG3     a3
#define rRESULT0  v0
#define rRESULT1  v1
#else
#define rARG0     a1
#define rARG1     a0
#define rARG2     a3
#define rARG3     a2
#define rRESULT0  v1
#define rRESULT1  v0
#endif
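
/*
 * Illustrative example (not generator output): passing the 64-bit value
 * 0x1122334455667788 as the first long argument:
 *
 *   little-endian:  rARG0 = a0 = 0x55667788 (LSW), rARG1 = a1 = 0x11223344 (MSW)
 *   big-endian:     rARG0 = a1 = 0x55667788 (LSW), rARG1 = a0 = 0x11223344 (MSW)
 *
 * Code written against rARG0/rARG1 therefore matches the O32 convention
 * for 64-bit arguments on either endianness.
 */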


/* save/restore the PC and/or FP from the thread struct */
#define LOAD_PC_FROM_SELF()	lw	rPC, offThread_pc(rSELF)
#define SAVE_PC_TO_SELF()	sw	rPC, offThread_pc(rSELF)
#define LOAD_FP_FROM_SELF()	lw	rFP, offThread_curFrame(rSELF)
#define SAVE_FP_TO_SELF()	sw	rFP, offThread_curFrame(rSELF)

#define EXPORT_PC() \
	sw	rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)

#define SAVEAREA_FROM_FP(rd, _fpreg) \
	subu	rd, _fpreg, sizeofStackSaveArea

#define FETCH_INST()			lhu	rINST, (rPC)

#define FETCH_ADVANCE_INST(_count)	lhu     rINST, (_count*2)(rPC); \
					addu	rPC, rPC, (_count * 2)

#define FETCH_ADVANCE_INST_RB(rd)	addu	rPC, rPC, rd;	\
					lhu     rINST, (rPC)

#define FETCH(rd, _count)		lhu	rd, (_count * 2)(rPC)
#define FETCH_S(rd, _count)		lh	rd, (_count * 2)(rPC)

#ifdef HAVE_LITTLE_ENDIAN

#define FETCH_B(rd, _count)            lbu     rd, (_count * 2)(rPC)
#define FETCH_C(rd, _count)            lbu     rd, (_count * 2 + 1)(rPC)

#else

#define FETCH_B(rd, _count)            lbu     rd, (_count * 2 + 1)(rPC)
#define FETCH_C(rd, _count)            lbu     rd, (_count * 2)(rPC)

#endif
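
/*
 * Illustrative expansion (not generator output): FETCH_ADVANCE_INST(1)
 * loads the next 16-bit code unit and advances rPC to it:
 *
 *   lhu   rINST, 2(rPC)        # rINST <- code unit at rPC + 1*2
 *   addu  rPC, rPC, 2          # rPC <- rPC + 1*2
 *
 * FETCH_B/FETCH_C above compensate for byte order, so FETCH_B always
 * yields the low byte and FETCH_C the high byte of the 16-bit unit.
 */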

#define GET_INST_OPCODE(rd)		and	rd, rINST, 0xFF

#define GOTO_OPCODE(rd)			sll  rd, rd, -1000;	\
					addu rd, rIBASE, rd;	\
					jr  rd
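
/*
 * NOTE: the -1000 shift amount above appears to be a deliberate poison
 * value (assumption): handler_size_bits is not configured for the compiler
 * templates, so any accidental use of GOTO_OPCODE here is rejected by the
 * assembler instead of silently computing a bogus handler address.
 */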


#define LOAD(rd, rbase)			lw  rd, 0(rbase)
#define LOAD_F(rd, rbase)		l.s rd, (rbase)
#define STORE(rd, rbase)		sw  rd, 0(rbase)
#define STORE_F(rd, rbase)		s.s rd, (rbase)

#define GET_VREG(rd, rix)		LOAD_eas2(rd,rFP,rix)

#define GET_VREG_F(rd, rix)		EAS2(AT, rFP, rix);		\
					.set noat;  l.s rd, (AT); .set at

#define SET_VREG(rd, rix)		STORE_eas2(rd, rFP, rix)

#define SET_VREG_GOTO(rd, rix, dst)	.set noreorder;		\
					sll  dst, dst, -1000;	\
					addu dst, rIBASE, dst;			\
					sll  t8, rix, 2;	\
					addu t8, t8, rFP;	\
					jr  dst;		\
					sw  rd, 0(t8);		\
					.set reorder

#define SET_VREG_F(rd, rix)		EAS2(AT, rFP, rix);		\
					.set noat;  s.s	rd, (AT); .set at


#define GET_OPA(rd)			srl     rd, rINST, 8
#ifndef		MIPS32R2
#define GET_OPA4(rd)			GET_OPA(rd);  and  rd, 0xf
#else
#define GET_OPA4(rd)			ext	rd, rINST, 8, 4
#endif
#define GET_OPB(rd)			srl     rd, rINST, 12

#define LOAD_rSELF_OFF(rd,off)		lw    rd, offThread_##off##(rSELF)

#define LOAD_rSELF_method(rd)		LOAD_rSELF_OFF(rd, method)
#define LOAD_rSELF_methodClassDex(rd)	LOAD_rSELF_OFF(rd, methodClassDex)
#define LOAD_rSELF_interpStackEnd(rd)	LOAD_rSELF_OFF(rd, interpStackEnd)
#define LOAD_rSELF_retval(rd)		LOAD_rSELF_OFF(rd, retval)
#define LOAD_rSELF_pActiveProfilers(rd)	LOAD_rSELF_OFF(rd, pActiveProfilers)
#define LOAD_rSELF_bailPtr(rd)		LOAD_rSELF_OFF(rd, bailPtr)

#define GET_JIT_PROF_TABLE(rd)		LOAD_rSELF_OFF(rd,pJitProfTable)
#define GET_JIT_THRESHOLD(rd)		LOAD_rSELF_OFF(rd,jitThreshold)

/*
 * Form an Effective Address rd = rbase + roff<<n;
 * Uses reg AT
 */
#define EASN(rd,rbase,roff,rshift)	.set noat;		\
					sll  AT, roff, rshift;	\
					addu rd, rbase, AT;	\
					.set at

#define EAS1(rd,rbase,roff)		EASN(rd,rbase,roff,1)
#define EAS2(rd,rbase,roff)		EASN(rd,rbase,roff,2)
#define EAS3(rd,rbase,roff)		EASN(rd,rbase,roff,3)
#define EAS4(rd,rbase,roff)		EASN(rd,rbase,roff,4)
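
/*
 * Illustrative expansion (not generator output): EAS2(a0, rFP, a1)
 * computes a0 = rFP + (a1 << 2), the address of Dalvik vreg a1:
 *
 *   .set noat
 *   sll   AT, a1, 2
 *   addu  a0, rFP, AT
 *   .set at
 */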

/*
 * Form an Effective Shift Right rd = rbase + roff>>n;
 * Uses reg AT
 */
#define ESRN(rd,rbase,roff,rshift)	.set noat;		\
					srl  AT, roff, rshift;	\
					addu rd, rbase, AT;	\
					.set at

#define LOAD_eas2(rd,rbase,roff)	EAS2(AT, rbase, roff);  \
					.set noat;  lw  rd, 0(AT); .set at

#define STORE_eas2(rd,rbase,roff)	EAS2(AT, rbase, roff);  \
					.set noat;  sw  rd, 0(AT); .set at

#define LOAD_RB_OFF(rd,rbase,off)	lw	rd, off(rbase)
#define LOADu2_RB_OFF(rd,rbase,off)	lhu	rd, off(rbase)
#define STORE_RB_OFF(rd,rbase,off)	sw	rd, off(rbase)

#ifdef HAVE_LITTLE_ENDIAN

#define STORE64_off(rlo,rhi,rbase,off)	        sw	rlo, off(rbase);	\
					        sw	rhi, (off+4)(rbase)
#define LOAD64_off(rlo,rhi,rbase,off)	        lw	rlo, off(rbase);	\
					        lw	rhi, (off+4)(rbase)

#define STORE64_off_F(rlo,rhi,rbase,off)	s.s	rlo, off(rbase);	\
						s.s	rhi, (off+4)(rbase)
#define LOAD64_off_F(rlo,rhi,rbase,off)		l.s	rlo, off(rbase);	\
						l.s	rhi, (off+4)(rbase)
#else

#define STORE64_off(rlo,rhi,rbase,off)	        sw	rlo, (off+4)(rbase);	\
					        sw	rhi, (off)(rbase)
#define LOAD64_off(rlo,rhi,rbase,off)	        lw	rlo, (off+4)(rbase);	\
					        lw	rhi, (off)(rbase)
#define STORE64_off_F(rlo,rhi,rbase,off)	s.s	rlo, (off+4)(rbase);	\
						s.s	rhi, (off)(rbase)
#define LOAD64_off_F(rlo,rhi,rbase,off)		l.s	rlo, (off+4)(rbase);	\
						l.s	rhi, (off)(rbase)
#endif

#define STORE64(rlo,rhi,rbase)		STORE64_off(rlo,rhi,rbase,0)
#define LOAD64(rlo,rhi,rbase)		LOAD64_off(rlo,rhi,rbase,0)

#define STORE64_F(rlo,rhi,rbase)	STORE64_off_F(rlo,rhi,rbase,0)
#define LOAD64_F(rlo,rhi,rbase)		LOAD64_off_F(rlo,rhi,rbase,0)
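
/*
 * In both endian variants rlo/rhi name the logical low/high words, and the
 * offsets keep the in-memory layout identical to a C 64-bit value at
 * rbase+off.  E.g. on little-endian, LOAD64(a0, a1, t0) expands to
 * "lw a0, 0(t0); lw a1, 4(t0)", so a0 receives the low word.
 */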

#define STORE64_lo(rd,rbase)		sw	rd, 0(rbase)
#define STORE64_hi(rd,rbase)		sw	rd, 4(rbase)


#define LOAD_offThread_exception(rd,rbase)		LOAD_RB_OFF(rd,rbase,offThread_exception)
#define LOAD_base_offArrayObject_length(rd,rbase)	LOAD_RB_OFF(rd,rbase,offArrayObject_length)
#define LOAD_base_offClassObject_accessFlags(rd,rbase)	LOAD_RB_OFF(rd,rbase,offClassObject_accessFlags)
#define LOAD_base_offClassObject_descriptor(rd,rbase)   LOAD_RB_OFF(rd,rbase,offClassObject_descriptor)
#define LOAD_base_offClassObject_super(rd,rbase)	LOAD_RB_OFF(rd,rbase,offClassObject_super)

#define LOAD_base_offClassObject_vtable(rd,rbase)	LOAD_RB_OFF(rd,rbase,offClassObject_vtable)
#define LOAD_base_offClassObject_vtableCount(rd,rbase)	LOAD_RB_OFF(rd,rbase,offClassObject_vtableCount)
#define LOAD_base_offDvmDex_pResClasses(rd,rbase)	LOAD_RB_OFF(rd,rbase,offDvmDex_pResClasses)
#define LOAD_base_offDvmDex_pResFields(rd,rbase)	LOAD_RB_OFF(rd,rbase,offDvmDex_pResFields)

#define LOAD_base_offDvmDex_pResMethods(rd,rbase)	LOAD_RB_OFF(rd,rbase,offDvmDex_pResMethods)
#define LOAD_base_offDvmDex_pResStrings(rd,rbase)	LOAD_RB_OFF(rd,rbase,offDvmDex_pResStrings)
#define LOAD_base_offInstField_byteOffset(rd,rbase)	LOAD_RB_OFF(rd,rbase,offInstField_byteOffset)
#define LOAD_base_offStaticField_value(rd,rbase)	LOAD_RB_OFF(rd,rbase,offStaticField_value)
#define LOAD_base_offMethod_clazz(rd,rbase)		LOAD_RB_OFF(rd,rbase,offMethod_clazz)

#define LOAD_base_offMethod_name(rd,rbase)		LOAD_RB_OFF(rd,rbase,offMethod_name)
#define LOAD_base_offObject_clazz(rd,rbase)		LOAD_RB_OFF(rd,rbase,offObject_clazz)

#define LOADu2_offMethod_methodIndex(rd,rbase)		LOADu2_RB_OFF(rd,rbase,offMethod_methodIndex)


#define STORE_offThread_exception(rd,rbase)		STORE_RB_OFF(rd,rbase,offThread_exception)


#define	STACK_STORE(rd,off)	sw   rd, off(sp)
#define	STACK_LOAD(rd,off)	lw   rd, off(sp)
#define CREATE_STACK(n)	 	subu sp, sp, n
#define DELETE_STACK(n)	 	addu sp, sp, n

#define SAVE_RA(offset)	 	STACK_STORE(ra, offset)
#define LOAD_RA(offset)	 	STACK_LOAD(ra, offset)

#define LOAD_ADDR(dest,addr)	la   dest, addr
#define LOAD_IMM(dest, imm)	li   dest, imm
#define MOVE_REG(dest,src)	move dest, src
#define	RETURN			jr   ra
#define	STACK_SIZE		128

#define STACK_OFFSET_ARG04	16
#define STACK_OFFSET_GP		84
#define STACK_OFFSET_rFP	112

/* This directive ensures gp is saved at STACK_OFFSET_GP and reloaded from there after each subsequent jal */
        .cprestore STACK_OFFSET_GP

#define JAL(func)		move rTEMP, ra;				\
				jal  func;				\
				move ra, rTEMP

#define JALR(reg)		move rTEMP, ra;				\
				jalr ra, reg;				\
				move ra, rTEMP
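
/*
 * Rationale (inferred from usage, not stated by the generator): compiled
 * traces are entered with ra pointing back into the JIT code cache, and a
 * plain jal/jalr would clobber it.  JAL/JALR park ra in rTEMP (s7, which is
 * callee-saved) so the value survives the C call and is restored afterwards.
 */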

#define BAL(n)			bal  n

#define	STACK_STORE_RA()  	CREATE_STACK(STACK_SIZE);		\
				STACK_STORE(gp, STACK_OFFSET_GP);	\
				STACK_STORE(ra, 124)

#define	STACK_STORE_S0()  	STACK_STORE_RA();			\
				STACK_STORE(s0, 116)

#define	STACK_STORE_S0S1()  	STACK_STORE_S0();			\
				STACK_STORE(s1, STACK_OFFSET_rFP)

#define	STACK_LOAD_RA()		STACK_LOAD(ra, 124);			\
				STACK_LOAD(gp, STACK_OFFSET_GP);	\
				DELETE_STACK(STACK_SIZE)

#define	STACK_LOAD_S0()  	STACK_LOAD(s0, 116);			\
				STACK_LOAD_RA()

#define	STACK_LOAD_S0S1()  	STACK_LOAD(s1, STACK_OFFSET_rFP);	\
				STACK_LOAD_S0()

#define STACK_STORE_FULL()	CREATE_STACK(STACK_SIZE);	\
				STACK_STORE(ra, 124);		\
				STACK_STORE(fp, 120);		\
				STACK_STORE(s0, 116);		\
				STACK_STORE(s1, STACK_OFFSET_rFP);	\
				STACK_STORE(s2, 108);		\
				STACK_STORE(s3, 104);		\
				STACK_STORE(s4, 100);		\
				STACK_STORE(s5, 96);		\
				STACK_STORE(s6, 92);		\
				STACK_STORE(s7, 88);

#define STACK_LOAD_FULL()	STACK_LOAD(gp, STACK_OFFSET_GP);	\
				STACK_LOAD(s7, 88);	\
				STACK_LOAD(s6, 92);	\
				STACK_LOAD(s5, 96);	\
				STACK_LOAD(s4, 100);	\
				STACK_LOAD(s3, 104);	\
				STACK_LOAD(s2, 108);	\
				STACK_LOAD(s1, STACK_OFFSET_rFP);	\
				STACK_LOAD(s0, 116);	\
				STACK_LOAD(fp, 120);	\
				STACK_LOAD(ra, 124);	\
				DELETE_STACK(STACK_SIZE)
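
/*
 * Resulting frame layout (STACK_SIZE = 128 bytes), reconstructed from the
 * offsets used above and below:
 *
 *   sp+124: ra            sp+100: s4
 *   sp+120: fp            sp+96:  s5
 *   sp+116: s0            sp+92:  s6
 *   sp+112: s1 (rFP)      sp+88:  s7
 *   sp+108: s2            sp+84:  gp (STACK_OFFSET_GP)
 *   sp+104: s3            sp+32:  scratch area (STACK_OFFSET_SCR)
 *                         sp+16:  fifth argument slot (STACK_OFFSET_ARG04)
 *                         sp+0:   a0-a3 home space (O32 convention)
 */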

/*
 * The first 8 words (32 bytes) of the frame are reserved for outgoing
 * function-call arguments; scratch slots start at STACK_OFFSET_SCR.
 * Maximum scratch offset is STACK_OFFSET_SCRMX - STACK_OFFSET_SCR.
 */
#define STACK_OFFSET_SCR   32
#define SCRATCH_STORE(r,off) \
    STACK_STORE(r, STACK_OFFSET_SCR+off);
#define SCRATCH_LOAD(r,off) \
    STACK_LOAD(r, STACK_OFFSET_SCR+off);
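
/*
 * Illustrative use (mirrors the templates below): spill caller-saved
 * argument registers around a C helper call, e.g.
 *
 *   SCRATCH_STORE(a0, 0)     # expands to: sw a0, 32(sp)
 *   SCRATCH_STORE(a1, 4)     # expands to: sw a1, 36(sp)
 *   ...call helper...
 *   SCRATCH_LOAD(a1, 4)
 *   SCRATCH_LOAD(a0, 0)
 */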

/* File: mips/platform.S */
/*
 * ===========================================================================
 *  CPU-version-specific defines and utility
 * ===========================================================================
 */



    .global dvmCompilerTemplateStart
    .type   dvmCompilerTemplateStart, %function
    .section .data.rel.ro

dvmCompilerTemplateStart:

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_CMP_LONG
dvmCompiler_TEMPLATE_CMP_LONG:
/* File: mips/TEMPLATE_CMP_LONG.S */
    /*
     * Compare two 64-bit values
     *    x = y     return  0
     *    x < y     return -1
     *    x > y     return  1
     *
     * I think I can improve on the ARM code with the following observation
     *    slt   t0,  x.hi, y.hi;        # (x.hi < y.hi) ? 1:0
     *    sgt   t1,  x.hi, y.hi;        # (x.hi > y.hi) ? 1:0
     *    subu  v0, t1, t0              # v0 = -1, 0, or 1 for [x<y, x=y, x>y]
     *
     * The rARG aliases absorb the endianness-dependent register pair
     * ordering (a1:a0 or a0:a1):
     *    rARG1:rARG0 => vBB
     *    rARG3:rARG2 => vCC
     */
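    /*
     * Worked example (illustrative, not generator output):
     * x = 0x00000001_00000000, y = 0x00000000_ffffffff.  High words are
     * 1 and 0, so t0 = (1 < 0) = 0, t1 = (1 > 0) = 1, v0 = t1 - t0 = 1
     * (x > y), and we branch to finish without comparing the low words.
     */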
    /* cmp-long vAA, vBB, vCC */
    slt    t0, rARG1, rARG3             # compare hi
    sgt    t1, rARG1, rARG3
    subu   v0, t1, t0                   # v0<- (-1,1,0)
    bnez   v0, .LTEMPLATE_CMP_LONG_finish
                                        # at this point x.hi==y.hi
    sltu   t0, rARG0, rARG2             # compare lo
    sgtu   t1, rARG0, rARG2
    subu   v0, t1, t0                   # v0<- (-1,1,0) for [< > =]
.LTEMPLATE_CMP_LONG_finish:
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_RETURN
dvmCompiler_TEMPLATE_RETURN:
/* File: mips/TEMPLATE_RETURN.S */
    /*
     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
     * If the stored value in returnAddr is non-zero, the caller was
     * compiled by the JIT, so return to the address in the code cache
     * following the invoke instruction.  Otherwise return to the special
     * dvmJitToInterpNoChain entry point.
     */
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a2 and ra
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(ra, 12)

    # a0=rSELF
    move    a0, rSELF
    la      t9, dvmFastMethodTraceExit
    JALR(t9)
    lw      gp, STACK_OFFSET_GP(sp)

    # restore a0-a2 and ra
    SCRATCH_LOAD(ra, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif
    SAVEAREA_FROM_FP(a0, rFP)           # a0<- saveArea (old)
    lw      t0, offStackSaveArea_prevFrame(a0)     # t0<- saveArea->prevFrame
    lbu     t1, offThread_breakFlags(rSELF)        # t1<- breakFlags
    lw      rPC, offStackSaveArea_savedPc(a0)      # rPC<- saveArea->savedPc
#if !defined(WITH_SELF_VERIFICATION)
    lw      t2,  offStackSaveArea_returnAddr(a0)   # t2<- chaining cell ret
#else
    move    t2, zero                               # disable chaining
#endif
    lw      a2, offStackSaveArea_method - sizeofStackSaveArea(t0)
                                                   # a2<- method we're returning to
#if !defined(WITH_SELF_VERIFICATION)
    beq     a2, zero, 1f                           # bail to interpreter
#else
    bne     a2, zero, 2f
    JALR(ra)                                       # punt to interpreter and compare state
    # DOUG: assume this does not return ???
2:
#endif
    la      t4, .LdvmJitToInterpNoChainNoProfile   # defined in footer.S
    lw      a1, (t4)
    move    rFP, t0                                # publish new FP
    beq     a2, zero, 4f
    lw      t0, offMethod_clazz(a2)                # t0<- method->clazz
4:

    sw      a2, offThread_method(rSELF)            # self->method = newSave->method
    lw      a0, offClassObject_pDvmDex(t0)         # a0<- method->clazz->pDvmDex
    sw      rFP, offThread_curFrame(rSELF)         # self->curFrame = fp
    add     rPC, rPC, 3*2                          # publish new rPC (skip the 3-unit invoke)
    sw      a0, offThread_methodClassDex(rSELF)
    movn    t2, zero, t1                           # check the breakFlags and
                                                   # clear the chaining cell address
    sw      t2, offThread_inJitCodeCache(rSELF)    # in code cache or not
    beq     t2, zero, 3f                           # chaining cell exists?
    JALR(t2)                                       # jump to the chaining cell
    # DOUG: assume this does not return ???
3:
#if defined(WITH_JIT_TUNING)
    li      a0, kCallsiteInterpreted
#endif
    j       a1                                     # callsite is interpreted
1:
    sw      zero, offThread_inJitCodeCache(rSELF)  # reset inJitCodeCache
    SAVE_PC_TO_SELF()                              # SAVE_PC_FP_TO_SELF()
    SAVE_FP_TO_SELF()
    la      t4, .LdvmMterpStdBail                  # defined in footer.S
    lw      a2, (t4)
    move    a0, rSELF                              # Expecting rSELF in a0
    JALR(a2)                                       # exit the interpreter
    # DOUG: assume this does not return ???

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT
dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT:
/* File: mips/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
    /*
     * For polymorphic callsites - setup the Dalvik frame and load Dalvik PC
     * into rPC then jump to dvmJitToInterpNoChain to dispatch the
     * runtime-resolved callee.
     */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lh     a2, offMethod_outsSize(a0)             # a2<- methodToCall->outsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    sll    t6, a2, 2                              # multiply outsSize by 4 (4 bytes per reg)
    sub    t0, t0, t6                             # t0<- bottom (newsave-outsSize)
    bgeu   t0, t9, 1f                             # enough stack space (bottom >= interpStackEnd)?
    RETURN                                        # no - return to raise stack overflow exception

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    lw     t9, offMethod_clazz(a0)                # t9<- methodToCall->clazz
    lw     t0, offMethod_accessFlags(a0)          # t0<- methodToCall->accessFlags
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    beqz   t8, 2f                                 # continue if breakFlags == 0
    RETURN                                        # breakFlags != 0: bail to the interpreter

2:
    and    t6, t0, ACC_NATIVE
    beqz   t6, 3f
#if !defined(WITH_SELF_VERIFICATION)
    j      .LinvokeNative
#else
    RETURN                                        # bail to the interpreter
#endif

3:
    # continue executing the next instruction through the interpreter
    la     t0, .LdvmJitToInterpTraceSelectNoChain # defined in footer.S
    lw     rTEMP, (t0)
    lw     a3, offClassObject_pDvmDex(t9)         # a3<- method->clazz->pDvmDex

    # Update "thread" values for the new method
    sw     a0, offThread_method(rSELF)            # self->method = methodToCall
    sw     a3, offThread_methodClassDex(rSELF)    # self->methodClassDex = ...
    move   rFP, a1                                # fp = newFp
    sw     rFP, offThread_curFrame(rSELF)         # self->curFrame = newFp
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a3
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(a3, 12)

    # a0=methodToCall, a1=rSELF
    move   a1, rSELF
    la     t9, dvmFastMethodTraceEnter
    JALR(t9)
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a3
    SCRATCH_LOAD(a3, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif

    # Start executing the callee
#if defined(WITH_JIT_TUNING)
    li     a0, kInlineCacheMiss
#endif
    jr     rTEMP                                  # dvmJitToInterpTraceSelectNoChain

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN
dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN:
/* File: mips/TEMPLATE_INVOKE_METHOD_CHAIN.S */
    /*
     * For monomorphic callsite, setup the Dalvik frame and return to the
     * Thumb code through the link register to transfer control to the callee
     * method through a dedicated chaining cell.
     */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    # methodToCall is guaranteed to be non-native
.LinvokeChain:
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lh     a2, offMethod_outsSize(a0)             # a2<- methodToCall->outsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    add    t2, ra, 8                              # setup the punt-to-interp address
                                                  # 8 bytes skips branch and delay slot
    sll    t6, a2, 2                              # multiply outsSize by 4 (4 bytes per reg)
    sub    t0, t0, t6                             # t0<- bottom (newsave-outsSize)
    bgeu   t0, t9, 1f                             # enough stack space (bottom >= interpStackEnd)?
    jr     t2                                     # no - punt to raise stack overflow exception

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    lw     t9, offMethod_clazz(a0)                # t9<- methodToCall->clazz
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    beqz   t8, 2f                                 # continue if breakFlags == 0
    jr     t2                                     # breakFlags != 0: bail to the interpreter

2:
    lw     a3, offClassObject_pDvmDex(t9)         # a3<- methodToCall->clazz->pDvmDex

    # Update "thread" values for the new method
    sw     a0, offThread_method(rSELF)            # self->method = methodToCall
    sw     a3, offThread_methodClassDex(rSELF)    # self->methodClassDex = ...
    move   rFP, a1                                # fp = newFp
    sw     rFP, offThread_curFrame(rSELF)         # self->curFrame = newFp
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a2 and ra
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(ra, 12)

    move   a1, rSELF
    # a0=methodToCall, a1=rSELF
    la     t9, dvmFastMethodTraceEnter
    jalr   t9
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a2 and ra
    SCRATCH_LOAD(ra, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif
    RETURN                                        # return to the callee-chaining cell

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN
dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN:
/* File: mips/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
    /*
     * For polymorphic callsite, check whether the cached class pointer matches
     * the current one. If so setup the Dalvik frame and return to the
     * Thumb code through the link register to transfer control to the callee
     * method through a dedicated chaining cell.
     *
     * The predicted chaining cell is declared in MipsLIR.h with the
     * following layout:
     *
     *  typedef struct PredictedChainingCell {
     *      u4 branch;
     *      u4 delay_slot;
     *      const ClassObject *clazz;
     *      const Method *method;
     *      u4 counter;
     *  } PredictedChainingCell;
     *
     * Upon returning to the callsite:
     *    - ra   : to branch to the chaining cell
     *    - ra+8 : to punt to the interpreter
     *    - ra+16: to fully resolve the callee (and possibly rechain);
     *             a3 <- class
     */
    # a0 = this, a1 = returnCell, a2 = predictedChainCell, rPC = dalvikCallsite
    lw      a3, offObject_clazz(a0)     # a3 <- this->class
    lw      rIBASE, 8(a2)               # rIBASE <- predictedChainCell->clazz
    lw      a0, 12(a2)                  # a0 <- predictedChainCell->method
    lw      t1, offThread_icRechainCount(rSELF)    # t1 <- shared rechainCount

#if defined(WITH_JIT_TUNING)
    la      rINST, .LdvmICHitCount
    #add     t2, t2, 1
    bne    a3, rIBASE, 1f
    nop
    lw      t2, 0(rINST)
    add     t2, t2, 1
    sw      t2, 0(rINST)
1:
    #add     t2, t2, 1
#endif
    beq     a3, rIBASE, .LinvokeChain       # branch if predicted chain is valid
    lw      rINST, offClassObject_vtable(a3)     # rINST <- this->class->vtable
    beqz    rIBASE, 2f                      # branch if the cell's clazz is NULL (uninitialized)
    sub     a1, t1, 1                   # count--
    sw      a1, offThread_icRechainCount(rSELF)   # write back to InterpState
    b       3f
2:
    move    a1, zero
3:
    add     ra, ra, 16                  # return to fully-resolve landing pad
    /*
     * a1 <- count
     * a2 <- &predictedChainCell
     * a3 <- this->class
     * rPC <- dPC
     * rINST <- this->class->vtable
     */
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE
dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE:
/* File: mips/TEMPLATE_INVOKE_METHOD_NATIVE.S */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    bgeu   t0, t9, 1f                             # enough stack space (bottom >= interpStackEnd)?
    RETURN                                        # no - return to raise stack overflow exception

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    lw     rTEMP, offMethod_nativeFunc(a0)        # rTEMP<- method->nativeFunc
#if !defined(WITH_SELF_VERIFICATION)
    beqz   t8, 2f                                 # continue if breakFlags == 0
    RETURN                                        # breakFlags != 0: bail to the interpreter
2:
#else
    RETURN                                        # bail to the interpreter unconditionally
#endif

    # go ahead and transfer control to the native code
    lw     t6, offThread_jniLocal_topCookie(rSELF)  # t6<- thread->localRef->...
    sw     a1, offThread_curFrame(rSELF)          # self->curFrame = newFp
    sw     zero, offThread_inJitCodeCache(rSELF)  # not in the jit code cache
    sw     t6, (offStackSaveArea_localRefCookie - sizeofStackSaveArea)(a1)
                                                  # newFp->localRefCookie=top
    SAVEAREA_FROM_FP(rBIX, a1)                    # rBIX<- new stack save area
    move   a2, a0                                 # a2<- methodToCall
    move   a0, a1                                 # a0<- newFp
    add    a1, rSELF, offThread_retval            # a1<- &retval
    move   a3, rSELF                              # a3<- self
#if defined(TEMPLATE_INLINE_PROFILING)
    # a2: methodToCall
    # preserve a0-a3
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(a3, 12)

    move   a0, a2
    move   a1, rSELF
    # a0=JNIMethod, a1=rSELF
    la      t9, dvmFastMethodTraceEnter
    JALR(t9)                                      # off to the native code
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a3
    SCRATCH_LOAD(a3, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)

    move   rOBJ, a2                               # save a2
#endif

    JALR(rTEMP)                                   # off to the native code
    lw     gp, STACK_OFFSET_GP(sp)

#if defined(TEMPLATE_INLINE_PROFILING)
    move   a0, rOBJ
    move   a1, rSELF
    # a0=JNIMethod, a1=rSELF
    la      t9, dvmFastNativeMethodTraceExit
    JALR(t9)
    lw     gp, STACK_OFFSET_GP(sp)
#endif

    # native return; rBIX=newSaveArea
    # equivalent to dvmPopJniLocals
    lw     a2, offStackSaveArea_returnAddr(rBIX)     # a2 = chaining cell ret addr
    lw     a0, offStackSaveArea_localRefCookie(rBIX) # a0<- saved->top
    lw     a1, offThread_exception(rSELF)            # check for exception
    sw     rFP, offThread_curFrame(rSELF)            # self->curFrame = fp
    sw     a0, offThread_jniLocal_topCookie(rSELF)   # new top <- old top
    lw     a0, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)

    # a0 = dalvikCallsitePC
    bnez   a1, .LhandleException                     # handle exception if any

    sw     a2, offThread_inJitCodeCache(rSELF)       # set the mode properly
    beqz   a2, 3f
    jr     a2                                        # jump there if the return chaining cell still exists

3:
    # continue executing the next instruction through the interpreter
    la     a1, .LdvmJitToInterpTraceSelectNoChain    # defined in footer.S
    lw     a1, (a1)
    add    rPC, a0, 3*2                              # reconstruct new rPC (skip the 3-unit invoke)

#if defined(WITH_JIT_TUNING)
    li     a0, kCallsiteInterpreted
#endif
    jr     a1

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MUL_LONG
dvmCompiler_TEMPLATE_MUL_LONG:
/* File: mips/TEMPLATE_MUL_LONG.S */
    /*
     * Signed 64-bit integer multiply.
     *
     * For JIT: op1 in a0/a1, op2 in a2/a3, return in v0/v1
     *
     * Consider WXxYZ (a1a0 x a3a2) with a long multiply:
     *
     *         a1   a0
     *   x     a3   a2
     *   -------------
     *       a2a1 a2a0
     *       a3a0
     *  a3a1 (<= unused)
     *  ---------------
     *         v1   v0
     *
     */
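    /*
     * Worked example (illustrative, not generator output):
     * (a1:a0) = 0x00000001_00000002, (a3:a2) = 0x00000003_00000004.
     * v0 = lo(a2*a0) = 8; v1 = a3*a0 + hi(a2*a0) + a2*a1 = 6 + 0 + 4 = 0xa,
     * giving 0x0000000a_00000008, the low 64 bits of the product.
     */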
    /* mul-long vAA, vBB, vCC */
    mul     rRESULT1,rARG3,rARG0              #  v1= a3a0
    multu   rARG2,rARG0
    mfhi    t1
    mflo    rRESULT0                          #  v0= a2a0
    mul     t0,rARG2,rARG1                    #  t0= a2a1
    addu    rRESULT1,rRESULT1,t1              #  v1= a3a0 + hi(a2a0)
    addu    rRESULT1,rRESULT1,t0              #  v1= a3a0 + hi(a2a0) + a2a1;
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SHL_LONG
dvmCompiler_TEMPLATE_SHL_LONG:
/* File: mips/TEMPLATE_SHL_LONG.S */
    /*
     * Long integer shift.  This is different from the generic 32/64-bit
     * binary operations because vAA/vBB are 64-bit but vCC (the shift
     * distance) is 32-bit.  Also, Dalvik requires us to ignore all but the low
     * 6 bits.
     */
    /* shl-long vAA:vBB(rARG1:rARG0), vCC(a2) - result in (rRESULT1:rRESULT0) */
    sll     rRESULT0, rARG0, a2		#  rlo<- alo << (shift&31)
    not     rRESULT1, a2		#  rhi<- 31-shift  (shift is 5b)
    srl     rARG0, 1
    srl     rARG0, rRESULT1		#  alo<- alo >> (32-(shift&31))
    sll     rRESULT1, rARG1, a2		#  rhi<- ahi << (shift&31)
    or      rRESULT1, rARG0		#  rhi<- rhi | alo
    andi    a2, 0x20			#  shift<- shift & 0x20
    movn    rRESULT1, rRESULT0, a2	#  rhi<- rlo (if shift&0x20)
    movn    rRESULT0, zero, a2		#  rlo<- 0  (if shift&0x20)
    RETURN
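
    /*
     * Why the two-step srl above (a note, not generator output): for
     * shift == 0 the low word must contribute nothing to rhi, but a single
     * variable shift of 32-shift would use (32-0) & 31 == 0 and contribute
     * all of it.  Shifting by 1 and then by (~shift) & 31 == 31-shift
     * totals 32-shift with no mod-32 trap.  SHR_LONG and USHR_LONG below
     * use the same idiom with sll.
     */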

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SHR_LONG
dvmCompiler_TEMPLATE_SHR_LONG:
/* File: mips/TEMPLATE_SHR_LONG.S */
    /*
     * Long integer shift.  This is different from the generic 32/64-bit
     * binary operations because vAA/vBB are 64-bit but vCC (the shift
     * distance) is 32-bit.  Also, Dalvik requires us to ignore all but the low
     * 6 bits.
     */
    /* shr-long vAA:vBB(rARG1:rARG0), vCC(a2) - result in (rRESULT1:rRESULT0) */
    sra     rRESULT1, rARG1, a2		#  rhi<- ahi >> (shift&31)
    srl     rRESULT0, rARG0, a2		#  rlo<- alo >> (shift&31)
    sra     a3, rARG1, 31		#  a3<- sign(ah)
    not     rARG0, a2			#  alo<- 31-shift (shift is 5b)
    sll     rARG1, 1
    sll     rARG1, rARG0		#  ahi<- ahi << (32-(shift&31))
    or      rRESULT0, rARG1		#  rlo<- rlo | ahi
    andi    a2, 0x20			#  shift & 0x20
    movn    rRESULT0, rRESULT1, a2	#  rlo<- rhi (if shift&0x20)
    movn    rRESULT1, a3, a2		#  rhi<- sign(ahi) (if shift&0x20)
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_USHR_LONG
dvmCompiler_TEMPLATE_USHR_LONG:
/* File: mips/TEMPLATE_USHR_LONG.S */
    /*
     * Long integer shift.  This is different from the generic 32/64-bit
     * binary operations because vAA/vBB are 64-bit but vCC (the shift
     * distance) is 32-bit.  Also, Dalvik requires us to ignore all but the low
     * 6 bits.
     */
    /* ushr-long vAA:vBB(rARG1:rARG0), vCC(a2) - result in (rRESULT1:rRESULT0) */
    srl     rRESULT1, rARG1, a2		#  rhi<- ahi >> (shift&31)
    srl     rRESULT0, rARG0, a2		#  rlo<- alo >> (shift&31)
    not     rARG0, a2			#  alo<- 31-n  (shift is 5b)
    sll     rARG1, 1
    sll     rARG1, rARG0		#  ahi<- ahi << (32-(shift&31))
    or      rRESULT0, rARG1		#  rlo<- rlo | ahi
    andi    a2, 0x20			#  shift & 0x20
    movn    rRESULT0, rRESULT1, a2	#  rlo<- rhi (if shift&0x20)
    movn    rRESULT1, zero, a2		#  rhi<- 0 (if shift&0x20)
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_ADD_FLOAT_VFP
dvmCompiler_TEMPLATE_ADD_FLOAT_VFP:
/* File: mips/TEMPLATE_ADD_FLOAT_VFP.S */
/* File: mips/fbinop.S */
    /*
     * Generic 32-bit binary float operation. a0 = a1 op a2.
     *
     * For: add-fp, sub-fp, mul-fp, div-fp
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
    LOAD(a1, a2)                        # a1<- vCC
    .if 0
    beqz    a1, common_errDivideByZero  # is second operand zero?
    .endif
                               # optional op
    JAL(__addsf3)                              # v0 = result
    STORE(v0, rOBJ)                     # vAA <- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
    LOAD_F(fa1, a2)                     # fa1<- vCC
    .if 0
    # is second operand zero?
    li.s        ft0, 0
    c.eq.s      fcc0, ft0, fa1          # condition bit and comparison with 0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    add.s fv0, fa0, fa1                            # fv0 = result
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SUB_FLOAT_VFP
dvmCompiler_TEMPLATE_SUB_FLOAT_VFP:
/* File: mips/TEMPLATE_SUB_FLOAT_VFP.S */
/* File: mips/fbinop.S */
    /*
     * Generic 32-bit binary float operation. a0 = a1 op a2.
     *
     * For: add-fp, sub-fp, mul-fp, div-fp
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
    LOAD(a1, a2)                        # a1<- vCC
    .if 0
    beqz    a1, common_errDivideByZero  # is second operand zero?
    .endif
                               # optional op
    JAL(__subsf3)                              # v0 = result
    STORE(v0, rOBJ)                     # vAA <- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
    LOAD_F(fa1, a2)                     # fa1<- vCC
    .if 0
    # is second operand zero?
    li.s        ft0, 0
    c.eq.s      fcc0, ft0, fa1          # condition bit and comparison with 0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    sub.s fv0, fa0, fa1                            # fv0 = result
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MUL_FLOAT_VFP
dvmCompiler_TEMPLATE_MUL_FLOAT_VFP:
/* File: mips/TEMPLATE_MUL_FLOAT_VFP.S */
/* File: mips/fbinop.S */
    /*
     * Generic 32-bit binary float operation. a0 = a1 op a2.
     *
     * For: add-fp, sub-fp, mul-fp, div-fp
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
    LOAD(a1, a2)                        # a1<- vCC
    .if 0
    beqz    a1, common_errDivideByZero  # is second operand zero?
    .endif
                               # optional op
    JAL(__mulsf3)                              # v0 = result
    STORE(v0, rOBJ)                     # vAA <- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
    LOAD_F(fa1, a2)                     # fa1<- vCC
    .if 0
    # is second operand zero?
    li.s        ft0, 0
    c.eq.s      fcc0, ft0, fa1          # condition bit and comparison with 0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    mul.s fv0, fa0, fa1                            # fv0 = result
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_DIV_FLOAT_VFP
dvmCompiler_TEMPLATE_DIV_FLOAT_VFP:
/* File: mips/TEMPLATE_DIV_FLOAT_VFP.S */
/* File: mips/fbinop.S */
    /*
     * Generic 32-bit binary float operation. a0 = a1 op a2.
     *
     * For: add-fp, sub-fp, mul-fp, div-fp
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
    LOAD(a1, a2)                        # a1<- vCC
    .if 0
    beqz    a1, common_errDivideByZero  # is second operand zero?
    .endif
                               # optional op
    JAL(__divsf3)                              # v0 = result
    STORE(v0, rOBJ)                     # vAA <- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
    LOAD_F(fa1, a2)                     # fa1<- vCC
    .if 0
    # is second operand zero?
    li.s        ft0, 0
    c.eq.s      fcc0, ft0, fa1          # condition bit and comparison with 0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    div.s fv0, fa0, fa1                            # fv0 = result
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_ADD_DOUBLE_VFP
dvmCompiler_TEMPLATE_ADD_DOUBLE_VFP:
/* File: mips/TEMPLATE_ADD_DOUBLE_VFP.S */
/* File: mips/fbinopWide.S */
    /*
     * Generic 64-bit binary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = a0-a1 op a2-a3".
     * This could be a MIPS instruction or a function call.
     * If "chkzero" is set to 1, we perform a divide-by-zero check on
     * the second operand (vCC).  Useful for integer division and modulus.
     *
     * for: add-long, sub-long, div-long, rem-long, and-long, or-long,
     *      xor-long, add-double, sub-double, mul-double, div-double,
     *      rem-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    move t1, a2                         # save a2
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, t1)            # a2/a3<- vCC/vCC+1
    .if 0
    or          t0, rARG2, rARG3        # second arg (a2-a3) is zero?
    beqz        t0, common_errDivideByZero
    .endif
                               # optional op
    JAL(__adddf3)                              # result<- op, a0-a3 changed
    STORE64(rRESULT0, rRESULT1, rOBJ)
#else
    LOAD64_F(fa0, fa0f, a1)
    LOAD64_F(fa1, fa1f, a2)
    .if 0
    li.d        ft0, 0
    c.eq.d      fcc0, fa1, ft0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    add.d fv0, fa0, fa1
    STORE64_F(fv0, fv0f, rOBJ)
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SUB_DOUBLE_VFP
dvmCompiler_TEMPLATE_SUB_DOUBLE_VFP:
/* File: mips/TEMPLATE_SUB_DOUBLE_VFP.S */
/* File: mips/fbinopWide.S */
    /*
     * Generic 64-bit binary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = a0-a1 op a2-a3".
     * This could be a MIPS instruction or a function call.
     * If "chkzero" is set to 1, we perform a divide-by-zero check on
     * the second operand (vCC).  Useful for integer division and modulus.
     *
     * for: add-long, sub-long, div-long, rem-long, and-long, or-long,
     *      xor-long, add-double, sub-double, mul-double, div-double,
     *      rem-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    move t1, a2                         # save a2
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, t1)            # a2/a3<- vCC/vCC+1
    .if 0
    or          t0, rARG2, rARG3        # second arg (a2-a3) is zero?
    beqz        t0, common_errDivideByZero
    .endif
                               # optional op
    JAL(__subdf3)                              # result<- op, a0-a3 changed
    STORE64(rRESULT0, rRESULT1, rOBJ)
#else
    LOAD64_F(fa0, fa0f, a1)
    LOAD64_F(fa1, fa1f, a2)
    .if 0
    li.d        ft0, 0
    c.eq.d      fcc0, fa1, ft0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    sub.d fv0, fa0, fa1
    STORE64_F(fv0, fv0f, rOBJ)
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MUL_DOUBLE_VFP
dvmCompiler_TEMPLATE_MUL_DOUBLE_VFP:
/* File: mips/TEMPLATE_MUL_DOUBLE_VFP.S */
/* File: mips/fbinopWide.S */
    /*
     * Generic 64-bit binary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = a0-a1 op a2-a3".
     * This could be a MIPS instruction or a function call.
     * If "chkzero" is set to 1, we perform a divide-by-zero check on
     * the second operand (vCC).  Useful for integer division and modulus.
     *
     * for: add-long, sub-long, div-long, rem-long, and-long, or-long,
     *      xor-long, add-double, sub-double, mul-double, div-double,
     *      rem-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    move t1, a2                         # save a2
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, t1)            # a2/a3<- vCC/vCC+1
    .if 0
    or          t0, rARG2, rARG3        # second arg (a2-a3) is zero?
    beqz        t0, common_errDivideByZero
    .endif
                               # optional op
    JAL(__muldf3)                              # result<- op, a0-a3 changed
    STORE64(rRESULT0, rRESULT1, rOBJ)
#else
    LOAD64_F(fa0, fa0f, a1)
    LOAD64_F(fa1, fa1f, a2)
    .if 0
    li.d        ft0, 0
    c.eq.d      fcc0, fa1, ft0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    mul.d fv0, fa0, fa1
    STORE64_F(fv0, fv0f, rOBJ)
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_DIV_DOUBLE_VFP
dvmCompiler_TEMPLATE_DIV_DOUBLE_VFP:
/* File: mips/TEMPLATE_DIV_DOUBLE_VFP.S */
/* File: mips/fbinopWide.S */
    /*
     * Generic 64-bit binary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = a0-a1 op a2-a3".
     * This could be a MIPS instruction or a function call.
     * If "chkzero" is set to 1, we perform a divide-by-zero check on
     * the second operand (vCC).  Useful for integer division and modulus.
     *
     * for: add-long, sub-long, div-long, rem-long, and-long, or-long,
     *      xor-long, add-double, sub-double, mul-double, div-double,
     *      rem-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = op1 address
     *     a2 = op2 address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    move t1, a2                         # save a2
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, t1)            # a2/a3<- vCC/vCC+1
    .if 0
    or          t0, rARG2, rARG3        # second arg (a2-a3) is zero?
    beqz        t0, common_errDivideByZero
    .endif
                               # optional op
    JAL(__divdf3)                              # result<- op, a0-a3 changed
    STORE64(rRESULT0, rRESULT1, rOBJ)
#else
    LOAD64_F(fa0, fa0f, a1)
    LOAD64_F(fa1, fa1f, a2)
    .if 0
    li.d        ft0, 0
    c.eq.d      fcc0, fa1, ft0
    bc1t        fcc0, common_errDivideByZero
    .endif
                               # optional op
    div.d fv0, fa0, fa1
    STORE64_F(fv0, fv0f, rOBJ)
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_DOUBLE_TO_FLOAT_VFP
dvmCompiler_TEMPLATE_DOUBLE_TO_FLOAT_VFP:
/* File: mips/TEMPLATE_DOUBLE_TO_FLOAT_VFP.S */
/* File: mips/funopNarrower.S */
    /*
     * Generic 64bit-to-32bit unary operation.  Provide an "instr" line
     * that specifies an instruction that performs "result = op a0/a1", where
     * "result" is a 32-bit quantity in a0.
     *
     * For: long-to-float, double-to-int, double-to-float
     * When hard floating point support is available, the source operand is
     * taken in fa0, except for the long-to-float opcode.
     * (This would work for long-to-int, but that instruction is actually
     * an exact match for OP_MOVE.)
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     *
     */
    move rINST, a0                      # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vB/vB+1
                               # optional op
    JAL(__truncdfsf2)                              # v0<- op, a0-a3 changed
.LTEMPLATE_DOUBLE_TO_FLOAT_VFP_set_vreg:
    STORE(v0, rINST)                    # vA<- v0
#else
    LOAD64_F(fa0, fa0f, a1)
                               # optional op
    cvt.s.d  fv0,fa0                            # fv0 = result
.LTEMPLATE_DOUBLE_TO_FLOAT_VFP_set_vreg_f:
    STORE_F(fv0, rINST)                 # vA<- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_DOUBLE_TO_INT_VFP
dvmCompiler_TEMPLATE_DOUBLE_TO_INT_VFP:
/* File: mips/TEMPLATE_DOUBLE_TO_INT_VFP.S */
/* File: mips/funopNarrower.S */
    /*
     * Generic 64bit-to-32bit unary operation.  Provide an "instr" line
     * that specifies an instruction that performs "result = op a0/a1", where
     * "result" is a 32-bit quantity in a0.
     *
     * For: long-to-float, double-to-int, double-to-float
     * When hard floating point support is available, the source operand is
     * taken in fa0, except for the long-to-float opcode.
     * (This would work for long-to-int, but that instruction is actually
     * an exact match for OP_MOVE.)
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     *
     */
    move rINST, a0                      # save a0
#ifdef  SOFT_FLOAT
    move t0, a1                         # save a1
    LOAD64(rARG0, rARG1, t0)            # a0/a1<- vB/vB+1
                               # optional op
    b    d2i_doconv                              # v0<- op, a0-a3 changed
.LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg:
    STORE(v0, rINST)                    # vA<- v0
#else
    LOAD64_F(fa0, fa0f, a1)
                               # optional op
    b    d2i_doconv                            # fv0 = result
.LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg_f:
    STORE_F(fv0, rINST)                 # vA<- fv0
#endif
    RETURN


/*
 * Convert the double in a0/a1 to an int in a0.
 *
 * We have to clip values to int min/max per the specification.  The
 * expected common case is a "reasonable" value that converts directly
 * to a modest integer.  The EABI convert function doesn't do this for us.
 * rBIX/rOBJ hold the argument across the helper calls (callee-saved
 * registers that are not otherwise live here).
 */

d2i_doconv:
#ifdef SOFT_FLOAT
    la          t0, .LDOUBLE_TO_INT_max
    LOAD64(rARG2, rARG3, t0)
    move        rBIX, rARG0                       # save a0
    move        rOBJ, rARG1                       #  and a1
    JAL(__gedf2)                               # is arg >= maxint?

    move        t0, v0
    li          v0, ~0x80000000                # return maxint (7fffffff)
    bgez        t0, .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg       # nonzero == yes

    move        rARG0, rBIX                       # recover arg
    move        rARG1, rOBJ
    la          t0, .LDOUBLE_TO_INT_min
    LOAD64(rARG2, rARG3, t0)
    JAL(__ledf2)                               # is arg <= minint?

    move        t0, v0
    li          v0, 0x80000000                 # return minint (80000000)
    blez        t0, .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg       # nonzero == yes

    move        rARG0, rBIX                  # recover arg
    move        rARG1, rOBJ
    move        rARG2, rBIX                  # compare against self
    move        rARG3, rOBJ
    JAL(__nedf2)                        # is arg == self?

    move        t0, v0                  # zero == no
    li          v0, 0
    bnez        t0, .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg        # return zero for NaN

    move        rARG0, rBIX                  # recover arg
    move        rARG1, rOBJ
    JAL(__fixdfsi)                      # convert double to int
    b           .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg
#else
    la          t0, .LDOUBLE_TO_INT_max
    LOAD64_F(fa1, fa1f, t0)
    c.ole.d     fcc0, fa1, fa0
    l.s         fv0, .LDOUBLE_TO_INT_maxret
    bc1t        .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg_f

    la          t0, .LDOUBLE_TO_INT_min
    LOAD64_F(fa1, fa1f, t0)
    c.ole.d     fcc0, fa0, fa1
    l.s         fv0, .LDOUBLE_TO_INT_minret
    bc1t        .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg_f

    mov.d       fa1, fa0
    c.un.d      fcc0, fa0, fa1
    li.s        fv0, 0
    bc1t        .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg_f

    trunc.w.d   fv0, fa0
    b           .LTEMPLATE_DOUBLE_TO_INT_VFP_set_vreg_f
#endif


.LDOUBLE_TO_INT_max:
    .dword   0x41dfffffffc00000                  # maxint (2^31-1), as a double
.LDOUBLE_TO_INT_min:
    .dword   0xc1e0000000000000                  # minint (-2^31), as a double
.LDOUBLE_TO_INT_maxret:
    .word   0x7fffffff
.LDOUBLE_TO_INT_minret:
    .word   0x80000000

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_FLOAT_TO_DOUBLE_VFP
dvmCompiler_TEMPLATE_FLOAT_TO_DOUBLE_VFP:
/* File: mips/TEMPLATE_FLOAT_TO_DOUBLE_VFP.S */
/* File: mips/funopWider.S */
    /*
     * Generic 32bit-to-64bit floating point unary operation.  Provide an
     * "instr" line that specifies an instruction that performs "d0 = op s0".
     *
     * For: int-to-double, float-to-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     */
    /* unop vA, vB */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vB
                               # optional op
    JAL(__extendsfdf2)                              # result<- op, a0-a3 changed

.LTEMPLATE_FLOAT_TO_DOUBLE_VFP_set_vreg:
    STORE64(rRESULT0, rRESULT1, rOBJ)   # vA/vA+1<- v0/v1
#else
    LOAD_F(fa0, a1)                     # fa0<- vB
                               # optional op
    cvt.d.s fv0, fa0

.LTEMPLATE_FLOAT_TO_DOUBLE_VFP_set_vreg:
    STORE64_F(fv0, fv0f, rOBJ)                          # vA/vA+1<- fv0/fv0f
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_FLOAT_TO_INT_VFP
dvmCompiler_TEMPLATE_FLOAT_TO_INT_VFP:
/* File: mips/TEMPLATE_FLOAT_TO_INT_VFP.S */
/* File: mips/funop.S */
    /*
     * Generic 32-bit unary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = op a0".
     * This could be a MIPS instruction or a function call.
     *
     * for: int-to-float, float-to-int
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
                               # optional op
    b    f2i_doconv                              # v0<- op, a0-a3 changed
.LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg:
    STORE(v0, rOBJ)                     # vAA<- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
                               # optional op
    b        f2i_doconv                            # fv0 = result
.LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg_f:
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/*
 * Not an entry point; this helper is reached only from the
 * FLOAT_TO_INT template above.
 */
f2i_doconv:
#ifdef SOFT_FLOAT
        li      a1, 0x4f000000  # (float)maxint
        move    rBIX, a0
        JAL(__gesf2)            # is arg >= maxint?
        move    t0, v0
        li      v0, ~0x80000000 # return maxint (7fffffff)
        bgez    t0, .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg

        move    a0, rBIX                # recover arg
        li      a1, 0xcf000000  # (float)minint
        JAL(__lesf2)

        move    t0, v0
        li      v0, 0x80000000  # return minint (80000000)
        blez    t0, .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg
        move    a0, rBIX
        move    a1, rBIX
        JAL(__nesf2)

        move    t0, v0
        li      v0, 0           # return zero for NaN
        bnez    t0, .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg

        move    a0, rBIX
        JAL(__fixsfsi)
        b .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg
#else
        l.s             fa1, .LFLOAT_TO_INT_max
        c.ole.s         fcc0, fa1, fa0
        l.s             fv0, .LFLOAT_TO_INT_ret_max
        bc1t            .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg_f

        l.s             fa1, .LFLOAT_TO_INT_min
        c.ole.s         fcc0, fa0, fa1
        l.s             fv0, .LFLOAT_TO_INT_ret_min
        bc1t            .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg_f

        mov.s           fa1, fa0
        c.un.s          fcc0, fa0, fa1
        li.s            fv0, 0
        bc1t            .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg_f

        trunc.w.s       fv0, fa0
        b .LTEMPLATE_FLOAT_TO_INT_VFP_set_vreg_f
#endif

.LFLOAT_TO_INT_max:
        .word   0x4f000000
.LFLOAT_TO_INT_min:
        .word   0xcf000000
.LFLOAT_TO_INT_ret_max:
        .word   0x7fffffff
.LFLOAT_TO_INT_ret_min:
        .word   0x80000000
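
/*
 * As with doubles above, a minimal C sketch of f2i_doconv's saturation
 * rules (illustration only, not part of the build; f2i is a hypothetical
 * name).  Note the clamp thresholds are +/-2^31 because maxint is not
 * exactly representable as a float:
 *
 *   #include <limits.h>
 *
 *   int f2i(float f) {
 *       if (f >= 2147483648.0f)  return INT_MAX;  // clamp to maxint
 *       if (f <= -2147483648.0f) return INT_MIN;  // clamp to minint
 *       if (f != f)              return 0;        // NaN converts to zero
 *       return (int) f;                           // in range: truncate
 *   }
 */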


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INT_TO_DOUBLE_VFP
dvmCompiler_TEMPLATE_INT_TO_DOUBLE_VFP:
/* File: mips/TEMPLATE_INT_TO_DOUBLE_VFP.S */
/* File: mips/funopWider.S */
    /*
     * Generic 32bit-to-64bit floating point unary operation.  Provide an
     * "instr" line that specifies an instruction that performs "d0 = op s0".
     *
     * For: int-to-double, float-to-double
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     */
    /* unop vA, vB */
    move rOBJ, a0                       # save a0
#ifdef  SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vB
                               # optional op
    JAL(__floatsidf)                              # result<- op, a0-a3 changed

.LTEMPLATE_INT_TO_DOUBLE_VFP_set_vreg:
    STORE64(rRESULT0, rRESULT1, rOBJ)   # vA/vA+1<- v0/v1
#else
    LOAD_F(fa0, a1)                     # fa0<- vB
                               # optional op
    cvt.d.w    fv0, fa0

.LTEMPLATE_INT_TO_DOUBLE_VFP_set_vreg:
    STORE64_F(fv0, fv0f, rOBJ)                          # vA/vA+1<- fv0/fv0f
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INT_TO_FLOAT_VFP
dvmCompiler_TEMPLATE_INT_TO_FLOAT_VFP:
/* File: mips/TEMPLATE_INT_TO_FLOAT_VFP.S */
/* File: mips/funop.S */
    /*
     * Generic 32-bit unary operation.  Provide an "instr" line that
     * specifies an instruction that performs "result = op a0".
     * This could be a MIPS instruction or a function call.
     *
     * for: int-to-float, float-to-int
     *
     * On entry:
     *     a0 = target dalvik register address
     *     a1 = src dalvik register address
     *
     * IMPORTANT: you may specify "chkzero" or "preinstr" but not both.
     *
     */
    move rOBJ, a0                       # save a0
#ifdef SOFT_FLOAT
    LOAD(a0, a1)                        # a0<- vBB
                               # optional op
    JAL(__floatsisf)                              # v0<- op, a0-a3 changed
.LTEMPLATE_INT_TO_FLOAT_VFP_set_vreg:
    STORE(v0, rOBJ)                     # vAA<- v0
#else
    LOAD_F(fa0, a1)                     # fa0<- vBB
                               # optional op
    cvt.s.w fv0, fa0                            # fv0 = result
.LTEMPLATE_INT_TO_FLOAT_VFP_set_vreg_f:
    STORE_F(fv0, rOBJ)                  # vAA <- fv0
#endif
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_CMPG_DOUBLE_VFP
dvmCompiler_TEMPLATE_CMPG_DOUBLE_VFP:
/* File: mips/TEMPLATE_CMPG_DOUBLE_VFP.S */
/* File: mips/TEMPLATE_CMPL_DOUBLE_VFP.S */
    /*
     * Compare two double precision floating-point values.  Puts 0, 1, or -1 into the
     * destination register based on the results of the comparison.
     *
     * Provide a "naninst" instruction that puts 1 or -1 into a1 depending
     * on what value we'd like to return when one of the operands is NaN.
     *
     * The operation we're implementing is:
     *   if (x == y)
     *     return 0;
     *   else if (x < y)
     *     return -1;
     *   else if (x > y)
     *     return 1;
     *   else
     *     return {-1,1};  // one or both operands was NaN
     *
     * On entry:
     *    a0 = &op1 [vBB]
     *    a1 = &op2 [vCC]
     *
     * for: cmpl-double, cmpg-double
     */
    /* op vAA, vBB, vCC */

    /* "clasic" form */
#ifdef  SOFT_FLOAT
    move rOBJ, a0                       # save a0
    move rBIX, a1                       # save a1
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__eqdf2)                        # v0<- (vBB == vCC)
    li       rTEMP, 0                   # vAA<- 0
    beqz     v0, TEMPLATE_CMPG_DOUBLE_VFP_finish
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__ltdf2)                        # a0<- (vBB < vCC)
    li       rTEMP, -1                  # vAA<- -1
    bltz     v0, TEMPLATE_CMPG_DOUBLE_VFP_finish
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__gtdf2)                        # v0<- (vBB > vCC)
    li      rTEMP, 1                    # vAA<- 1
    bgtz    v0, TEMPLATE_CMPG_DOUBLE_VFP_finish
#else
    LOAD64_F(fs0, fs0f, a0)             # fs0<- vBB
    LOAD64_F(fs1, fs1f, a1)             # fs1<- vCC
    c.olt.d     fcc0, fs0, fs1          # is fs0 < fs1?
    li          rTEMP, -1
    bc1t        fcc0, TEMPLATE_CMPG_DOUBLE_VFP_finish
    c.olt.d     fcc0, fs1, fs0
    li          rTEMP, 1
    bc1t        fcc0, TEMPLATE_CMPG_DOUBLE_VFP_finish
    c.eq.d      fcc0, fs0, fs1
    li          rTEMP, 0
    bc1t        fcc0, TEMPLATE_CMPG_DOUBLE_VFP_finish
#endif

    li            rTEMP, 1

TEMPLATE_CMPG_DOUBLE_VFP_finish:
    move     v0, rTEMP                  # v0<- vAA
    RETURN
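
/*
 * In C terms, cmpg and cmpl differ only in the value produced when the
 * comparison is unordered (a sketch, not part of the build; cmp_double
 * is a hypothetical name; the float templates below follow the same
 * pattern):
 *
 *   int cmp_double(double x, double y, int nanResult) {
 *       if (x < y)  return -1;
 *       if (x > y)  return  1;
 *       if (x == y) return  0;
 *       return nanResult;   // NaN: +1 for cmpg, -1 for cmpl
 *   }
 */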


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_CMPL_DOUBLE_VFP
dvmCompiler_TEMPLATE_CMPL_DOUBLE_VFP:
/* File: mips/TEMPLATE_CMPL_DOUBLE_VFP.S */
    /*
     * Compare two double precision floating-point values.  Puts 0, 1, or -1 into the
     * destination register based on the results of the comparison.
     *
     * Provide a "naninst" instruction that puts 1 or -1 into a1 depending
     * on what value we'd like to return when one of the operands is NaN.
     *
     * The operation we're implementing is:
     *   if (x == y)
     *     return 0;
     *   else if (x < y)
     *     return -1;
     *   else if (x > y)
     *     return 1;
     *   else
     *     return {-1,1};  // one or both operands was NaN
     *
     * On entry:
     *    a0 = &op1 [vBB]
     *    a1 = &op2 [vCC]
     *
     * for: cmpl-double, cmpg-double
     */
    /* op vAA, vBB, vCC */

    /* "clasic" form */
#ifdef  SOFT_FLOAT
    move rOBJ, a0                       # save a0
    move rBIX, a1                       # save a1
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__eqdf2)                        # v0<- (vBB == vCC)
    li       rTEMP, 0                   # vAA<- 0
    beqz     v0, TEMPLATE_CMPL_DOUBLE_VFP_finish
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__ltdf2)                        # a0<- (vBB < vCC)
    li       rTEMP, -1                  # vAA<- -1
    bltz     v0, TEMPLATE_CMPL_DOUBLE_VFP_finish
    LOAD64(rARG0, rARG1, rOBJ)          # a0/a1<- vBB/vBB+1
    LOAD64(rARG2, rARG3, rBIX)          # a2/a3<- vCC/vCC+1
    JAL(__gtdf2)                        # v0<- (vBB > vCC)
    li      rTEMP, 1                    # vAA<- 1
    bgtz    v0, TEMPLATE_CMPL_DOUBLE_VFP_finish
#else
    LOAD64_F(fs0, fs0f, a0)             # fs0<- vBB
    LOAD64_F(fs1, fs1f, a1)             # fs1<- vCC
    c.olt.d     fcc0, fs0, fs1          # is fs0 < fs1?
    li          rTEMP, -1
    bc1t        fcc0, TEMPLATE_CMPL_DOUBLE_VFP_finish
    c.olt.d     fcc0, fs1, fs0
    li          rTEMP, 1
    bc1t        fcc0, TEMPLATE_CMPL_DOUBLE_VFP_finish
    c.eq.d      fcc0, fs0, fs1
    li          rTEMP, 0
    bc1t        fcc0, TEMPLATE_CMPL_DOUBLE_VFP_finish
#endif

    li     rTEMP, -1

TEMPLATE_CMPL_DOUBLE_VFP_finish:
    move     v0, rTEMP                  # v0<- vAA
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_CMPG_FLOAT_VFP
dvmCompiler_TEMPLATE_CMPG_FLOAT_VFP:
/* File: mips/TEMPLATE_CMPG_FLOAT_VFP.S */
/* File: mips/TEMPLATE_CMPL_FLOAT_VFP.S */
    /*
     * Compare two floating-point values.  Puts 0, 1, or -1 into the
     * destination register based on the results of the comparison.
     *
     * Provide a "naninst" instruction that puts 1 or -1 into a1 depending
     * on what value we'd like to return when one of the operands is NaN.
     *
     * The operation we're implementing is:
     *   if (x == y)
     *     return 0;
     *   else if (x < y)
     *     return -1;
     *   else if (x > y)
     *     return 1;
     *   else
     *     return {-1,1};  // one or both operands was NaN
     *
     * On entry:
     *    a0 = &op1 [vBB]
     *    a1 = &op2 [vCC]
     *
     * for: cmpl-float, cmpg-float
     */
    /* op vAA, vBB, vCC */

    /* "clasic" form */
#ifdef  SOFT_FLOAT
    LOAD(rOBJ, a0)                      # rOBJ<- vBB
    LOAD(rBIX, a1)                      # rBIX<- vCC
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__eqsf2)                        # v0<- (vBB == vCC)
    li       rTEMP, 0                   # vAA<- 0
    beqz     v0, TEMPLATE_CMPG_FLOAT_VFP_finish
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__ltsf2)                        # a0<- (vBB < vCC)
    li       rTEMP, -1                  # vAA<- -1
    bltz     v0, TEMPLATE_CMPG_FLOAT_VFP_finish
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__gtsf2)                        # v0<- (vBB > vCC)
    li      rTEMP, 1                    # vAA<- 1
    bgtz    v0, TEMPLATE_CMPG_FLOAT_VFP_finish
#else
    LOAD_F(fs0, a0)                     # fs0<- vBB
    LOAD_F(fs1, a1)                     # fs1<- vCC
    c.olt.s     fcc0, fs0, fs1          # is fs0 < fs1?
    li          rTEMP, -1
    bc1t        fcc0, TEMPLATE_CMPG_FLOAT_VFP_finish
    c.olt.s     fcc0, fs1, fs0
    li          rTEMP, 1
    bc1t        fcc0, TEMPLATE_CMPG_FLOAT_VFP_finish
    c.eq.s      fcc0, fs0, fs1
    li          rTEMP, 0
    bc1t        fcc0, TEMPLATE_CMPG_FLOAT_VFP_finish
#endif

    li     rTEMP, 1

TEMPLATE_CMPG_FLOAT_VFP_finish:
    move     v0, rTEMP                  # v0<- vAA
    RETURN


/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_CMPL_FLOAT_VFP
dvmCompiler_TEMPLATE_CMPL_FLOAT_VFP:
/* File: mips/TEMPLATE_CMPL_FLOAT_VFP.S */
    /*
     * Compare two floating-point values.  Puts 0, 1, or -1 into the
     * destination register based on the results of the comparison.
     *
     * Provide a "naninst" instruction that puts 1 or -1 into a1 depending
     * on what value we'd like to return when one of the operands is NaN.
     *
     * The operation we're implementing is:
     *   if (x == y)
     *     return 0;
     *   else if (x < y)
     *     return -1;
     *   else if (x > y)
     *     return 1;
     *   else
     *     return {-1,1};  // one or both operands was NaN
     *
     * On entry:
     *    a0 = &op1 [vBB]
     *    a1 = &op2 [vCC]
     *
     * for: cmpl-float, cmpg-float
     */
    /* op vAA, vBB, vCC */

    /* "clasic" form */
#ifdef  SOFT_FLOAT
    LOAD(rOBJ, a0)                      # rOBJ<- vBB
    LOAD(rBIX, a1)                      # rBIX<- vCC
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__eqsf2)                        # v0<- (vBB == vCC)
    li       rTEMP, 0                   # vAA<- 0
    beqz     v0, TEMPLATE_CMPL_FLOAT_VFP_finish
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__ltsf2)                        # a0<- (vBB < vCC)
    li       rTEMP, -1                  # vAA<- -1
    bltz     v0, TEMPLATE_CMPL_FLOAT_VFP_finish
    move     a0, rOBJ                   # a0<- vBB
    move     a1, rBIX                   # a1<- vCC
    JAL(__gtsf2)                        # v0<- (vBB > vCC)
    li      rTEMP, 1                    # vAA<- 1
    bgtz    v0, TEMPLATE_CMPL_FLOAT_VFP_finish
#else
    LOAD_F(fs0, a0)                     # fs0<- vBB
    LOAD_F(fs1, a1)                     # fs1<- vCC
    c.olt.s     fcc0, fs0, fs1          # is fs0 < fs1?
    li          rTEMP, -1
    bc1t        fcc0, TEMPLATE_CMPL_FLOAT_VFP_finish
    c.olt.s     fcc0, fs1, fs0
    li          rTEMP, 1
    bc1t        fcc0, TEMPLATE_CMPL_FLOAT_VFP_finish
    c.eq.s      fcc0, fs0, fs1
    li          rTEMP, 0
    bc1t        fcc0, TEMPLATE_CMPL_FLOAT_VFP_finish
#endif

    li     rTEMP, -1

TEMPLATE_CMPL_FLOAT_VFP_finish:
    move     v0, rTEMP                  # v0<- vAA
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SQRT_DOUBLE_VFP
dvmCompiler_TEMPLATE_SQRT_DOUBLE_VFP:
/* File: mips/TEMPLATE_SQRT_DOUBLE_VFP.S */

    /*
     * 64-bit floating point sqrt operation.
     * If the result is a NaN, bail out to library code to do
     * the right thing.
     *
     * On entry:
     *     a2 src addr of op1
     * On exit:
     *     v0,v1/fv0 = res
     */
#ifdef  SOFT_FLOAT
    LOAD64(rARG0, rARG1, a2)        # a0/a1<- vBB/vBB+1
#else
    LOAD64_F(fa0, fa0f, a2)         # fa0/fa0f<- vBB/vBB+1
    sqrt.d	fv0, fa0
    c.eq.d	fv0, fv0
    bc1t	1f
#endif
    JAL(sqrt)
1:
    RETURN
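
/*
 * A C sketch of the hard-float path above (illustration only, not part
 * of the build; hw_sqrt is a hypothetical stand-in for the inline sqrt.d
 * instruction):
 *
 *   #include <math.h>
 *
 *   double dvm_sqrt(double x) {
 *       double res = hw_sqrt(x);   // inline sqrt.d
 *       if (res != res)            // NaN result: defer to the library
 *           res = sqrt(x);         //   so corner cases match libm
 *       return res;
 *   }
 */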

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_THROW_EXCEPTION_COMMON
dvmCompiler_TEMPLATE_THROW_EXCEPTION_COMMON:
/* File: mips/TEMPLATE_THROW_EXCEPTION_COMMON.S */
    /*
     * Throw an exception from JIT'ed code.
     * On entry:
     *    a0    Dalvik PC that raises the exception
     */
    j      .LhandleException

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MEM_OP_DECODE
dvmCompiler_TEMPLATE_MEM_OP_DECODE:
/* File: mips/TEMPLATE_MEM_OP_DECODE.S */
#if defined(WITH_SELF_VERIFICATION)
    /*
     * This handler encapsulates heap memory ops for selfVerification mode.
     *
     * The call to the handler is inserted prior to a heap memory operation.
     * This handler then calls a function to decode the memory op, and process
     * it accordingly. Afterwards, the handler changes the return address to
     * skip the memory op so it never gets executed.
     */
#ifdef HARD_FLOAT
    /* push f0-f31 onto stack */
    swc1    f0, fr0*-4(sp)              # push f0
    swc1    f1, fr1*-4(sp)              # push f1
    swc1    f2, fr2*-4(sp)              # push f2
    swc1    f3, fr3*-4(sp)              # push f3
    swc1    f4, fr4*-4(sp)              # push f4
    swc1    f5, fr5*-4(sp)              # push f5
    swc1    f6, fr6*-4(sp)              # push f6
    swc1    f7, fr7*-4(sp)              # push f7
    swc1    f8, fr8*-4(sp)              # push f8
    swc1    f9, fr9*-4(sp)              # push f9
    swc1    f10, fr10*-4(sp)            # push f10
    swc1    f11, fr11*-4(sp)            # push f11
    swc1    f12, fr12*-4(sp)            # push f12
    swc1    f13, fr13*-4(sp)            # push f13
    swc1    f14, fr14*-4(sp)            # push f14
    swc1    f15, fr15*-4(sp)            # push f15
    swc1    f16, fr16*-4(sp)            # push f16
    swc1    f17, fr17*-4(sp)            # push f17
    swc1    f18, fr18*-4(sp)            # push f18
    swc1    f19, fr19*-4(sp)            # push f19
    swc1    f20, fr20*-4(sp)            # push f20
    swc1    f21, fr21*-4(sp)            # push f21
    swc1    f22, fr22*-4(sp)            # push f22
    swc1    f23, fr23*-4(sp)            # push f23
    swc1    f24, fr24*-4(sp)            # push f24
    swc1    f25, fr25*-4(sp)            # push f25
    swc1    f26, fr26*-4(sp)            # push f26
    swc1    f27, fr27*-4(sp)            # push f27
    swc1    f28, fr28*-4(sp)            # push f28
    swc1    f29, fr29*-4(sp)            # push f29
    swc1    f30, fr30*-4(sp)            # push f30
    swc1    f31, fr31*-4(sp)            # push f31

    sub     sp, (32-0)*4                # adjust stack pointer
#endif

    /* push gp registers (except zero, gp, sp, and fp) */
    .set noat
    sw      AT, r_AT*-4(sp)             # push at
    .set at
    sw      v0, r_V0*-4(sp)             # push v0
    sw      v1, r_V1*-4(sp)             # push v1
    sw      a0, r_A0*-4(sp)             # push a0
    sw      a1, r_A1*-4(sp)             # push a1
    sw      a2, r_A2*-4(sp)             # push a2
    sw      a3, r_A3*-4(sp)             # push a3
    sw      t0, r_T0*-4(sp)             # push t0
    sw      t1, r_T1*-4(sp)             # push t1
    sw      t2, r_T2*-4(sp)             # push t2
    sw      t3, r_T3*-4(sp)             # push t3
    sw      t4, r_T4*-4(sp)             # push t4
    sw      t5, r_T5*-4(sp)             # push t5
    sw      t6, r_T6*-4(sp)             # push t6
    sw      t7, r_T7*-4(sp)             # push t7
    sw      s0, r_S0*-4(sp)             # push s0
    sw      s1, r_S1*-4(sp)             # push s1
    sw      s2, r_S2*-4(sp)             # push s2
    sw      s3, r_S3*-4(sp)             # push s3
    sw      s4, r_S4*-4(sp)             # push s4
    sw      s5, r_S5*-4(sp)             # push s5
    sw      s6, r_S6*-4(sp)             # push s6
    sw      s7, r_S7*-4(sp)             # push s7
    sw      t8, r_T8*-4(sp)             # push t8
    sw      t9, r_T9*-4(sp)             # push t9
    sw      k0, r_K0*-4(sp)             # push k0
    sw      k1, r_K1*-4(sp)             # push k1
    sw      ra, r_RA*-4(sp)             # push RA

    # Note: even if we don't save all 32 registers, we still need to
    #       adjust SP by 32 registers due to the way we are storing
    #       the registers on the stack.
    sub     sp, (32-0)*4                # adjust stack pointer

    la     a2, .LdvmSelfVerificationMemOpDecode  # defined in footer.S
    lw     a2, (a2)
    move   a0, ra                       # a0<- link register
    move   a1, sp                       # a1<- stack pointer
    JALR(a2)

    /* pop gp registers (except zero, gp, sp, and fp) */
    # Note: even if we don't save all 32 registers, we still need to
    #       adjust SP by 32 registers due to the way we are storing
    #       the registers on the stack.
    add     sp, (32-0)*4                # adjust stack pointer
    .set noat
    lw      AT, r_AT*-4(sp)             # pop at
    .set at
    lw      v0, r_V0*-4(sp)             # pop v0
    lw      v1, r_V1*-4(sp)             # pop v1
    lw      a0, r_A0*-4(sp)             # pop a0
    lw      a1, r_A1*-4(sp)             # pop a1
    lw      a2, r_A2*-4(sp)             # pop a2
    lw      a3, r_A3*-4(sp)             # pop a3
    lw      t0, r_T0*-4(sp)             # pop t0
    lw      t1, r_T1*-4(sp)             # pop t1
    lw      t2, r_T2*-4(sp)             # pop t2
    lw      t3, r_T3*-4(sp)             # pop t3
    lw      t4, r_T4*-4(sp)             # pop t4
    lw      t5, r_T5*-4(sp)             # pop t5
    lw      t6, r_T6*-4(sp)             # pop t6
    lw      t7, r_T7*-4(sp)             # pop t7
    lw      s0, r_S0*-4(sp)             # pop s0
    lw      s1, r_S1*-4(sp)             # pop s1
    lw      s2, r_S2*-4(sp)             # pop s2
    lw      s3, r_S3*-4(sp)             # pop s3
    lw      s4, r_S4*-4(sp)             # pop s4
    lw      s5, r_S5*-4(sp)             # pop s5
    lw      s6, r_S6*-4(sp)             # pop s6
    lw      s7, r_S7*-4(sp)             # pop s7
    lw      t8, r_T8*-4(sp)             # pop t8
    lw      t9, r_T9*-4(sp)             # pop t9
    lw      k0, r_K0*-4(sp)             # pop k0
    lw      k1, r_K1*-4(sp)             # pop k1
    lw      ra, r_RA*-4(sp)             # pop RA

#ifdef HARD_FLOAT
    /* pop f0-f31 from stack */
    add     sp, (32-0)*4                # adjust stack pointer
    lwc1    f0, fr0*-4(sp)              # pop f0
    lwc1    f1, fr1*-4(sp)              # pop f1
    lwc1    f2, fr2*-4(sp)              # pop f2
    lwc1    f3, fr3*-4(sp)              # pop f3
    lwc1    f4, fr4*-4(sp)              # pop f4
    lwc1    f5, fr5*-4(sp)              # pop f5
    lwc1    f6, fr6*-4(sp)              # pop f6
    lwc1    f7, fr7*-4(sp)              # pop f7
    lwc1    f8, fr8*-4(sp)              # pop f8
    lwc1    f9, fr9*-4(sp)              # pop f9
    lwc1    f10, fr10*-4(sp)            # pop f10
    lwc1    f11, fr11*-4(sp)            # pop f11
    lwc1    f12, fr12*-4(sp)            # pop f12
    lwc1    f13, fr13*-4(sp)            # pop f13
    lwc1    f14, fr14*-4(sp)            # pop f14
    lwc1    f15, fr15*-4(sp)            # pop f15
    lwc1    f16, fr16*-4(sp)            # pop f16
    lwc1    f17, fr17*-4(sp)            # pop f17
    lwc1    f18, fr18*-4(sp)            # pop f18
    lwc1    f19, fr19*-4(sp)            # pop f19
    lwc1    f20, fr20*-4(sp)            # pop f20
    lwc1    f21, fr21*-4(sp)            # pop f21
    lwc1    f22, fr22*-4(sp)            # pop f22
    lwc1    f23, fr23*-4(sp)            # pop f23
    lwc1    f24, fr24*-4(sp)            # pop f24
    lwc1    f25, fr25*-4(sp)            # pop f25
    lwc1    f26, fr26*-4(sp)            # pop f26
    lwc1    f27, fr27*-4(sp)            # pop f27
    lwc1    f28, fr28*-4(sp)            # pop f28
    lwc1    f29, fr29*-4(sp)            # pop f29
    lwc1    f30, fr30*-4(sp)            # pop f30
    lwc1    f31, fr31*-4(sp)            # pop f31
#endif

    RETURN
#endif

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_STRING_COMPARETO
dvmCompiler_TEMPLATE_STRING_COMPARETO:
/* File: mips/TEMPLATE_STRING_COMPARETO.S */
    /*
     * String's compareTo.
     *
     * Requires a0/a1 to have been previously checked for null.  Returns
     * negative if this string is < comp, 0 if they are the same, and
     * positive if this string is > comp.
     *
     * IMPORTANT NOTE:
     *
     * This code relies on hard-coded offsets for string objects, and must be
     * kept in sync with definitions in UtfString.h.  See asm-constants.h
     *
     * On entry:
     *    a0:   this object pointer
     *    a1:   comp object pointer
     *
     */
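
    /*
     * A C sketch of the algorithm (illustration only, not part of the
     * build; the field and helper names are hypothetical stand-ins for
     * the STRING_FIELDOFF_* offsets used below, value->contents stands
     * for the 16-byte skip past the char-array header, and u2 is
     * Dalvik's 16-bit unsigned char type):
     *
     *   int compareTo(const String* self, const String* comp) {
     *       if (self == comp) return 0;
     *       const u2* s = self->value->contents + self->offset;
     *       const u2* c = comp->value->contents + comp->offset;
     *       int countDiff = self->count - comp->count;
     *       int minCount  = self->count <= comp->count ? self->count
     *                                                  : comp->count;
     *       for (int i = 0; i < minCount; i++) {
     *           int diff = (int) s[i] - (int) c[i];
     *           if (diff != 0) return diff;
     *       }
     *       return countDiff;   // equal prefix: longer string wins
     *   }
     */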

     subu  v0, a0, a1                # Same?
     bnez  v0, 1f
     RETURN
1:
     lw    t0, STRING_FIELDOFF_OFFSET(a0)
     lw    t1, STRING_FIELDOFF_OFFSET(a1)
     lw    t2, STRING_FIELDOFF_COUNT(a0)
     lw    a2, STRING_FIELDOFF_COUNT(a1)
     lw    a0, STRING_FIELDOFF_VALUE(a0)
     lw    a1, STRING_FIELDOFF_VALUE(a1)

    /*
     * At this point, we have this/comp:
     *    offset: t0/t1
     *    count:  t2/a2
     *    value:  a0/a1
     * We're going to compute
     *    a3 <- countDiff
     *    a2 <- minCount
     */
     subu  a3, t2, a2                # a3<- countDiff
     sleu  t7, t2, a2
     movn  a2, t2, t7                # a2<- minCount

     /*
      * Note: data pointers point to first element.
      */
     addu  a0, 16                    # point to contents[0]
     addu  a1, 16                    # point to contents[0]

     /* Now, build pointers to the string data */
     sll   t7, t0, 1                 # multiply offset by 2
     addu  a0, a0, t7
     sll   t7, t1, 1                 # multiply offset by 2
     addu  a1, a1, t7

     /*
      * At this point we have:
      *   a0: *this string data
      *   a1: *comp string data
      *   a2: iteration count for comparison
      *   a3: value to return if the first part of the string is equal
      *   v0: reserved for result
      *   t0-t5 available for loading string data
      */

     subu  a2, 2
     bltz  a2, do_remainder2

     /*
      * Unroll the first two checks so we can quickly catch early mismatch
      * on long strings (but preserve incoming alignment)
      */
     lhu   t0, 0(a0)
     lhu   t1, 0(a1)
     subu  v0, t0, t1
     beqz  v0, 1f
     RETURN
1:
     lhu   t2, 2(a0)
     lhu   t3, 2(a1)
     subu  v0, t2, t3
     beqz  v0, 2f
     RETURN
2:
     addu  a0, 4                     # offset to contents[2]
     addu  a1, 4                     # offset to contents[2]
     li    t7, 28
     bgt   a2, t7, do_memcmp16
     subu  a2, 3
     bltz  a2, do_remainder

loopback_triple:
     lhu   t0, 0(a0)
     lhu   t1, 0(a1)
     subu  v0, t0, t1
     beqz  v0, 1f
     RETURN
1:
     lhu   t2, 2(a0)
     lhu   t3, 2(a1)
     subu  v0, t2, t3
     beqz  v0, 2f
     RETURN
2:
     lhu   t4, 4(a0)
     lhu   t5, 4(a1)
     subu  v0, t4, t5
     beqz  v0, 3f
     RETURN
3:
     addu  a0, 6                     # offset to contents[i+3]
     addu  a1, 6                     # offset to contents[i+3]
     subu  a2, 3
     bgez  a2, loopback_triple

do_remainder:
     addu  a2, 3
     beqz  a2, returnDiff

loopback_single:
     lhu   t0, 0(a0)
     lhu   t1, 0(a1)
     subu  v0, t0, t1
     bnez  v0, 1f
     addu  a0, 2                     # offset to contents[i+1]
     addu  a1, 2                     # offset to contents[i+1]
     subu  a2, 1
     bnez  a2, loopback_single

returnDiff:
     move  v0, a3
1:
     RETURN

do_remainder2:
     addu  a2, 2
     bnez  a2, loopback_single
     move  v0, a3
     RETURN

    /* Long string case */
do_memcmp16:
     move  rOBJ, a3                  # save return value if strings are equal
     JAL(__memcmp16)
     seq   t0, v0, zero
     movn  v0, rOBJ, t0              # overwrite return value if strings are equal
     RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_STRING_INDEXOF
dvmCompiler_TEMPLATE_STRING_INDEXOF:
/* File: mips/TEMPLATE_STRING_INDEXOF.S */
    /*
     * String's indexOf.
     *
     * Requires a0 to have been previously checked for null.  Will
     * return index of match of a1 in v0.
     *
     * IMPORTANT NOTE:
     *
     * This code relies on hard-coded offsets for string objects, and must be
     * kept in sync with definitions in UtfString.h.  See asm-constants.h.
     *
     * On entry:
     *    a0:   string object pointer
     *    a1:   char to match
     *    a2:   Starting offset in string data
     */
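
    /*
     * A C sketch of the search (illustration only, not part of the
     * build; field names are hypothetical stand-ins for the
     * STRING_FIELDOFF_* offsets used below):
     *
     *   int indexOf(const String* s, u2 ch, int start) {
     *       const u2* data = s->value->contents + s->offset;
     *       if (start < 0)        start = 0;        // clamp to [0..count]
     *       if (start > s->count) start = s->count;
     *       for (int i = start; i < s->count; i++) {
     *           if (data[i] == ch) return i;
     *       }
     *       return -1;                              // no match
     *   }
     */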

     lw    t0, STRING_FIELDOFF_OFFSET(a0)
     lw    t1, STRING_FIELDOFF_COUNT(a0)
     lw    v0, STRING_FIELDOFF_VALUE(a0)

    /*
     * At this point, we have:
     *    v0: object pointer
     *    a1: char to match
     *    a2: starting offset
     *    t0: offset
     *    t1: string length
     */

    /* Point to first element */
     addu  v0, 16                    # point to contents[0]

    /* Build pointer to start of string data */
     sll   t7, t0, 1                 # multiply offset by 2
     addu  v0, v0, t7

    /* Save a copy of starting data in v1 */
     move  v1, v0

    /* Clamp start to [0..count] */
     slt   t7, a2, zero
     movn  a2, zero, t7
     sgt   t7, a2, t1
     movn  a2, t1, t7

    /* Build pointer to start of data to compare */
     sll   t7, a2, 1                # multiply offset by 2
     addu  v0, v0, t7

    /* Compute iteration count */
     subu  a3, t1, a2

    /*
     * At this point we have:
     *   v0: start of data to test
     *   a1: char to compare
     *   a3: iteration count
     *   v1: original start of string
     *   t0-t7 available for loading string data
     */
     subu  a3, 4
     bltz  a3, indexof_remainder

indexof_loop4:
     lhu   t0, 0(v0)
     beq   t0, a1, match_0
     lhu   t0, 2(v0)
     beq   t0, a1, match_1
     lhu   t0, 4(v0)
     beq   t0, a1, match_2
     lhu   t0, 6(v0)
     beq   t0, a1, match_3
     addu  v0, 8                     # offset to contents[i+4]
     subu  a3, 4
     bgez  a3, indexof_loop4

indexof_remainder:
     addu  a3, 4
     beqz  a3, indexof_nomatch

indexof_loop1:
     lhu   t0, 0(v0)
     beq   t0, a1, match_0
     addu  v0, 2                     # offset to contents[i+1]
     subu  a3, 1
     bnez  a3, indexof_loop1

indexof_nomatch:
     li    v0, -1
     RETURN

match_0:
     subu  v0, v1
     sra   v0, v0, 1                 # divide by 2
     RETURN
match_1:
     addu  v0, 2
     subu  v0, v1
     sra   v0, v0, 1                 # divide by 2
     RETURN
match_2:
     addu  v0, 4
     subu  v0, v1
     sra   v0, v0, 1                 # divide by 2
     RETURN
match_3:
     addu  v0, 6
     subu  v0, v1
     sra   v0, v0, 1                 # divide by 2
     RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INTERPRET
dvmCompiler_TEMPLATE_INTERPRET:
/* File: mips/TEMPLATE_INTERPRET.S */
    /*
     * This handler transfers control to the interpreter without performing
     * any lookups.  It may be called either as part of a normal chaining
     * operation, or from the transition code in header.S.  We distinguish
     * the two cases by looking at the link register.  If called from a
     * translation chain, it will point to the chaining Dalvik PC.
     * On entry:
     *    ra - if NULL:
     *        a1 - the Dalvik PC to begin interpretation.
     *    else
     *        [ra] contains Dalvik PC to begin interpretation
     *    rSELF - pointer to thread
     *    rFP - Dalvik frame pointer
     */
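
    /*
     * In C terms (a sketch, not part of the build):
     *
     *   const u2* dPC = (ra == NULL) ? (const u2*) a1
     *                                : *(const u2**) ra;
     *   dvmJitToInterpPunt(dPC);    // does not return
     */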
    la      t0, dvmJitToInterpPunt
    move    a0, a1
    beq     ra, zero, 1f
    lw      a0, 0(ra)
1:
    jr      t0
    # doesn't return

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MONITOR_ENTER
dvmCompiler_TEMPLATE_MONITOR_ENTER:
/* File: mips/TEMPLATE_MONITOR_ENTER.S */
    /*
     * Call out to the runtime to lock an object.  Because this thread
     * may have been suspended in THREAD_MONITOR state and the Jit's
     * translation cache subsequently cleared, we cannot return directly.
     * Instead, unconditionally transition to the interpreter to resume.
     *
     * On entry:
     *    a0 - self pointer
     *    a1 - the object (which has already been null-checked by the caller)
     *    rPC - the Dalvik PC of the following instruction.
     */
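
    /*
     * In C terms (a sketch, not part of the build):
     *
     *   self->inJitCodeCache = NULL;    // record that we're not returning
     *   dvmLockObject(self, obj);       // may block and suspend the thread
     *   dvmJitToInterpNoChain(rPC);     // always resume in the interpreter
     */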
    la     a2, .LdvmLockObject
    lw     t9, (a2)
    sw     zero, offThread_inJitCodeCache(a0)   # record that we're not returning
    JALR(t9)                                    # dvmLockObject(self, obj)
    lw     gp, STACK_OFFSET_GP(sp)

    la     a2, .LdvmJitToInterpNoChain
    lw     a2, (a2)

    # Bail to interpreter - no chain [note - rPC still contains dPC]
#if defined(WITH_JIT_TUNING)
    li      a0, kHeavyweightMonitor
#endif
    jr      a2

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_MONITOR_ENTER_DEBUG
dvmCompiler_TEMPLATE_MONITOR_ENTER_DEBUG:
/* File: mips/TEMPLATE_MONITOR_ENTER_DEBUG.S */
    /*
     * To support deadlock prediction, this version of MONITOR_ENTER
     * will always call the heavyweight dvmLockObject, check for an
     * exception and then bail out to the interpreter.
     *
     * On entry:
     *    a0 - self pointer
     *    a1 - the object (which has already been null-checked by the caller)
     *    rPC - the Dalvik PC of the following instruction.
     *
     */
    la     a2, .LdvmLockObject
    lw     t9, (a2)
    sw     zero, offThread_inJitCodeCache(a0)   # record that we're not returning
    JALR(t9)                                    # dvmLockObject(self, obj)
    lw     gp, STACK_OFFSET_GP(sp)

    # test for exception
    lw     a1, offThread_exception(rSELF)
    beqz   a1, 1f
    sub    a0, rPC, 2                           # roll dPC back to this monitor instruction
    j      .LhandleException
1:
    # Bail to interpreter - no chain [note - rPC still contains dPC]
#if defined(WITH_JIT_TUNING)
    li     a0, kHeavyweightMonitor
#endif
    la     a2, .LdvmJitToInterpNoChain
    lw     a2, (a2)
    jr     a2

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_RESTORE_STATE
dvmCompiler_TEMPLATE_RESTORE_STATE:
/* File: mips/TEMPLATE_RESTORE_STATE.S */
    /*
     * This handler restores state following a selfVerification memory access.
     * On entry:
     *    a0 - offset from rSELF to the 1st element of the coreRegs save array.
     * Note: the following registers are not restored
     *       zero, AT, gp, sp, fp, ra
     */

    add     a0, a0, rSELF               # pointer to heapArgSpace.coreRegs[0]
#if 0
    lw      zero, r_ZERO*4(a0)          # restore zero
#endif
    .set noat
    lw      AT, r_AT*4(a0)              # restore at
    .set at
    lw      v0, r_V0*4(a0)              # restore v0
    lw      v1, r_V1*4(a0)              # restore v1

    lw      a1, r_A1*4(a0)              # restore a1
    lw      a2, r_A2*4(a0)              # restore a2
    lw      a3, r_A3*4(a0)              # restore a3

    lw      t0, r_T0*4(a0)              # restore t0
    lw      t1, r_T1*4(a0)              # restore t1
    lw      t2, r_T2*4(a0)              # restore t2
    lw      t3, r_T3*4(a0)              # restore t3
    lw      t4, r_T4*4(a0)              # restore t4
    lw      t5, r_T5*4(a0)              # restore t5
    lw      t6, r_T6*4(a0)              # restore t6
    lw      t7, r_T7*4(a0)              # restore t7

    lw      s0, r_S0*4(a0)              # restore s0
    lw      s1, r_S1*4(a0)              # restore s1
    lw      s2, r_S2*4(a0)              # restore s2
    lw      s3, r_S3*4(a0)              # restore s3
    lw      s4, r_S4*4(a0)              # restore s4
    lw      s5, r_S5*4(a0)              # restore s5
    lw      s6, r_S6*4(a0)              # restore s6
    lw      s7, r_S7*4(a0)              # restore s7

    lw      t8, r_T8*4(a0)              # restore t8
    lw      t9, r_T9*4(a0)              # restore t9

    lw      k0, r_K0*4(a0)              # restore k0
    lw      k1, r_K1*4(a0)              # restore k1

#if 0
    lw      gp, r_GP*4(a0)              # restore gp
    lw      sp, r_SP*4(a0)              # restore sp
    lw      fp, r_FP*4(a0)              # restore fp
    lw      ra, r_RA*4(a0)              # restore ra
#endif

/* #ifdef HARD_FLOAT */
#if 0
    lw      f0, fr0*4(a0)               # restore f0
    lw      f1, fr1*4(a0)               # restore f1
    lw      f2, fr2*4(a0)               # restore f2
    lw      f3, fr3*4(a0)               # restore f3
    lw      f4, fr4*4(a0)               # restore f4
    lw      f5, fr5*4(a0)               # restore f5
    lw      f6, fr6*4(a0)               # restore f6
    lw      f7, fr7*4(a0)               # restore f7
    lw      f8, fr8*4(a0)               # restore f8
    lw      f9, fr9*4(a0)               # restore f9
    lw      f10, fr10*4(a0)             # restore f10
    lw      f11, fr11*4(a0)             # restore f11
    lw      f12, fr12*4(a0)             # restore f12
    lw      f13, fr13*4(a0)             # restore f13
    lw      f14, fr14*4(a0)             # restore f14
    lw      f15, fr15*4(a0)             # restore f15
    lw      f16, fr16*4(a0)             # restore f16
    lw      f17, fr17*4(a0)             # restore f17
    lw      f18, fr18*4(a0)             # restore f18
    lw      f19, fr19*4(a0)             # restore f19
    lw      f20, fr20*4(a0)             # restore f20
    lw      f21, fr21*4(a0)             # restore f21
    lw      f22, fr22*4(a0)             # restore f22
    lw      f23, fr23*4(a0)             # restore f23
    lw      f24, fr24*4(a0)             # restore f24
    lw      f25, fr25*4(a0)             # restore f25
    lw      f26, fr26*4(a0)             # restore f26
    lw      f27, fr27*4(a0)             # restore f27
    lw      f28, fr28*4(a0)             # restore f28
    lw      f29, fr29*4(a0)             # restore f29
    lw      f30, fr30*4(a0)             # restore f30
    lw      f31, fr31*4(a0)             # restore f31
#endif

    lw      a0, r_A0*4(a0)              # restore a0 (last: a0 is the base pointer)
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_SAVE_STATE
dvmCompiler_TEMPLATE_SAVE_STATE:
/* File: mips/TEMPLATE_SAVE_STATE.S */
    /*
     * This handler performs a register save for selfVerification mode.
     * On entry:
     *    Top of stack + 4: a1 value to save
     *    Top of stack + 0: a0 value to save
     *    a0 - offset from rSELF to the beginning of the heapArgSpace record
     *    a1 - the value of regMap
     *
     * The handler must save regMap, r0-r31, and f0-f31 if there is an FPU, then
     * return with r0-r31 holding their original values (note that this means a0
     * and a1 must take the values on the stack - not the ones in those registers
     * on entry).
     * Finally, the two registers previously pushed must be popped.
     * Note: the following registers are not saved
     *       zero, AT, gp, sp, fp, ra
     */
    add     a0, a0, rSELF               # pointer to heapArgSpace
    sw      a1, 0(a0)                   # save regMap
    add     a0, a0, 4                   # pointer to coreRegs
#if 0
    sw      zero, r_ZERO*4(a0)          # save zero
#endif
    .set noat
    sw      AT, r_AT*4(a0)              # save at
    .set at
    sw      v0, r_V0*4(a0)              # save v0
    sw      v1, r_V1*4(a0)              # save v1

    lw      a1, 0(sp)                   # recover a0 value
    sw      a1, r_A0*4(a0)              # save a0
    lw      a1, 4(sp)                   # recover a1 value
    sw      a1, r_A1*4(a0)              # save a1
    sw      a2, r_A2*4(a0)              # save a2
    sw      a3, r_A3*4(a0)              # save a3

    sw      t0, r_T0*4(a0)              # save t0
    sw      t1, r_T1*4(a0)              # save t1
    sw      t2, r_T2*4(a0)              # save t2
    sw      t3, r_T3*4(a0)              # save t3
    sw      t4, r_T4*4(a0)              # save t4
    sw      t5, r_T5*4(a0)              # save t5
    sw      t6, r_T6*4(a0)              # save t6
    sw      t7, r_T7*4(a0)              # save t7

    sw      s0, r_S0*4(a0)              # save s0
    sw      s1, r_S1*4(a0)              # save s1
    sw      s2, r_S2*4(a0)              # save s2
    sw      s3, r_S3*4(a0)              # save s3
    sw      s4, r_S4*4(a0)              # save s4
    sw      s5, r_S5*4(a0)              # save s5
    sw      s6, r_S6*4(a0)              # save s6
    sw      s7, r_S7*4(a0)              # save s7

    sw      t8, r_T8*4(a0)              # save t8
    sw      t9, r_T9*4(a0)              # save t9

    sw      k0, r_K0*4(a0)              # save k0
    sw      k1, r_K1*4(a0)              # save k1

#if 0
    sw      gp, r_GP*4(a0)              # save gp
    sw      sp, r_SP*4(a0)              # save sp (need to adjust??? )
    sw      fp, r_FP*4(a0)              # save fp
    sw      ra, r_RA*4(a0)              # save ra
#endif

/* #ifdef HARD_FLOAT */
#if 0
    sw      f0, fr0*4(a0)               # save f0
    sw      f1, fr1*4(a0)               # save f1
    sw      f2, fr2*4(a0)               # save f2
    sw      f3, fr3*4(a0)               # save f3
    sw      f4, fr4*4(a0)               # save f4
    sw      f5, fr5*4(a0)               # save f5
    sw      f6, fr6*4(a0)               # save f6
    sw      f7, fr7*4(a0)               # save f7
    sw      f8, fr8*4(a0)               # save f8
    sw      f9, fr9*4(a0)               # save f9
    sw      f10, fr10*4(a0)             # save f10
    sw      f11, fr11*4(a0)             # save f11
    sw      f12, fr12*4(a0)             # save f12
    sw      f13, fr13*4(a0)             # save f13
    sw      f14, fr14*4(a0)             # save f14
    sw      f15, fr15*4(a0)             # save f15
    sw      f16, fr16*4(a0)             # save f16
    sw      f17, fr17*4(a0)             # save f17
    sw      f18, fr18*4(a0)             # save f18
    sw      f19, fr19*4(a0)             # save f19
    sw      f20, fr20*4(a0)             # save f20
    sw      f21, fr21*4(a0)             # save f21
    sw      f22, fr22*4(a0)             # save f22
    sw      f23, fr23*4(a0)             # save f23
    sw      f24, fr24*4(a0)             # save f24
    sw      f25, fr25*4(a0)             # save f25
    sw      f26, fr26*4(a0)             # save f26
    sw      f27, fr27*4(a0)             # save f27
    sw      f28, fr28*4(a0)             # save f28
    sw      f29, fr29*4(a0)             # save f29
    sw      f30, fr30*4(a0)             # save f30
    sw      f31, fr31*4(a0)             # save f31
#endif

    lw      a0, 0(sp)                   # recover a0 value
    lw      a1, 4(sp)                   # recover a1 value
    add     sp, sp, 8                   # pop the two saved registers
    RETURN

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_PERIODIC_PROFILING
dvmCompiler_TEMPLATE_PERIODIC_PROFILING:
/* File: mips/TEMPLATE_PERIODIC_PROFILING.S */
    /*
     * Increment profile counter for this trace, and decrement
     * sample counter.  If sample counter goes below zero, turn
     * off profiling.
     *
     * On entry
     * (ra-16) is the address of the pointer to the counter.  Note: on mips
     *    the counter pointer actually lives 16 bytes before the return target:
     *     - 4 bytes for prof count addr.
     *     - 4 bytes for chain cell offset (2 bytes, 32-bit aligned).
     *     - 4 bytes for call TEMPLATE_PERIODIC_PROFILING.
     *     - 4 bytes for call delay slot.
     */
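
     /*
      * In C terms (a sketch, not part of the build; profCountAddr is a
      * hypothetical name for the pointer loaded from (ra-16)):
      *
      *   u4 cnt  = *profCountAddr + 1;
      *   s4 left = *self->pProfileCountdown - 1;
      *   if (left < 0) {
      *       dvmJitTraceProfilingOff();    // sample budget exhausted
      *   } else {
      *       *profCountAddr = cnt;         // commit both counters
      *       *self->pProfileCountdown = left;
      *   }
      */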
     lw     a0, -16(ra)
     lw     a1, offThread_pProfileCountdown(rSELF)
     lw     a2, 0(a0)                   # get counter
     lw     a3, 0(a1)                   # get countdown timer
     addu   a2, 1
     sub    a3, 1                       # FIXME - bug in ARM code???
     bltz   a3, .LTEMPLATE_PERIODIC_PROFILING_disable_profiling
     sw     a2, 0(a0)
     sw     a3, 0(a1)
     RETURN
.LTEMPLATE_PERIODIC_PROFILING_disable_profiling:
     move   rTEMP, ra                   # preserve ra
     la     a0, dvmJitTraceProfilingOff
     JALR(a0)
     jr     rTEMP

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_RETURN_PROF
dvmCompiler_TEMPLATE_RETURN_PROF:
/* File: mips/TEMPLATE_RETURN_PROF.S */
#define TEMPLATE_INLINE_PROFILING
/* File: mips/TEMPLATE_RETURN.S */
    /*
     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
     * If the stored value in returnAddr
     * is non-zero, the caller is compiled by the JIT thus return to the
     * address in the code cache following the invoke instruction. Otherwise
     * return to the special dvmJitToInterpNoChain entry point.
     */
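
    /*
     * In C terms, the decision made below is roughly (a sketch, not part
     * of the build):
     *
     *   retAddr = saveArea->returnAddr;        // chaining cell, if any
     *   if (self->breakFlags) retAddr = NULL;  // force the interpreter
     *   self->inJitCodeCache = retAddr;
     *   if (retAddr) goto *retAddr;            // resume compiled caller
     *   else         dvmJitToInterpNoChain();  // NoProfile variant here
     */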
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a2 and ra
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(ra, 12)

    # a0=rSELF
    move    a0, rSELF
    la      t9, dvmFastMethodTraceExit
    JALR(t9)
    lw      gp, STACK_OFFSET_GP(sp)

    # restore a0-a2 and ra
    SCRATCH_LOAD(ra, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif
    SAVEAREA_FROM_FP(a0, rFP)           # a0<- saveArea (old)
    lw      t0, offStackSaveArea_prevFrame(a0)     # t0<- saveArea->prevFrame
    lbu     t1, offThread_breakFlags(rSELF)        # t1<- breakFlags
    lw      rPC, offStackSaveArea_savedPc(a0)      # rPC<- saveArea->savedPc
#if !defined(WITH_SELF_VERIFICATION)
    lw      t2,  offStackSaveArea_returnAddr(a0)   # t2<- chaining cell ret
#else
    move    t2, zero                               # disable chaining
#endif
    lw      a2, offStackSaveArea_method - sizeofStackSaveArea(t0)
                                                   # a2<- method we're returning to
#if !defined(WITH_SELF_VERIFICATION)
    beq     a2, zero, 1f                           # bail to interpreter
#else
    bne     a2, zero, 2f
    JALR(ra)                                       # punt to interpreter and compare state
    # DOUG: assume this does not return ???
2:
#endif
    la      t4, .LdvmJitToInterpNoChainNoProfile   # defined in footer.S
    lw      a1, (t4)
    move    rFP, t0                                # publish new FP
    beq     a2, zero, 4f
    lw      t0, offMethod_clazz(a2)                # t0<- method->clazz
4:

    sw      a2, offThread_method(rSELF)            # self->method = newSave->method
    lw      a0, offClassObject_pDvmDex(t0)         # a0<- method->clazz->pDvmDex
    sw      rFP, offThread_curFrame(rSELF)         # self->curFrame = fp
    add     rPC, rPC, 3*2                          # publish new rPC
    sw      a0, offThread_methodClassDex(rSELF)
    movn    t2, zero, t1                           # check the breakFlags and
                                                   # clear the chaining cell address
    sw      t2, offThread_inJitCodeCache(rSELF)    # in code cache or not
    beq     t2, zero, 3f                           # chaining cell exists?
    JALR(t2)                                       # jump to the chaining cell
    # DOUG: assume this does not return ???
3:
#if defined(WITH_JIT_TUNING)
    li      a0, kCallsiteInterpreted
#endif
    j       a1                                     # callsite is interpreted
1:
    sw      zero, offThread_inJitCodeCache(rSELF)  # reset inJitCodeCache
    SAVE_PC_TO_SELF()                              # SAVE_PC_FP_TO_SELF()
    SAVE_FP_TO_SELF()
    la      t4, .LdvmMterpStdBail                  # defined in footer.S
    lw      a2, (t4)
    move    a0, rSELF                              # Expecting rSELF in a0
    JALR(a2)                                       # exit the interpreter
    # DOUG: assume this does not return ???

#undef TEMPLATE_INLINE_PROFILING

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF
dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF:
/* File: mips/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S */
#define TEMPLATE_INLINE_PROFILING
/* File: mips/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
    /*
     * For polymorphic callsites - setup the Dalvik frame and load Dalvik PC
     * into rPC then jump to dvmJitToInterpNoChain to dispatch the
     * runtime-resolved callee.
     */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lh     a2, offMethod_outsSize(a0)             # a2<- methodToCall->outsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    sll    t6, a2, 2                              # multiply outsSize by 4 (4 bytes per reg)
    sub    t0, t0, t6                             # t0<- bottom (newsave-outsSize)
    bgeu   t0, t9, 1f                             # bottom < interpStackEnd?
    RETURN                                        # return to raise stack overflow excep.

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    lw     t9, offMethod_clazz(a0)                # t9<- methodToCall->clazz
    lw     t0, offMethod_accessFlags(a0)          # t0<- methodToCall->accessFlags
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    beqz   t8, 2f                                 # breakFlags != 0
    RETURN                                        # bail to the interpreter

2:
    and    t6, t0, ACC_NATIVE
    beqz   t6, 3f
#if !defined(WITH_SELF_VERIFICATION)
    j      .LinvokeNative
#else
    RETURN                                        # bail to the interpreter
#endif

3:
    # continue executing the next instruction through the interpreter
    la     t0, .LdvmJitToInterpTraceSelectNoChain # defined in footer.S
    lw     rTEMP, (t0)
    lw     a3, offClassObject_pDvmDex(t9)         # a3<- method->clazz->pDvmDex

    # Update "thread" values for the new method
    sw     a0, offThread_method(rSELF)            # self->method = methodToCall
    sw     a3, offThread_methodClassDex(rSELF)    # self->methodClassDex = ...
    move   rFP, a1                                # fp = newFp
    sw     rFP, offThread_curFrame(rSELF)         # self->curFrame = newFp
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a3
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(a3, 12)

    # a0=methodToCall, a1=rSELF
    move   a1, rSELF
    la     t9, dvmFastMethodTraceEnter
    JALR(t9)
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a3
    SCRATCH_LOAD(a3, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif

    # Start executing the callee
#if defined(WITH_JIT_TUNING)
    li     a0, kInlineCacheMiss
#endif
    jr     rTEMP                                  # dvmJitToInterpTraceSelectNoChain

#undef TEMPLATE_INLINE_PROFILING

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF
dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF:
/* File: mips/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S */
#define TEMPLATE_INLINE_PROFILING
/* File: mips/TEMPLATE_INVOKE_METHOD_CHAIN.S */
    /*
     * For monomorphic callsite, setup the Dalvik frame and return to the
     * compiled code through the link register (ra) to transfer control to the
     * callee method through a dedicated chaining cell.
     */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    # methodToCall is guaranteed to be non-native
.LinvokeChainProf:
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lh     a2, offMethod_outsSize(a0)             # a2<- methodToCall->outsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    add    t2, ra, 8                              # setup the punt-to-interp address
                                                  # 8 bytes skips branch and delay slot
    sll    t6, a2, 2                              # multiply outsSize by 4 (4 bytes per reg)
    sub    t0, t0, t6                             # t0<- bottom (newsave-outsSize)
    bgeu   t0, t9, 1f                             # bottom < interpStackEnd?
    jr     t2                                     # return to raise stack overflow excep.

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    lw     t9, offMethod_clazz(a0)                # t9<- methodToCall->clazz
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    beqz   t8, 2f                                 # breakFlags != 0
    jr     t2                                     # bail to the interpreter

2:
    lw     a3, offClassObject_pDvmDex(t9)         # a3<- methodToCall->clazz->pDvmDex

    # Update "thread" values for the new method
    sw     a0, offThread_method(rSELF)            # self->method = methodToCall
    sw     a3, offThread_methodClassDex(rSELF)    # self->methodClassDex = ...
    move   rFP, a1                                # fp = newFp
    sw     rFP, offThread_curFrame(rSELF)         # self->curFrame = newFp
#if defined(TEMPLATE_INLINE_PROFILING)
    # preserve a0-a2 and ra
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(ra, 12)

    move   a1, rSELF
    # a0=methodToCall, a1=rSELF
    la     t9, dvmFastMethodTraceEnter
    jalr   t9
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a2 and ra
    SCRATCH_LOAD(ra, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)
#endif
    RETURN                                        # return to the callee-chaining cell

#undef TEMPLATE_INLINE_PROFILING

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF
dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF:
/* File: mips/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S */
#define TEMPLATE_INLINE_PROFILING
/* File: mips/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
    /*
     * For polymorphic callsite, check whether the cached class pointer matches
     * the current one. If so setup the Dalvik frame and return to the
     * compiled code through the link register (ra) to transfer control to the
     * callee method through a dedicated chaining cell.
     *
     * The predicted chaining cell is declared in ArmLIR.h with the
     * following layout:
     *
     *  typedef struct PredictedChainingCell {
     *      u4 branch;
     *      u4 delay_slot;
     *      const ClassObject *clazz;
     *      const Method *method;
     *      u4 counter;
     *  } PredictedChainingCell;
     *
     * Upon returning to the callsite:
     *    - ra   : to branch to the chaining cell
     *    - ra+8 : to punt to the interpreter
     *    - ra+16: to fully resolve the callee and may rechain.
     *             a3 <- class
     */
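
    /*
     * In C terms, the guard below is roughly (a sketch, not part of the
     * build; chainedInvoke stands for the predicted chaining cell):
     *
     *   if (cell->clazz == this->clazz) goto chainedInvoke;  // prediction hit
     *   a1 = (cell->clazz != NULL) ? --self->icRechainCount : 0;
     *   return to (ra + 16);   // fully resolve the callee, maybe rechain
     */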
    # a0 = this, a1 = returnCell, a2 = predictedChainCell, rPC = dalvikCallsite
    lw      a3, offObject_clazz(a0)     # a3 <- this->class
    lw      rIBASE, 8(a2)               # rIBASE <- predictedChainCell->clazz
    lw      a0, 12(a2)                  # a0 <- predictedChainCell->method
    lw      t1, offThread_icRechainCount(rSELF)    # t1 <- shared rechainCount

#if defined(WITH_JIT_TUNING)
    la      rINST, .LdvmICHitCount
    #add     t2, t2, 1
    bne    a3, rIBASE, 1f
    nop
    lw      t2, 0(rINST)
    add     t2, t2, 1
    sw      t2, 0(rINST)
1:
    #add     t2, t2, 1
#endif
    beq     a3, rIBASE, .LinvokeChainProf       # branch if predicted chain is valid
    lw      rINST, offClassObject_vtable(a3)     # rINST <- this->class->vtable
    beqz    rIBASE, 2f                      # initialized class or not
    sub     a1, t1, 1                   # count--
    sw      a1, offThread_icRechainCount(rSELF)   # write back to InterpState
    b       3f
2:
    move    a1, zero
3:
    add     ra, ra, 16                  # return to fully-resolve landing pad
    /*
     * a1 <- count
     * a2 <- &predictedChainCell
     * a3 <- this->class
     * rPC <- dPC
     * rINST <- this->class->vtable
     */
    RETURN

#undef TEMPLATE_INLINE_PROFILING

/* ------------------------------ */
    .balign 4
    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF
dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF:
/* File: mips/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S */
#define TEMPLATE_INLINE_PROFILING
/* File: mips/TEMPLATE_INVOKE_METHOD_NATIVE.S */
    # a0 = methodToCall, a1 = returnCell, rPC = dalvikCallsite
    lh     t7, offMethod_registersSize(a0)        # t7<- methodToCall->regsSize
    lw     t9, offThread_interpStackEnd(rSELF)    # t9<- interpStackEnd
    lbu    t8, offThread_breakFlags(rSELF)        # t8<- breakFlags
    move   a3, a1                                 # a3<- returnCell
    SAVEAREA_FROM_FP(a1, rFP)                     # a1<- stack save area
    sll    t6, t7, 2                              # multiply regsSize by 4 (4 bytes per reg)
    sub    a1, a1, t6                             # a1<- newFp(old savearea-regsSize)
    SAVEAREA_FROM_FP(t0, a1)                      # t0<- stack save area
    bgeu   t0, t9, 1f                             # branch if newSaveArea >= interpStackEnd (fits)
    RETURN                                        # overflow: return so the stack overflow excep. is raised
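    /*
     * Frame math above, roughly in C (a sketch; SAVEAREA_FROM_FP subtracts
     * sizeofStackSaveArea from a frame pointer):
     *
     *   newFp = (u4 *)SAVEAREA_FROM_FP(fp) - methodToCall->registersSize;
     *   if (SAVEAREA_FROM_FP(newFp) < self->interpStackEnd)
     *       return;   // no room: the caller raises StackOverflowError
     */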

1:
    # a1 = newFP, a0 = methodToCall, a3 = returnCell, rPC = dalvikCallsite
    sw     rPC, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)
    sw     rPC, (offStackSaveArea_savedPc - sizeofStackSaveArea)(a1)
    lw     rPC, offMethod_insns(a0)               # rPC<- methodToCall->insns
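    # (rPC is stored twice: currentPc in the caller's save area for stack
    # walks and exception reporting, savedPc in the callee's for returns)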

    # set up newSaveArea
    sw     rFP, (offStackSaveArea_prevFrame - sizeofStackSaveArea)(a1)
    sw     a3, (offStackSaveArea_returnAddr - sizeofStackSaveArea)(a1)
    sw     a0, (offStackSaveArea_method - sizeofStackSaveArea)(a1)
    lw     rTEMP, offMethod_nativeFunc(a0)        # rTEMP<- method->nativeFunc
#if !defined(WITH_SELF_VERIFICATION)
    beqz   t8, 2f                                 # skip the bail if breakFlags == 0
    RETURN                                        # breakFlags != 0: bail to the interpreter
2:
#else
    RETURN                                        # bail to the interpreter unconditionally;
                                                  # self-verification handles natives there
#endif

    # go ahead and transfer control to the native code
    lw     t6, offThread_jniLocal_topCookie(rSELF)  # t6<- thread->localRef->...
    sw     a1, offThread_curFrame(rSELF)          # self->curFrame = newFp
    sw     zero, offThread_inJitCodeCache(rSELF)  # not in the jit code cache
    sw     t6, (offStackSaveArea_localRefCookie - sizeofStackSaveArea)(a1)
                                                  # newFp->localRefCookie=top
    SAVEAREA_FROM_FP(rBIX, a1)                    # rBIX<- new stack save area
    move   a2, a0                                 # a2<- methodToCall
    move   a0, a1                                 # a0<- newFp
    add    a1, rSELF, offThread_retval            # a1<- &retval
    move   a3, rSELF                              # a3<- self
#if defined(TEMPLATE_INLINE_PROFILING)
    # a2: methodToCall
    # preserve a0-a3
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(a3, 12)

    move   a0, a2
    move   a1, rSELF
    # a0=JNIMethod, a1=rSELF
    la      t9, dvmFastMethodTraceEnter
    JALR(t9)                                      # off to the native code
    lw     gp, STACK_OFFSET_GP(sp)

    # restore a0-a3
    SCRATCH_LOAD(a3, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)

    move   rOBJ, a2                               # save a2
#endif

    JALR(rTEMP)                                   # off to the native code
    lw     gp, STACK_OFFSET_GP(sp)

#if defined(TEMPLATE_INLINE_PROFILING)
    move   a0, rOBJ
    move   a1, rSELF
    # a0=JNIMethod, a1=rSELF
    la      t9, dvmFastNativeMethodTraceExit
    JALR(t9)
    lw     gp, STACK_OFFSET_GP(sp)
#endif

    # native return; rBIX=newSaveArea
    # equivalent to dvmPopJniLocals
    lw     a2, offStackSaveArea_returnAddr(rBIX)     # a2 = chaining cell ret addr
    lw     a0, offStackSaveArea_localRefCookie(rBIX) # a0<- saved->top
    lw     a1, offThread_exception(rSELF)            # check for exception
    sw     rFP, offThread_curFrame(rSELF)            # self->curFrame = fp
    sw     a0, offThread_jniLocal_topCookie(rSELF)   # new top <- old top
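    /*
     * Net effect of the frame/JNI bookkeeping above, in rough C (field
     * names follow the asm offsets rather than the exact C declarations):
     *
     *   self->curFrame = fp;                          // pop callee frame
     *   self->jniLocal.topCookie =
     *           newSaveArea->localRefCookie;          // == dvmPopJniLocals
     */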
    lw     a0, (offStackSaveArea_currentPc - sizeofStackSaveArea)(rFP)

    # a0 = dalvikCallsitePC
    bnez   a1, .LhandleException                     # handle exception if any

    sw     a2, offThread_inJitCodeCache(rSELF)       # set the mode properly
    beqz   a2, 3f
    jr     a2                                        # go if return chaining cell still exists

3:
    # continue executing the next instruction through the interpreter
    la     a1, .LdvmJitToInterpTraceSelectNoChain    # defined in footer.S
    lw     a1, (a1)
    add    rPC, a0, 3*2                              # reconstruct new rPC (skip the 3-code-unit invoke)

#if defined(WITH_JIT_TUNING)
    li     a0, kCallsiteInterpreted
#endif
    jr     a1

#undef TEMPLATE_INLINE_PROFILING

    .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
/* File: mips/footer.S */
/*
 * ===========================================================================
 *  Common subroutines and data
 * ===========================================================================
 */

    .section .data.rel.ro
    .align  4
.LinvokeNative:
    # Prep for the native call
    # a1 = newFP, a0 = methodToCall
    lw     t9, offThread_jniLocal_topCookie(rSELF)  # t9<- thread->localRef->...
    sw     zero, offThread_inJitCodeCache(rSELF)    # not in jit code cache
    sw     a1, offThread_curFrame(rSELF)            # self->curFrame = newFp
    sw     t9, (offStackSaveArea_localRefCookie - sizeofStackSaveArea)(a1)
                                                 # newFp->localRefCookie=top
    lhu     ra, offThread_subMode(rSELF)         # ra <- self->subMode
    SAVEAREA_FROM_FP(rBIX, a1)                   # rBIX<- new stack save area

    move    a2, a0                               # a2<- methodToCall
    move    a0, a1                               # a0<- newFp
    add     a1, rSELF, offThread_retval          # a1<- &retval
    move    a3, rSELF                            # a3<- self
    andi    ra, ra, kSubModeMethodTrace          # method trace mode active?
    beqz    ra, 121f                             # no: call the native code directly
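    # Gate above, in rough C: if ((self->subMode & kSubModeMethodTrace) == 0)
    # branch to 121f and call nativeFunc directly; otherwise fall through and
    # bracket the call with dvmFastMethodTraceEnter/dvmFastNativeMethodTraceExit.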
    # a2: methodToCall
    # preserve a0-a3
    SCRATCH_STORE(a0, 0)
    SCRATCH_STORE(a1, 4)
    SCRATCH_STORE(a2, 8)
    SCRATCH_STORE(a3, 12)
    move    rTEMP, a2                            # preserve a2

    move    a0, rTEMP
    move    a1, rSELF
    la      t9, dvmFastMethodTraceEnter
    JALR(t9)
    lw      gp, STACK_OFFSET_GP(sp)

    # restore a0-a3
    SCRATCH_LOAD(a3, 12)
    SCRATCH_LOAD(a2, 8)
    SCRATCH_LOAD(a1, 4)
    SCRATCH_LOAD(a0, 0)

    lw      t9, offMethod_nativeFunc(a2)
    JALR(t9)                                      # call methodToCall->nativeFunc
    lw      gp, STACK_OFFSET_GP(sp)

    move    a0, rTEMP
    move    a1, rSELF
    la      t9, dvmFastNativeMethodTraceExit
    JALR(t9)
    lw      gp, STACK_OFFSET_GP(sp)
    b       212f

121:
    lw      t9, offMethod_nativeFunc(a2)
    JALR(t9)                                     # call methodToCall->nativeFunc
    lw      gp, STACK_OFFSET_GP(sp)

212:
    # native return; rBIX=newSaveArea
    # equivalent to dvmPopJniLocals
    lw     a2, offStackSaveArea_returnAddr(rBIX)     # a2 = chaining cell ret addr
    lw     a0, offStackSaveArea_localRefCookie(rBIX) # a0<- saved->top
    lw     a1, offThread_exception(rSELF)            # check for exception
    sw     rFP, offThread_curFrame(rSELF)            # self->curFrame = fp
    sw     a0, offThread_jniLocal_topCookie(rSELF)   # new top <- old top
    lw     a0, offStackSaveArea_savedPc(rBIX)        # a0<- saved dalvik PC (the callsite)

    # a0 = dalvikCallsitePC
    bnez   a1, .LhandleException                     # handle exception if any

    sw     a2, offThread_inJitCodeCache(rSELF)       # set the mode properly
    beqz   a2, 3f
    jr     a2                                        # go if return chaining cell still exists

3:
    # continue executing the next instruction through the interpreter
    la     a1, .LdvmJitToInterpTraceSelectNoChain    # defined in footer.S
    lw     a1, (a1)
    add    rPC, a0, 3*2                              # reconstruct new rPC (skip the 3-code-unit invoke)

#if defined(WITH_JIT_TUNING)
    li     a0, kCallsiteInterpreted
#endif
    jr     a1


/*
 * On entry:
 * a0  Faulting Dalvik PC
 */
.LhandleException:
#if defined(WITH_SELF_VERIFICATION)
    la     t0, .LdeadFood
    lw     t0, (t0)                  # should never get here under self-verification
    jr     t0                        # jump to 0xdeadf00d to force a fault
.LdeadFood:
    .word   0xdeadf00d
#endif
    sw     zero, offThread_inJitCodeCache(rSELF)  # in interpreter land
    la     a1, .LdvmMterpCommonExceptionThrown  # PIC way of getting &func
    lw     a1, (a1)
    la     rIBASE, .LdvmAsmInstructionStart     # PIC way of getting &func
    lw     rIBASE, (rIBASE)
    move   rPC, a0                              # reload the faulting Dalvik address
    jr     a1                                   # branch to dvmMterpCommonExceptionThrown
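
/*
 * Literal pool: each .LdvmXxx word below holds the address of the named
 * handler. Code fetches a target with "la" of the pool slot followed by a
 * "lw" through it (the "PIC way" noted above), so the templates remain
 * position-independent when copied into the JIT code cache.
 */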

    .align  4
.LdvmAsmInstructionStart:
    .word   dvmAsmInstructionStart
.LdvmJitToInterpNoChainNoProfile:
    .word   dvmJitToInterpNoChainNoProfile
.LdvmJitToInterpTraceSelectNoChain:
    .word   dvmJitToInterpTraceSelectNoChain
.LdvmJitToInterpNoChain:
    .word   dvmJitToInterpNoChain
.LdvmMterpStdBail:
    .word   dvmMterpStdBail
.LdvmMterpCommonExceptionThrown:
    .word   dvmMterpCommonExceptionThrown
.LdvmLockObject:
    .word   dvmLockObject
#if defined(WITH_JIT_TUNING)
.LdvmICHitCount:
    .word   gDvmICHitCount
#endif
#if defined(WITH_SELF_VERIFICATION)
.LdvmSelfVerificationMemOpDecode:
    .word   dvmSelfVerificationMemOpDecode
#endif

    .global dmvCompilerTemplateEnd
dmvCompilerTemplateEnd:

#endif /* WITH_JIT */