普通文本  |  4618行  |  167.95 KB

// Copyright 2013 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/code-generator.h"

#include "src/assembler-inl.h"
#include "src/callable.h"
#include "src/compiler/code-generator-impl.h"
#include "src/compiler/gap-resolver.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/osr.h"
#include "src/frame-constants.h"
#include "src/frames.h"
#include "src/heap/heap-inl.h"
#include "src/ia32/assembler-ia32.h"
#include "src/ia32/macro-assembler-ia32.h"
#include "src/optimized-compilation-info.h"
#include "src/wasm/wasm-code-manager.h"
#include "src/wasm/wasm-objects.h"

namespace v8 {
namespace internal {
namespace compiler {

#define __ tasm()->

#define kScratchDoubleReg xmm0


// Adds IA-32 specific methods for decoding operands.
class IA32OperandConverter : public InstructionOperandConverter {
 public:
  IA32OperandConverter(CodeGenerator* gen, Instruction* instr)
      : InstructionOperandConverter(gen, instr) {}

  Operand InputOperand(size_t index, int extra = 0) {
    return ToOperand(instr_->InputAt(index), extra);
  }

  Immediate InputImmediate(size_t index) {
    return ToImmediate(instr_->InputAt(index));
  }

  Operand OutputOperand() { return ToOperand(instr_->Output()); }

  Operand ToOperand(InstructionOperand* op, int extra = 0) {
    if (op->IsRegister()) {
      DCHECK_EQ(0, extra);
      return Operand(ToRegister(op));
    } else if (op->IsFPRegister()) {
      DCHECK_EQ(0, extra);
      return Operand(ToDoubleRegister(op));
    }
    DCHECK(op->IsStackSlot() || op->IsFPStackSlot());
    return SlotToOperand(AllocatedOperand::cast(op)->index(), extra);
  }

  Operand SlotToOperand(int slot, int extra = 0) {
    FrameOffset offset = frame_access_state()->GetFrameOffset(slot);
    return Operand(offset.from_stack_pointer() ? esp : ebp,
                   offset.offset() + extra);
  }

  Immediate ToImmediate(InstructionOperand* operand) {
    Constant constant = ToConstant(operand);
    if (constant.type() == Constant::kInt32 &&
        RelocInfo::IsWasmReference(constant.rmode())) {
      return Immediate(static_cast<Address>(constant.ToInt32()),
                       constant.rmode());
    }
    switch (constant.type()) {
      case Constant::kInt32:
        return Immediate(constant.ToInt32());
      case Constant::kFloat32:
        return Immediate::EmbeddedNumber(constant.ToFloat32());
      case Constant::kFloat64:
        return Immediate::EmbeddedNumber(constant.ToFloat64().value());
      case Constant::kExternalReference:
        return Immediate(constant.ToExternalReference());
      case Constant::kHeapObject:
        return Immediate(constant.ToHeapObject());
      case Constant::kInt64:
        break;
      case Constant::kRpoNumber:
        return Immediate::CodeRelativeOffset(ToLabel(operand));
    }
    UNREACHABLE();
  }

  static size_t NextOffset(size_t* offset) {
    size_t i = *offset;
    (*offset)++;
    return i;
  }

  static ScaleFactor ScaleFor(AddressingMode one, AddressingMode mode) {
    STATIC_ASSERT(0 == static_cast<int>(times_1));
    STATIC_ASSERT(1 == static_cast<int>(times_2));
    STATIC_ASSERT(2 == static_cast<int>(times_4));
    STATIC_ASSERT(3 == static_cast<int>(times_8));
    int scale = static_cast<int>(mode - one);
    DCHECK(scale >= 0 && scale < 4);
    return static_cast<ScaleFactor>(scale);
  }

  Operand MemoryOperand(size_t* offset) {
    AddressingMode mode = AddressingModeField::decode(instr_->opcode());
    switch (mode) {
      case kMode_MR: {
        Register base = InputRegister(NextOffset(offset));
        int32_t disp = 0;
        return Operand(base, disp);
      }
      case kMode_MRI: {
        Register base = InputRegister(NextOffset(offset));
        Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
        return Operand(base, ctant.ToInt32(), ctant.rmode());
      }
      case kMode_MR1:
      case kMode_MR2:
      case kMode_MR4:
      case kMode_MR8: {
        Register base = InputRegister(NextOffset(offset));
        Register index = InputRegister(NextOffset(offset));
        ScaleFactor scale = ScaleFor(kMode_MR1, mode);
        int32_t disp = 0;
        return Operand(base, index, scale, disp);
      }
      case kMode_MR1I:
      case kMode_MR2I:
      case kMode_MR4I:
      case kMode_MR8I: {
        Register base = InputRegister(NextOffset(offset));
        Register index = InputRegister(NextOffset(offset));
        ScaleFactor scale = ScaleFor(kMode_MR1I, mode);
        Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
        return Operand(base, index, scale, ctant.ToInt32(), ctant.rmode());
      }
      case kMode_M1:
      case kMode_M2:
      case kMode_M4:
      case kMode_M8: {
        Register index = InputRegister(NextOffset(offset));
        ScaleFactor scale = ScaleFor(kMode_M1, mode);
        int32_t disp = 0;
        return Operand(index, scale, disp);
      }
      case kMode_M1I:
      case kMode_M2I:
      case kMode_M4I:
      case kMode_M8I: {
        Register index = InputRegister(NextOffset(offset));
        ScaleFactor scale = ScaleFor(kMode_M1I, mode);
        Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
        return Operand(index, scale, ctant.ToInt32(), ctant.rmode());
      }
      case kMode_MI: {
        Constant ctant = ToConstant(instr_->InputAt(NextOffset(offset)));
        return Operand(ctant.ToInt32(), ctant.rmode());
      }
      case kMode_None:
        UNREACHABLE();
    }
    UNREACHABLE();
  }

  Operand MemoryOperand(size_t first_input = 0) {
    return MemoryOperand(&first_input);
  }

  Operand NextMemoryOperand(size_t offset = 0) {
    AddressingMode mode = AddressingModeField::decode(instr_->opcode());
    Register base = InputRegister(NextOffset(&offset));
    const int32_t disp = 4;
    if (mode == kMode_MR1) {
      Register index = InputRegister(NextOffset(&offset));
      ScaleFactor scale = ScaleFor(kMode_MR1, kMode_MR1);
      return Operand(base, index, scale, disp);
    } else if (mode == kMode_MRI) {
      Constant ctant = ToConstant(instr_->InputAt(NextOffset(&offset)));
      return Operand(base, ctant.ToInt32() + disp, ctant.rmode());
    } else {
      UNREACHABLE();
    }
  }
};


namespace {

bool HasImmediateInput(Instruction* instr, size_t index) {
  return instr->InputAt(index)->IsImmediate();
}

class OutOfLineLoadFloat32NaN final : public OutOfLineCode {
 public:
  OutOfLineLoadFloat32NaN(CodeGenerator* gen, XMMRegister result)
      : OutOfLineCode(gen), result_(result) {}

  void Generate() final {
    __ xorps(result_, result_);
    __ divss(result_, result_);
  }

 private:
  XMMRegister const result_;
};

class OutOfLineLoadFloat64NaN final : public OutOfLineCode {
 public:
  OutOfLineLoadFloat64NaN(CodeGenerator* gen, XMMRegister result)
      : OutOfLineCode(gen), result_(result) {}

  void Generate() final {
    __ xorpd(result_, result_);
    __ divsd(result_, result_);
  }

 private:
  XMMRegister const result_;
};

class OutOfLineTruncateDoubleToI final : public OutOfLineCode {
 public:
  OutOfLineTruncateDoubleToI(CodeGenerator* gen, Register result,
                             XMMRegister input, StubCallMode stub_mode)
      : OutOfLineCode(gen),
        result_(result),
        input_(input),
        stub_mode_(stub_mode),
        isolate_(gen->isolate()),
        zone_(gen->zone()) {}

  void Generate() final {
    __ sub(esp, Immediate(kDoubleSize));
    __ movsd(MemOperand(esp, 0), input_);
    if (stub_mode_ == StubCallMode::kCallWasmRuntimeStub) {
      // A direct call to a wasm runtime stub defined in this module.
      // Just encode the stub index. This will be patched at relocation.
      __ wasm_call(wasm::WasmCode::kDoubleToI, RelocInfo::WASM_STUB_CALL);
    } else {
      __ Call(BUILTIN_CODE(isolate_, DoubleToI), RelocInfo::CODE_TARGET);
    }
    __ mov(result_, MemOperand(esp, 0));
    __ add(esp, Immediate(kDoubleSize));
  }

 private:
  Register const result_;
  XMMRegister const input_;
  StubCallMode stub_mode_;
  Isolate* isolate_;
  Zone* zone_;
};


class OutOfLineRecordWrite final : public OutOfLineCode {
 public:
  OutOfLineRecordWrite(CodeGenerator* gen, Register object, Operand operand,
                       Register value, Register scratch0, Register scratch1,
                       RecordWriteMode mode)
      : OutOfLineCode(gen),
        object_(object),
        operand_(operand),
        value_(value),
        scratch0_(scratch0),
        scratch1_(scratch1),
        mode_(mode),
        zone_(gen->zone()) {}

  void SaveRegisters(RegList registers) {
    DCHECK_LT(0, NumRegs(registers));
    for (int i = 0; i < Register::kNumRegisters; ++i) {
      if ((registers >> i) & 1u) {
        __ push(Register::from_code(i));
      }
    }
  }

  void RestoreRegisters(RegList registers) {
    DCHECK_LT(0, NumRegs(registers));
    for (int i = Register::kNumRegisters - 1; i >= 0; --i) {
      if ((registers >> i) & 1u) {
        __ pop(Register::from_code(i));
      }
    }
  }

  void Generate() final {
    if (mode_ > RecordWriteMode::kValueIsPointer) {
      __ JumpIfSmi(value_, exit());
    }
    __ CheckPageFlag(value_, scratch0_,
                     MemoryChunk::kPointersToHereAreInterestingMask, zero,
                     exit());
    __ lea(scratch1_, operand_);
    RememberedSetAction const remembered_set_action =
        mode_ > RecordWriteMode::kValueIsMap ? EMIT_REMEMBERED_SET
                                             : OMIT_REMEMBERED_SET;
    SaveFPRegsMode const save_fp_mode =
        frame()->DidAllocateDoubleRegisters() ? kSaveFPRegs : kDontSaveFPRegs;
    __ CallRecordWriteStub(object_, scratch1_, remembered_set_action,
                           save_fp_mode);
  }

 private:
  Register const object_;
  Operand const operand_;
  Register const value_;
  Register const scratch0_;
  Register const scratch1_;
  RecordWriteMode const mode_;
  Zone* zone_;
};

void MoveOperandIfAliasedWithPoisonRegister(Instruction* call_instruction,
                                            CodeGenerator* gen) {
  IA32OperandConverter i(gen, call_instruction);
  int const poison_index = i.InputInt32(1);
  if (poison_index == -1) {
    // No aliasing -> nothing to move.
    return;
  }

  InstructionOperand* op = call_instruction->InputAt(poison_index);
  if (op->IsImmediate() || op->IsConstant()) {
    gen->tasm()->mov(kSpeculationPoisonRegister, i.ToImmediate(op));
  } else {
    gen->tasm()->mov(kSpeculationPoisonRegister, i.InputOperand(poison_index));
  }
}

void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
                                   InstructionCode opcode, Instruction* instr,
                                   IA32OperandConverter& i) {
  const MemoryAccessMode access_mode =
      static_cast<MemoryAccessMode>(MiscField::decode(opcode));
  if (access_mode == kMemoryAccessPoisoned) {
    Register value = i.OutputRegister();
    codegen->tasm()->and_(value, kSpeculationPoisonRegister);
  }
}

}  // namespace

#define ASSEMBLE_COMPARE(asm_instr)                                   \
  do {                                                                \
    if (AddressingModeField::decode(instr->opcode()) != kMode_None) { \
      size_t index = 0;                                               \
      Operand left = i.MemoryOperand(&index);                         \
      if (HasImmediateInput(instr, index)) {                          \
        __ asm_instr(left, i.InputImmediate(index));                  \
      } else {                                                        \
        __ asm_instr(left, i.InputRegister(index));                   \
      }                                                               \
    } else {                                                          \
      if (HasImmediateInput(instr, 1)) {                              \
        if (instr->InputAt(0)->IsRegister()) {                        \
          __ asm_instr(i.InputRegister(0), i.InputImmediate(1));      \
        } else {                                                      \
          __ asm_instr(i.InputOperand(0), i.InputImmediate(1));       \
        }                                                             \
      } else {                                                        \
        if (instr->InputAt(1)->IsRegister()) {                        \
          __ asm_instr(i.InputRegister(0), i.InputRegister(1));       \
        } else {                                                      \
          __ asm_instr(i.InputRegister(0), i.InputOperand(1));        \
        }                                                             \
      }                                                               \
    }                                                                 \
  } while (0)

#define ASSEMBLE_IEEE754_BINOP(name)                                     \
  do {                                                                   \
    /* Pass two doubles as arguments on the stack. */                    \
    __ PrepareCallCFunction(4, eax);                                     \
    __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
    __ movsd(Operand(esp, 1 * kDoubleSize), i.InputDoubleRegister(1));   \
    __ CallCFunction(ExternalReference::ieee754_##name##_function(), 4); \
    /* Return value is in st(0) on ia32. */                              \
    /* Store it into the result register. */                             \
    __ sub(esp, Immediate(kDoubleSize));                                 \
    __ fstp_d(Operand(esp, 0));                                          \
    __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
    __ add(esp, Immediate(kDoubleSize));                                 \
  } while (false)

#define ASSEMBLE_IEEE754_UNOP(name)                                      \
  do {                                                                   \
    /* Pass one double as argument on the stack. */                      \
    __ PrepareCallCFunction(2, eax);                                     \
    __ movsd(Operand(esp, 0 * kDoubleSize), i.InputDoubleRegister(0));   \
    __ CallCFunction(ExternalReference::ieee754_##name##_function(), 2); \
    /* Return value is in st(0) on ia32. */                              \
    /* Store it into the result register. */                             \
    __ sub(esp, Immediate(kDoubleSize));                                 \
    __ fstp_d(Operand(esp, 0));                                          \
    __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));                 \
    __ add(esp, Immediate(kDoubleSize));                                 \
  } while (false)

#define ASSEMBLE_BINOP(asm_instr)                                     \
  do {                                                                \
    if (AddressingModeField::decode(instr->opcode()) != kMode_None) { \
      size_t index = 1;                                               \
      Operand right = i.MemoryOperand(&index);                        \
      __ asm_instr(i.InputRegister(0), right);                        \
    } else {                                                          \
      if (HasImmediateInput(instr, 1)) {                              \
        __ asm_instr(i.InputOperand(0), i.InputImmediate(1));         \
      } else {                                                        \
        __ asm_instr(i.InputRegister(0), i.InputOperand(1));          \
      }                                                               \
    }                                                                 \
  } while (0)

#define ASSEMBLE_ATOMIC_BINOP(bin_inst, mov_inst, cmpxchg_inst) \
  do {                                                          \
    Label binop;                                                \
    __ bind(&binop);                                            \
    __ mov_inst(eax, i.MemoryOperand(1));                       \
    __ Move(i.TempRegister(0), eax);                            \
    __ bin_inst(i.TempRegister(0), i.InputRegister(0));         \
    __ lock();                                                  \
    __ cmpxchg_inst(i.MemoryOperand(1), i.TempRegister(0));     \
    __ j(not_equal, &binop);                                    \
  } while (false)

#define ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)         \
  do {                                                   \
    Label binop;                                         \
    __ bind(&binop);                                     \
    __ mov(i.OutputRegister(0), i.MemoryOperand(2));     \
    __ mov(i.OutputRegister(1), i.NextMemoryOperand(2)); \
    __ push(i.InputRegister(0));                         \
    __ push(i.InputRegister(1));                         \
    __ instr1(i.InputRegister(0), i.OutputRegister(0));  \
    __ instr2(i.InputRegister(1), i.OutputRegister(1));  \
    __ lock();                                           \
    __ cmpxchg8b(i.MemoryOperand(2));                    \
    __ pop(i.InputRegister(1));                          \
    __ pop(i.InputRegister(0));                          \
    __ j(not_equal, &binop);                             \
  } while (false);

#define ASSEMBLE_MOVX(mov_instr)                            \
  do {                                                      \
    if (instr->addressing_mode() != kMode_None) {           \
      __ mov_instr(i.OutputRegister(), i.MemoryOperand());  \
    } else if (instr->InputAt(0)->IsRegister()) {           \
      __ mov_instr(i.OutputRegister(), i.InputRegister(0)); \
    } else {                                                \
      __ mov_instr(i.OutputRegister(), i.InputOperand(0));  \
    }                                                       \
  } while (0)

#define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                         \
  do {                                                               \
    XMMRegister src0 = i.InputSimd128Register(0);                    \
    Operand src1 = i.InputOperand(instr->InputCount() == 2 ? 1 : 0); \
    if (CpuFeatures::IsSupported(AVX)) {                             \
      CpuFeatureScope avx_scope(tasm(), AVX);                        \
      __ v##opcode(i.OutputSimd128Register(), src0, src1);           \
    } else {                                                         \
      DCHECK_EQ(i.OutputSimd128Register(), src0);                    \
      __ opcode(i.OutputSimd128Register(), src1);                    \
    }                                                                \
  } while (false)

#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)               \
  if (CpuFeatures::IsSupported(AVX)) {                                 \
    CpuFeatureScope avx_scope(tasm(), AVX);                            \
    __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
                 i.InputOperand(1), imm);                              \
  } else {                                                             \
    CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
    DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
    __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
  }

void CodeGenerator::AssembleDeconstructFrame() {
  __ mov(esp, ebp);
  __ pop(ebp);
}

void CodeGenerator::AssemblePrepareTailCall() {
  if (frame_access_state()->has_frame()) {
    __ mov(ebp, MemOperand(ebp, 0));
  }
  frame_access_state()->SetFrameAccessToSP();
}

void CodeGenerator::AssemblePopArgumentsAdaptorFrame(Register args_reg,
                                                     Register, Register,
                                                     Register) {
  // There are not enough temp registers left on ia32 for a call instruction
  // so we pick some scratch registers and save/restore them manually here.
  int scratch_count = 3;
  Register scratch1 = ebx;
  Register scratch2 = ecx;
  Register scratch3 = edx;
  DCHECK(!AreAliased(args_reg, scratch1, scratch2, scratch3));
  Label done;

  // Check if current frame is an arguments adaptor frame.
  __ cmp(Operand(ebp, StandardFrameConstants::kContextOffset),
         Immediate(StackFrame::TypeToMarker(StackFrame::ARGUMENTS_ADAPTOR)));
  __ j(not_equal, &done, Label::kNear);

  __ push(scratch1);
  __ push(scratch2);
  __ push(scratch3);

  // Load arguments count from current arguments adaptor frame (note, it
  // does not include receiver).
  Register caller_args_count_reg = scratch1;
  __ mov(caller_args_count_reg,
         Operand(ebp, ArgumentsAdaptorFrameConstants::kLengthOffset));
  __ SmiUntag(caller_args_count_reg);

  ParameterCount callee_args_count(args_reg);
  __ PrepareForTailCall(callee_args_count, caller_args_count_reg, scratch2,
                        scratch3, scratch_count);
  __ pop(scratch3);
  __ pop(scratch2);
  __ pop(scratch1);

  __ bind(&done);
}

namespace {

void AdjustStackPointerForTailCall(TurboAssembler* tasm,
                                   FrameAccessState* state,
                                   int new_slot_above_sp,
                                   bool allow_shrinkage = true) {
  int current_sp_offset = state->GetSPToFPSlotCount() +
                          StandardFrameConstants::kFixedSlotCountAboveFp;
  int stack_slot_delta = new_slot_above_sp - current_sp_offset;
  if (stack_slot_delta > 0) {
    tasm->sub(esp, Immediate(stack_slot_delta * kPointerSize));
    state->IncreaseSPDelta(stack_slot_delta);
  } else if (allow_shrinkage && stack_slot_delta < 0) {
    tasm->add(esp, Immediate(-stack_slot_delta * kPointerSize));
    state->IncreaseSPDelta(stack_slot_delta);
  }
}

}  // namespace

void CodeGenerator::AssembleTailCallBeforeGap(Instruction* instr,
                                              int first_unused_stack_slot) {
  CodeGenerator::PushTypeFlags flags(kImmediatePush | kScalarPush);
  ZoneVector<MoveOperands*> pushes(zone());
  GetPushCompatibleMoves(instr, flags, &pushes);

  if (!pushes.empty() &&
      (LocationOperand::cast(pushes.back()->destination()).index() + 1 ==
       first_unused_stack_slot)) {
    IA32OperandConverter g(this, instr);
    for (auto move : pushes) {
      LocationOperand destination_location(
          LocationOperand::cast(move->destination()));
      InstructionOperand source(move->source());
      AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                    destination_location.index());
      if (source.IsStackSlot()) {
        LocationOperand source_location(LocationOperand::cast(source));
        __ push(g.SlotToOperand(source_location.index()));
      } else if (source.IsRegister()) {
        LocationOperand source_location(LocationOperand::cast(source));
        __ push(source_location.GetRegister());
      } else if (source.IsImmediate()) {
        __ push(Immediate(ImmediateOperand::cast(source).inline_value()));
      } else {
        // Pushes of non-scalar data types is not supported.
        UNIMPLEMENTED();
      }
      frame_access_state()->IncreaseSPDelta(1);
      move->Eliminate();
    }
  }
  AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                first_unused_stack_slot, false);
}

void CodeGenerator::AssembleTailCallAfterGap(Instruction* instr,
                                             int first_unused_stack_slot) {
  AdjustStackPointerForTailCall(tasm(), frame_access_state(),
                                first_unused_stack_slot);
}

// Check that {kJavaScriptCallCodeStartRegister} is correct.
void CodeGenerator::AssembleCodeStartRegisterCheck() {
  __ push(eax);  // Push eax so we can use it as a scratch register.
  __ ComputeCodeStartAddress(eax);
  __ cmp(eax, kJavaScriptCallCodeStartRegister);
  __ Assert(equal, AbortReason::kWrongFunctionCodeStart);
  __ pop(eax);  // Restore eax.
}

// Check if the code object is marked for deoptimization. If it is, then it
// jumps to the CompileLazyDeoptimizedCode builtin. In order to do this we need
// to:
//    1. read from memory the word that contains that bit, which can be found in
//       the flags in the referenced {CodeDataContainer} object;
//    2. test kMarkedForDeoptimizationBit in those flags; and
//    3. if it is not zero then it jumps to the builtin.
void CodeGenerator::BailoutIfDeoptimized() {
  int offset = Code::kCodeDataContainerOffset - Code::kHeaderSize;
  __ mov(ebx, Operand(kJavaScriptCallCodeStartRegister, offset));
  __ test(FieldOperand(ebx, CodeDataContainer::kKindSpecificFlagsOffset),
          Immediate(1 << Code::kMarkedForDeoptimizationBit));
  // Ensure we're not serializing (otherwise we'd need to use an indirection to
  // access the builtin below).
  DCHECK(!isolate()->ShouldLoadConstantsFromRootList());
  Handle<Code> code = isolate()->builtins()->builtin_handle(
      Builtins::kCompileLazyDeoptimizedCode);
  __ j(not_zero, code, RelocInfo::CODE_TARGET);
}

void CodeGenerator::GenerateSpeculationPoisonFromCodeStartRegister() {
  __ push(eax);  // Push eax so we can use it as a scratch register.

  // Set a mask which has all bits set in the normal case, but has all
  // bits cleared if we are speculatively executing the wrong PC.
  __ ComputeCodeStartAddress(eax);
  __ mov(kSpeculationPoisonRegister, Immediate(0));
  __ cmp(kJavaScriptCallCodeStartRegister, eax);
  __ mov(eax, Immediate(-1));
  __ cmov(equal, kSpeculationPoisonRegister, eax);

  __ pop(eax);  // Restore eax.
}

void CodeGenerator::AssembleRegisterArgumentPoisoning() {
  __ and_(kJSFunctionRegister, kSpeculationPoisonRegister);
  __ and_(kContextRegister, kSpeculationPoisonRegister);
  __ and_(esp, kSpeculationPoisonRegister);
}

// Assembles an instruction after register allocation, producing machine code.
CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    Instruction* instr) {
  IA32OperandConverter i(this, instr);
  InstructionCode opcode = instr->opcode();
  ArchOpcode arch_opcode = ArchOpcodeField::decode(opcode);
  switch (arch_opcode) {
    case kArchCallCodeObject: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      if (HasImmediateInput(instr, 0)) {
        Handle<Code> code = i.InputCode(0);
        __ call(code, RelocInfo::CODE_TARGET);
      } else {
        Register reg = i.InputRegister(0);
        DCHECK_IMPLIES(
            HasCallDescriptorFlag(instr, CallDescriptor::kFixedTargetRegister),
            reg == kJavaScriptCallCodeStartRegister);
        __ add(reg, Immediate(Code::kHeaderSize - kHeapObjectTag));
        if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
          __ RetpolineCall(reg);
        } else {
          __ call(reg);
        }
      }
      RecordCallPosition(instr);
      frame_access_state()->ClearSPDelta();
      break;
    }
    case kArchCallWasmFunction: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      if (HasImmediateInput(instr, 0)) {
        Constant constant = i.ToConstant(instr->InputAt(0));
        Address wasm_code = static_cast<Address>(constant.ToInt32());
        if (DetermineStubCallMode() == StubCallMode::kCallWasmRuntimeStub) {
          __ wasm_call(wasm_code, constant.rmode());
        } else {
          if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
            __ RetpolineCall(wasm_code, constant.rmode());
          } else {
            __ call(wasm_code, constant.rmode());
          }
        }
      } else {
        Register reg = i.InputRegister(0);
        if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
          __ RetpolineCall(reg);
        } else {
          __ call(reg);
        }
      }
      RecordCallPosition(instr);
      frame_access_state()->ClearSPDelta();
      break;
    }
    case kArchTailCallCodeObjectFromJSFunction:
    case kArchTailCallCodeObject: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      if (arch_opcode == kArchTailCallCodeObjectFromJSFunction) {
        AssemblePopArgumentsAdaptorFrame(kJavaScriptCallArgCountRegister,
                                         no_reg, no_reg, no_reg);
      }
      if (HasImmediateInput(instr, 0)) {
        Handle<Code> code = i.InputCode(0);
        __ jmp(code, RelocInfo::CODE_TARGET);
      } else {
        Register reg = i.InputRegister(0);
        DCHECK_IMPLIES(
            HasCallDescriptorFlag(instr, CallDescriptor::kFixedTargetRegister),
            reg == kJavaScriptCallCodeStartRegister);
        __ add(reg, Immediate(Code::kHeaderSize - kHeapObjectTag));
        if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
          __ RetpolineJump(reg);
        } else {
          __ jmp(reg);
        }
      }
      frame_access_state()->ClearSPDelta();
      frame_access_state()->SetFrameAccessToDefault();
      break;
    }
    case kArchTailCallWasm: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      if (HasImmediateInput(instr, 0)) {
        Constant constant = i.ToConstant(instr->InputAt(0));
        Address wasm_code = static_cast<Address>(constant.ToInt32());
        __ jmp(wasm_code, constant.rmode());
      } else {
        Register reg = i.InputRegister(0);
        if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
          __ RetpolineJump(reg);
        } else {
          __ jmp(reg);
        }
      }
      frame_access_state()->ClearSPDelta();
      frame_access_state()->SetFrameAccessToDefault();
      break;
    }
    case kArchTailCallAddress: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      CHECK(!HasImmediateInput(instr, 0));
      Register reg = i.InputRegister(0);
      DCHECK_IMPLIES(
          HasCallDescriptorFlag(instr, CallDescriptor::kFixedTargetRegister),
          reg == kJavaScriptCallCodeStartRegister);
      if (HasCallDescriptorFlag(instr, CallDescriptor::kRetpoline)) {
        __ RetpolineJump(reg);
      } else {
        __ jmp(reg);
      }
      frame_access_state()->ClearSPDelta();
      frame_access_state()->SetFrameAccessToDefault();
      break;
    }
    case kArchCallJSFunction: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      Register func = i.InputRegister(0);
      if (FLAG_debug_code) {
        // Check the function's context matches the context argument.
        __ cmp(esi, FieldOperand(func, JSFunction::kContextOffset));
        __ Assert(equal, AbortReason::kWrongFunctionContext);
      }
      static_assert(kJavaScriptCallCodeStartRegister == ecx, "ABI mismatch");
      __ mov(ecx, FieldOperand(func, JSFunction::kCodeOffset));
      __ add(ecx, Immediate(Code::kHeaderSize - kHeapObjectTag));
      __ call(ecx);
      RecordCallPosition(instr);
      frame_access_state()->ClearSPDelta();
      break;
    }
    case kArchPrepareCallCFunction: {
      // Frame alignment requires using FP-relative frame addressing.
      frame_access_state()->SetFrameAccessToFP();
      int const num_parameters = MiscField::decode(instr->opcode());
      __ PrepareCallCFunction(num_parameters, i.TempRegister(0));
      break;
    }
    case kArchSaveCallerRegisters: {
      fp_mode_ =
          static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode()));
      DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
      // kReturnRegister0 should have been saved before entering the stub.
      int bytes = __ PushCallerSaved(fp_mode_, kReturnRegister0);
      DCHECK_EQ(0, bytes % kPointerSize);
      DCHECK_EQ(0, frame_access_state()->sp_delta());
      frame_access_state()->IncreaseSPDelta(bytes / kPointerSize);
      DCHECK(!caller_registers_saved_);
      caller_registers_saved_ = true;
      break;
    }
    case kArchRestoreCallerRegisters: {
      DCHECK(fp_mode_ ==
             static_cast<SaveFPRegsMode>(MiscField::decode(instr->opcode())));
      DCHECK(fp_mode_ == kDontSaveFPRegs || fp_mode_ == kSaveFPRegs);
      // Don't overwrite the returned value.
      int bytes = __ PopCallerSaved(fp_mode_, kReturnRegister0);
      frame_access_state()->IncreaseSPDelta(-(bytes / kPointerSize));
      DCHECK_EQ(0, frame_access_state()->sp_delta());
      DCHECK(caller_registers_saved_);
      caller_registers_saved_ = false;
      break;
    }
    case kArchPrepareTailCall:
      AssemblePrepareTailCall();
      break;
    case kArchCallCFunction: {
      MoveOperandIfAliasedWithPoisonRegister(instr, this);
      int const num_parameters = MiscField::decode(instr->opcode());
      if (HasImmediateInput(instr, 0)) {
        ExternalReference ref = i.InputExternalReference(0);
        __ CallCFunction(ref, num_parameters);
      } else {
        Register func = i.InputRegister(0);
        __ CallCFunction(func, num_parameters);
      }
      frame_access_state()->SetFrameAccessToDefault();
      // Ideally, we should decrement SP delta to match the change of stack
      // pointer in CallCFunction. However, for certain architectures (e.g.
      // ARM), there may be more strict alignment requirement, causing old SP
      // to be saved on the stack. In those cases, we can not calculate the SP
      // delta statically.
      frame_access_state()->ClearSPDelta();
      if (caller_registers_saved_) {
        // Need to re-sync SP delta introduced in kArchSaveCallerRegisters.
        // Here, we assume the sequence to be:
        //   kArchSaveCallerRegisters;
        //   kArchCallCFunction;
        //   kArchRestoreCallerRegisters;
        int bytes =
            __ RequiredStackSizeForCallerSaved(fp_mode_, kReturnRegister0);
        frame_access_state()->IncreaseSPDelta(bytes / kPointerSize);
      }
      break;
    }
    case kArchJmp:
      AssembleArchJump(i.InputRpo(0));
      break;
    case kArchBinarySearchSwitch:
      AssembleArchBinarySearchSwitch(instr);
      break;
    case kArchLookupSwitch:
      AssembleArchLookupSwitch(instr);
      break;
    case kArchTableSwitch:
      AssembleArchTableSwitch(instr);
      break;
    case kArchComment:
      __ RecordComment(reinterpret_cast<const char*>(i.InputInt32(0)));
      break;
    case kArchDebugAbort:
      DCHECK(i.InputRegister(0) == edx);
      if (!frame_access_state()->has_frame()) {
        // We don't actually want to generate a pile of code for this, so just
        // claim there is a stack frame, without generating one.
        FrameScope scope(tasm(), StackFrame::NONE);
        __ Call(isolate()->builtins()->builtin_handle(Builtins::kAbortJS),
                RelocInfo::CODE_TARGET);
      } else {
        __ Call(isolate()->builtins()->builtin_handle(Builtins::kAbortJS),
                RelocInfo::CODE_TARGET);
      }
      __ int3();
      break;
    case kArchDebugBreak:
      __ int3();
      break;
    case kArchNop:
    case kArchThrowTerminator:
      // don't emit code for nops.
      break;
    case kArchDeoptimize: {
      int deopt_state_id =
          BuildTranslation(instr, -1, 0, OutputFrameStateCombine::Ignore());
      CodeGenResult result =
          AssembleDeoptimizerCall(deopt_state_id, current_source_position_);
      if (result != kSuccess) return result;
      break;
    }
    case kArchRet:
      AssembleReturn(instr->InputAt(0));
      break;
    case kArchStackPointer:
      __ mov(i.OutputRegister(), esp);
      break;
    case kArchFramePointer:
      __ mov(i.OutputRegister(), ebp);
      break;
    case kArchParentFramePointer:
      if (frame_access_state()->has_frame()) {
        __ mov(i.OutputRegister(), Operand(ebp, 0));
      } else {
        __ mov(i.OutputRegister(), ebp);
      }
      break;
    case kArchTruncateDoubleToI: {
      auto result = i.OutputRegister();
      auto input = i.InputDoubleRegister(0);
      auto ool = new (zone()) OutOfLineTruncateDoubleToI(
          this, result, input, DetermineStubCallMode());
      __ cvttsd2si(result, Operand(input));
      __ cmp(result, 1);
      __ j(overflow, ool->entry());
      __ bind(ool->exit());
      break;
    }
    case kArchStoreWithWriteBarrier: {
      RecordWriteMode mode =
          static_cast<RecordWriteMode>(MiscField::decode(instr->opcode()));
      Register object = i.InputRegister(0);
      size_t index = 0;
      Operand operand = i.MemoryOperand(&index);
      Register value = i.InputRegister(index);
      Register scratch0 = i.TempRegister(0);
      Register scratch1 = i.TempRegister(1);
      auto ool = new (zone()) OutOfLineRecordWrite(this, object, operand, value,
                                                   scratch0, scratch1, mode);
      __ mov(operand, value);
      __ CheckPageFlag(object, scratch0,
                       MemoryChunk::kPointersFromHereAreInterestingMask,
                       not_zero, ool->entry());
      __ bind(ool->exit());
      break;
    }
    case kArchStackSlot: {
      FrameOffset offset =
          frame_access_state()->GetFrameOffset(i.InputInt32(0));
      Register base = offset.from_stack_pointer() ? esp : ebp;
      __ lea(i.OutputRegister(), Operand(base, offset.offset()));
      break;
    }
    case kIeee754Float64Acos:
      ASSEMBLE_IEEE754_UNOP(acos);
      break;
    case kIeee754Float64Acosh:
      ASSEMBLE_IEEE754_UNOP(acosh);
      break;
    case kIeee754Float64Asin:
      ASSEMBLE_IEEE754_UNOP(asin);
      break;
    case kIeee754Float64Asinh:
      ASSEMBLE_IEEE754_UNOP(asinh);
      break;
    case kIeee754Float64Atan:
      ASSEMBLE_IEEE754_UNOP(atan);
      break;
    case kIeee754Float64Atanh:
      ASSEMBLE_IEEE754_UNOP(atanh);
      break;
    case kIeee754Float64Atan2:
      ASSEMBLE_IEEE754_BINOP(atan2);
      break;
    case kIeee754Float64Cbrt:
      ASSEMBLE_IEEE754_UNOP(cbrt);
      break;
    case kIeee754Float64Cos:
      ASSEMBLE_IEEE754_UNOP(cos);
      break;
    case kIeee754Float64Cosh:
      ASSEMBLE_IEEE754_UNOP(cosh);
      break;
    case kIeee754Float64Expm1:
      ASSEMBLE_IEEE754_UNOP(expm1);
      break;
    case kIeee754Float64Exp:
      ASSEMBLE_IEEE754_UNOP(exp);
      break;
    case kIeee754Float64Log:
      ASSEMBLE_IEEE754_UNOP(log);
      break;
    case kIeee754Float64Log1p:
      ASSEMBLE_IEEE754_UNOP(log1p);
      break;
    case kIeee754Float64Log2:
      ASSEMBLE_IEEE754_UNOP(log2);
      break;
    case kIeee754Float64Log10:
      ASSEMBLE_IEEE754_UNOP(log10);
      break;
    case kIeee754Float64Pow: {
      // TODO(bmeurer): Improve integration of the stub.
      if (i.InputDoubleRegister(1) != xmm2) {
        __ movaps(xmm2, i.InputDoubleRegister(0));
        __ movaps(xmm1, i.InputDoubleRegister(1));
      } else {
        __ movaps(xmm0, i.InputDoubleRegister(0));
        __ movaps(xmm1, xmm2);
        __ movaps(xmm2, xmm0);
      }
      __ Call(BUILTIN_CODE(isolate(), MathPowInternal), RelocInfo::CODE_TARGET);
      __ movaps(i.OutputDoubleRegister(), xmm3);
      break;
    }
    case kIeee754Float64Sin:
      ASSEMBLE_IEEE754_UNOP(sin);
      break;
    case kIeee754Float64Sinh:
      ASSEMBLE_IEEE754_UNOP(sinh);
      break;
    case kIeee754Float64Tan:
      ASSEMBLE_IEEE754_UNOP(tan);
      break;
    case kIeee754Float64Tanh:
      ASSEMBLE_IEEE754_UNOP(tanh);
      break;
    case kIA32Add:
      ASSEMBLE_BINOP(add);
      break;
    case kIA32And:
      ASSEMBLE_BINOP(and_);
      break;
    case kIA32Cmp:
      ASSEMBLE_COMPARE(cmp);
      break;
    case kIA32Cmp16:
      ASSEMBLE_COMPARE(cmpw);
      break;
    case kIA32Cmp8:
      ASSEMBLE_COMPARE(cmpb);
      break;
    case kIA32Test:
      ASSEMBLE_COMPARE(test);
      break;
    case kIA32Test16:
      ASSEMBLE_COMPARE(test_w);
      break;
    case kIA32Test8:
      ASSEMBLE_COMPARE(test_b);
      break;
    case kIA32Imul:
      if (HasImmediateInput(instr, 1)) {
        __ imul(i.OutputRegister(), i.InputOperand(0), i.InputInt32(1));
      } else {
        __ imul(i.OutputRegister(), i.InputOperand(1));
      }
      break;
    case kIA32ImulHigh:
      __ imul(i.InputRegister(1));
      break;
    case kIA32UmulHigh:
      __ mul(i.InputRegister(1));
      break;
    case kIA32Idiv:
      __ cdq();
      __ idiv(i.InputOperand(1));
      break;
    case kIA32Udiv:
      __ Move(edx, Immediate(0));
      __ div(i.InputOperand(1));
      break;
    case kIA32Not:
      __ not_(i.OutputOperand());
      break;
    case kIA32Neg:
      __ neg(i.OutputOperand());
      break;
    case kIA32Or:
      ASSEMBLE_BINOP(or_);
      break;
    case kIA32Xor:
      ASSEMBLE_BINOP(xor_);
      break;
    case kIA32Sub:
      ASSEMBLE_BINOP(sub);
      break;
    case kIA32Shl:
      if (HasImmediateInput(instr, 1)) {
        __ shl(i.OutputOperand(), i.InputInt5(1));
      } else {
        __ shl_cl(i.OutputOperand());
      }
      break;
    case kIA32Shr:
      if (HasImmediateInput(instr, 1)) {
        __ shr(i.OutputOperand(), i.InputInt5(1));
      } else {
        __ shr_cl(i.OutputOperand());
      }
      break;
    case kIA32Sar:
      if (HasImmediateInput(instr, 1)) {
        __ sar(i.OutputOperand(), i.InputInt5(1));
      } else {
        __ sar_cl(i.OutputOperand());
      }
      break;
    case kIA32AddPair: {
      // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
      // i.InputRegister(1) ... left high word.
      // i.InputRegister(2) ... right low word.
      // i.InputRegister(3) ... right high word.
      bool use_temp = false;
      if (i.OutputRegister(0).code() == i.InputRegister(1).code() ||
          i.OutputRegister(0).code() == i.InputRegister(3).code()) {
        // We cannot write to the output register directly, because it would
        // overwrite an input for adc. We have to use the temp register.
        use_temp = true;
        __ Move(i.TempRegister(0), i.InputRegister(0));
        __ add(i.TempRegister(0), i.InputRegister(2));
      } else {
        __ add(i.OutputRegister(0), i.InputRegister(2));
      }
      if (i.OutputRegister(1).code() != i.InputRegister(1).code()) {
        __ Move(i.OutputRegister(1), i.InputRegister(1));
      }
      __ adc(i.OutputRegister(1), Operand(i.InputRegister(3)));
      if (use_temp) {
        __ Move(i.OutputRegister(0), i.TempRegister(0));
      }
      break;
    }
    case kIA32SubPair: {
      // i.OutputRegister(0) == i.InputRegister(0) ... left low word.
      // i.InputRegister(1) ... left high word.
      // i.InputRegister(2) ... right low word.
      // i.InputRegister(3) ... right high word.
      bool use_temp = false;
      if (i.OutputRegister(0).code() == i.InputRegister(1).code() ||
          i.OutputRegister(0).code() == i.InputRegister(3).code()) {
        // We cannot write to the output register directly, because it would
        // overwrite an input for adc. We have to use the temp register.
        use_temp = true;
        __ Move(i.TempRegister(0), i.InputRegister(0));
        __ sub(i.TempRegister(0), i.InputRegister(2));
      } else {
        __ sub(i.OutputRegister(0), i.InputRegister(2));
      }
      if (i.OutputRegister(1).code() != i.InputRegister(1).code()) {
        __ Move(i.OutputRegister(1), i.InputRegister(1));
      }
      __ sbb(i.OutputRegister(1), Operand(i.InputRegister(3)));
      if (use_temp) {
        __ Move(i.OutputRegister(0), i.TempRegister(0));
      }
      break;
    }
    case kIA32MulPair: {
      __ imul(i.OutputRegister(1), i.InputOperand(0));
      __ mov(i.TempRegister(0), i.InputOperand(1));
      __ imul(i.TempRegister(0), i.InputOperand(2));
      __ add(i.OutputRegister(1), i.TempRegister(0));
      __ mov(i.OutputRegister(0), i.InputOperand(0));
      // Multiplies the low words and stores them in eax and edx.
      __ mul(i.InputRegister(2));
      __ add(i.OutputRegister(1), i.TempRegister(0));

      break;
    }
    case kIA32ShlPair:
      if (HasImmediateInput(instr, 2)) {
        __ ShlPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
      } else {
        // Shift has been loaded into CL by the register allocator.
        __ ShlPair_cl(i.InputRegister(1), i.InputRegister(0));
      }
      break;
    case kIA32ShrPair:
      if (HasImmediateInput(instr, 2)) {
        __ ShrPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
      } else {
        // Shift has been loaded into CL by the register allocator.
        __ ShrPair_cl(i.InputRegister(1), i.InputRegister(0));
      }
      break;
    case kIA32SarPair:
      if (HasImmediateInput(instr, 2)) {
        __ SarPair(i.InputRegister(1), i.InputRegister(0), i.InputInt6(2));
      } else {
        // Shift has been loaded into CL by the register allocator.
        __ SarPair_cl(i.InputRegister(1), i.InputRegister(0));
      }
      break;
    case kIA32Ror:
      if (HasImmediateInput(instr, 1)) {
        __ ror(i.OutputOperand(), i.InputInt5(1));
      } else {
        __ ror_cl(i.OutputOperand());
      }
      break;
    case kIA32Lzcnt:
      __ Lzcnt(i.OutputRegister(), i.InputOperand(0));
      break;
    case kIA32Tzcnt:
      __ Tzcnt(i.OutputRegister(), i.InputOperand(0));
      break;
    case kIA32Popcnt:
      __ Popcnt(i.OutputRegister(), i.InputOperand(0));
      break;
    case kIA32Bswap:
      __ bswap(i.OutputRegister());
      break;
    case kArchWordPoisonOnSpeculation:
      DCHECK_EQ(i.OutputRegister(), i.InputRegister(0));
      __ and_(i.InputRegister(0), kSpeculationPoisonRegister);
      break;
    case kLFence:
      __ lfence();
      break;
    case kSSEFloat32Cmp:
      __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat32Add:
      __ addss(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat32Sub:
      __ subss(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat32Mul:
      __ mulss(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat32Div:
      __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulss depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat32Sqrt:
      __ sqrtss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat32Abs: {
      // TODO(bmeurer): Use 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrlq(kScratchDoubleReg, 33);
      __ andps(i.OutputDoubleRegister(), kScratchDoubleReg);
      break;
    }
    case kSSEFloat32Neg: {
      // TODO(bmeurer): Use 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psllq(kScratchDoubleReg, 31);
      __ xorps(i.OutputDoubleRegister(), kScratchDoubleReg);
      break;
    }
    case kSSEFloat32Round: {
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      RoundingMode const mode =
          static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
      __ roundss(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
      break;
    }
    case kSSEFloat64Cmp:
      __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat64Add:
      __ addsd(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat64Sub:
      __ subsd(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat64Mul:
      __ mulsd(i.InputDoubleRegister(0), i.InputOperand(1));
      break;
    case kSSEFloat64Div:
      __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulsd depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    case kSSEFloat32Max: {
      Label compare_nan, compare_swap, done_compare;
      if (instr->InputAt(1)->IsFPRegister()) {
        __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      auto ool =
          new (zone()) OutOfLineLoadFloat32NaN(this, i.OutputDoubleRegister());
      __ j(parity_even, ool->entry());
      __ j(above, &done_compare, Label::kNear);
      __ j(below, &compare_swap, Label::kNear);
      __ movmskps(i.TempRegister(0), i.InputDoubleRegister(0));
      __ test(i.TempRegister(0), Immediate(1));
      __ j(zero, &done_compare, Label::kNear);
      __ bind(&compare_swap);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      __ bind(&done_compare);
      __ bind(ool->exit());
      break;
    }

    case kSSEFloat64Max: {
      Label compare_nan, compare_swap, done_compare;
      if (instr->InputAt(1)->IsFPRegister()) {
        __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      auto ool =
          new (zone()) OutOfLineLoadFloat64NaN(this, i.OutputDoubleRegister());
      __ j(parity_even, ool->entry());
      __ j(above, &done_compare, Label::kNear);
      __ j(below, &compare_swap, Label::kNear);
      __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(0));
      __ test(i.TempRegister(0), Immediate(1));
      __ j(zero, &done_compare, Label::kNear);
      __ bind(&compare_swap);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      __ bind(&done_compare);
      __ bind(ool->exit());
      break;
    }
    case kSSEFloat32Min: {
      Label compare_swap, done_compare;
      if (instr->InputAt(1)->IsFPRegister()) {
        __ ucomiss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ ucomiss(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      auto ool =
          new (zone()) OutOfLineLoadFloat32NaN(this, i.OutputDoubleRegister());
      __ j(parity_even, ool->entry());
      __ j(below, &done_compare, Label::kNear);
      __ j(above, &compare_swap, Label::kNear);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movmskps(i.TempRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movss(kScratchDoubleReg, i.InputOperand(1));
        __ movmskps(i.TempRegister(0), kScratchDoubleReg);
      }
      __ test(i.TempRegister(0), Immediate(1));
      __ j(zero, &done_compare, Label::kNear);
      __ bind(&compare_swap);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movss(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movss(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      __ bind(&done_compare);
      __ bind(ool->exit());
      break;
    }
    case kSSEFloat64Min: {
      Label compare_swap, done_compare;
      if (instr->InputAt(1)->IsFPRegister()) {
        __ ucomisd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ ucomisd(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      auto ool =
          new (zone()) OutOfLineLoadFloat64NaN(this, i.OutputDoubleRegister());
      __ j(parity_even, ool->entry());
      __ j(below, &done_compare, Label::kNear);
      __ j(above, &compare_swap, Label::kNear);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movmskpd(i.TempRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movsd(kScratchDoubleReg, i.InputOperand(1));
        __ movmskpd(i.TempRegister(0), kScratchDoubleReg);
      }
      __ test(i.TempRegister(0), Immediate(1));
      __ j(zero, &done_compare, Label::kNear);
      __ bind(&compare_swap);
      if (instr->InputAt(1)->IsFPRegister()) {
        __ movsd(i.InputDoubleRegister(0), i.InputDoubleRegister(1));
      } else {
        __ movsd(i.InputDoubleRegister(0), i.InputOperand(1));
      }
      __ bind(&done_compare);
      __ bind(ool->exit());
      break;
    }
    case kSSEFloat64Mod: {
      // TODO(dcarney): alignment is wrong.
      __ sub(esp, Immediate(kDoubleSize));
      // Move values to st(0) and st(1).
      __ movsd(Operand(esp, 0), i.InputDoubleRegister(1));
      __ fld_d(Operand(esp, 0));
      __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
      __ fld_d(Operand(esp, 0));
      // Loop while fprem isn't done.
      Label mod_loop;
      __ bind(&mod_loop);
      // This instructions traps on all kinds inputs, but we are assuming the
      // floating point control word is set to ignore them all.
      __ fprem();
      // The following 2 instruction implicitly use eax.
      __ fnstsw_ax();
      __ sahf();
      __ j(parity_even, &mod_loop);
      // Move output to stack and clean up.
      __ fstp(1);
      __ fstp_d(Operand(esp, 0));
      __ movsd(i.OutputDoubleRegister(), Operand(esp, 0));
      __ add(esp, Immediate(kDoubleSize));
      break;
    }
    case kSSEFloat64Abs: {
      // TODO(bmeurer): Use 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrlq(kScratchDoubleReg, 1);
      __ andpd(i.OutputDoubleRegister(), kScratchDoubleReg);
      break;
    }
    case kSSEFloat64Neg: {
      // TODO(bmeurer): Use 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psllq(kScratchDoubleReg, 63);
      __ xorpd(i.OutputDoubleRegister(), kScratchDoubleReg);
      break;
    }
    case kSSEFloat64Sqrt:
      __ sqrtsd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64Round: {
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      RoundingMode const mode =
          static_cast<RoundingMode>(MiscField::decode(instr->opcode()));
      __ roundsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0), mode);
      break;
    }
    case kSSEFloat32ToFloat64:
      __ cvtss2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64ToFloat32:
      __ cvtsd2ss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat32ToInt32:
      __ cvttss2si(i.OutputRegister(), i.InputOperand(0));
      break;
    case kSSEFloat32ToUint32:
      __ Cvttss2ui(i.OutputRegister(), i.InputOperand(0), kScratchDoubleReg);
      break;
    case kSSEFloat64ToInt32:
      __ cvttsd2si(i.OutputRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64ToUint32:
      __ Cvttsd2ui(i.OutputRegister(), i.InputOperand(0), kScratchDoubleReg);
      break;
    case kSSEInt32ToFloat32:
      __ cvtsi2ss(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEUint32ToFloat32:
      __ Cvtui2ss(i.OutputDoubleRegister(), i.InputOperand(0),
                  i.TempRegister(0));
      break;
    case kSSEInt32ToFloat64:
      __ cvtsi2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEUint32ToFloat64:
      __ Cvtui2sd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kSSEFloat64ExtractLowWord32:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        __ mov(i.OutputRegister(), i.InputOperand(0));
      } else {
        __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
      }
      break;
    case kSSEFloat64ExtractHighWord32:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        __ mov(i.OutputRegister(), i.InputOperand(0, kDoubleSize / 2));
      } else {
        __ Pextrd(i.OutputRegister(), i.InputDoubleRegister(0), 1);
      }
      break;
    case kSSEFloat64InsertLowWord32:
      __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 0, true);
      break;
    case kSSEFloat64InsertHighWord32:
      __ Pinsrd(i.OutputDoubleRegister(), i.InputOperand(1), 1, true);
      break;
    case kSSEFloat64LoadLowWord32:
      __ movd(i.OutputDoubleRegister(), i.InputOperand(0));
      break;
    case kAVXFloat32Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat32Div: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulss depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat64Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      break;
    }
    case kAVXFloat64Div: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                i.InputOperand(1));
      // Don't delete this mov. It may improve performance on some CPUs,
      // when there is a (v)mulsd depending on the result.
      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
      break;
    }
    case kAVXFloat32Abs: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrlq(kScratchDoubleReg, 33);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vandps(i.OutputDoubleRegister(), kScratchDoubleReg, i.InputOperand(0));
      break;
    }
    case kAVXFloat32Neg: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psllq(kScratchDoubleReg, 31);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vxorps(i.OutputDoubleRegister(), kScratchDoubleReg, i.InputOperand(0));
      break;
    }
    case kAVXFloat64Abs: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrlq(kScratchDoubleReg, 1);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vandpd(i.OutputDoubleRegister(), kScratchDoubleReg, i.InputOperand(0));
      break;
    }
    case kAVXFloat64Neg: {
      // TODO(bmeurer): Use RIP relative 128-bit constants.
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psllq(kScratchDoubleReg, 63);
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vxorpd(i.OutputDoubleRegister(), kScratchDoubleReg, i.InputOperand(0));
      break;
    }
    case kSSEFloat64SilenceNaN:
      __ xorpd(kScratchDoubleReg, kScratchDoubleReg);
      __ subsd(i.InputDoubleRegister(0), kScratchDoubleReg);
      break;
    case kIA32Movsxbl:
      ASSEMBLE_MOVX(movsx_b);
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    case kIA32Movzxbl:
      ASSEMBLE_MOVX(movzx_b);
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    case kIA32Movb: {
      size_t index = 0;
      Operand operand = i.MemoryOperand(&index);
      if (HasImmediateInput(instr, index)) {
        __ mov_b(operand, i.InputInt8(index));
      } else {
        __ mov_b(operand, i.InputRegister(index));
      }
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    }
    case kIA32Movsxwl:
      ASSEMBLE_MOVX(movsx_w);
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    case kIA32Movzxwl:
      ASSEMBLE_MOVX(movzx_w);
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    case kIA32Movw: {
      size_t index = 0;
      Operand operand = i.MemoryOperand(&index);
      if (HasImmediateInput(instr, index)) {
        __ mov_w(operand, i.InputInt16(index));
      } else {
        __ mov_w(operand, i.InputRegister(index));
      }
      EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      break;
    }
    case kIA32Movl:
      if (instr->HasOutput()) {
        __ mov(i.OutputRegister(), i.MemoryOperand());
        EmitWordLoadPoisoningIfNeeded(this, opcode, instr, i);
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        if (HasImmediateInput(instr, index)) {
          __ mov(operand, i.InputImmediate(index));
        } else {
          __ mov(operand, i.InputRegister(index));
        }
      }
      break;
    case kIA32Movsd:
      if (instr->HasOutput()) {
        __ movsd(i.OutputDoubleRegister(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ movsd(operand, i.InputDoubleRegister(index));
      }
      break;
    case kIA32Movss:
      if (instr->HasOutput()) {
        __ movss(i.OutputDoubleRegister(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ movss(operand, i.InputDoubleRegister(index));
      }
      break;
    case kIA32Movdqu:
      if (instr->HasOutput()) {
        __ Movdqu(i.OutputSimd128Register(), i.MemoryOperand());
      } else {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ Movdqu(operand, i.InputSimd128Register(index));
      }
      break;
    case kIA32BitcastFI:
      if (instr->InputAt(0)->IsFPStackSlot()) {
        __ mov(i.OutputRegister(), i.InputOperand(0));
      } else {
        __ movd(i.OutputRegister(), i.InputDoubleRegister(0));
      }
      break;
    case kIA32BitcastIF:
      if (instr->InputAt(0)->IsRegister()) {
        __ movd(i.OutputDoubleRegister(), i.InputRegister(0));
      } else {
        __ movss(i.OutputDoubleRegister(), i.InputOperand(0));
      }
      break;
    case kIA32Lea: {
      AddressingMode mode = AddressingModeField::decode(instr->opcode());
      // Shorten "leal" to "addl", "subl" or "shll" if the register allocation
      // and addressing mode just happens to work out. The "addl"/"subl" forms
      // in these cases are faster based on measurements.
      if (mode == kMode_MI) {
        __ Move(i.OutputRegister(), Immediate(i.InputInt32(0)));
      } else if (i.InputRegister(0) == i.OutputRegister()) {
        if (mode == kMode_MRI) {
          int32_t constant_summand = i.InputInt32(1);
          if (constant_summand > 0) {
            __ add(i.OutputRegister(), Immediate(constant_summand));
          } else if (constant_summand < 0) {
            __ sub(i.OutputRegister(), Immediate(-constant_summand));
          }
        } else if (mode == kMode_MR1) {
          if (i.InputRegister(1) == i.OutputRegister()) {
            __ shl(i.OutputRegister(), 1);
          } else {
            __ add(i.OutputRegister(), i.InputRegister(1));
          }
        } else if (mode == kMode_M2) {
          __ shl(i.OutputRegister(), 1);
        } else if (mode == kMode_M4) {
          __ shl(i.OutputRegister(), 2);
        } else if (mode == kMode_M8) {
          __ shl(i.OutputRegister(), 3);
        } else {
          __ lea(i.OutputRegister(), i.MemoryOperand());
        }
      } else if (mode == kMode_MR1 &&
                 i.InputRegister(1) == i.OutputRegister()) {
        __ add(i.OutputRegister(), i.InputRegister(0));
      } else {
        __ lea(i.OutputRegister(), i.MemoryOperand());
      }
      break;
    }
    case kIA32PushFloat32:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ sub(esp, Immediate(kFloatSize));
        __ movss(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kFloatSize / kPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ Move(kScratchDoubleReg, i.InputFloat32(0));
        __ sub(esp, Immediate(kFloatSize));
        __ movss(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kPointerSize);
      } else {
        __ movss(kScratchDoubleReg, i.InputOperand(0));
        __ sub(esp, Immediate(kFloatSize));
        __ movss(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kPointerSize);
      }
      break;
    case kIA32PushFloat64:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ sub(esp, Immediate(kDoubleSize));
        __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ Move(kScratchDoubleReg, i.InputDouble(0));
        __ sub(esp, Immediate(kDoubleSize));
        __ movsd(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kPointerSize);
      } else {
        __ movsd(kScratchDoubleReg, i.InputOperand(0));
        __ sub(esp, Immediate(kDoubleSize));
        __ movsd(Operand(esp, 0), kScratchDoubleReg);
        frame_access_state()->IncreaseSPDelta(kDoubleSize / kPointerSize);
      }
      break;
    case kIA32PushSimd128:
      if (instr->InputAt(0)->IsFPRegister()) {
        __ sub(esp, Immediate(kSimd128Size));
        __ movups(Operand(esp, 0), i.InputSimd128Register(0));
      } else {
        __ movups(kScratchDoubleReg, i.InputOperand(0));
        __ sub(esp, Immediate(kSimd128Size));
        __ movups(Operand(esp, 0), kScratchDoubleReg);
      }
      frame_access_state()->IncreaseSPDelta(kSimd128Size / kPointerSize);
      break;
    case kIA32Push:
      if (AddressingModeField::decode(instr->opcode()) != kMode_None) {
        size_t index = 0;
        Operand operand = i.MemoryOperand(&index);
        __ push(operand);
        frame_access_state()->IncreaseSPDelta(kFloatSize / kPointerSize);
      } else if (instr->InputAt(0)->IsFPRegister()) {
        __ sub(esp, Immediate(kFloatSize));
        __ movsd(Operand(esp, 0), i.InputDoubleRegister(0));
        frame_access_state()->IncreaseSPDelta(kFloatSize / kPointerSize);
      } else if (HasImmediateInput(instr, 0)) {
        __ push(i.InputImmediate(0));
        frame_access_state()->IncreaseSPDelta(1);
      } else {
        __ push(i.InputOperand(0));
        frame_access_state()->IncreaseSPDelta(1);
      }
      break;
    case kIA32Poke: {
      int slot = MiscField::decode(instr->opcode());
      if (HasImmediateInput(instr, 0)) {
        __ mov(Operand(esp, slot * kPointerSize), i.InputImmediate(0));
      } else {
        __ mov(Operand(esp, slot * kPointerSize), i.InputRegister(0));
      }
      break;
    }
    case kIA32Peek: {
      int reverse_slot = i.InputInt32(0) + 1;
      int offset =
          FrameSlotToFPOffset(frame()->GetTotalFrameSlotCount() - reverse_slot);
      if (instr->OutputAt(0)->IsFPRegister()) {
        LocationOperand* op = LocationOperand::cast(instr->OutputAt(0));
        if (op->representation() == MachineRepresentation::kFloat64) {
          __ movsd(i.OutputDoubleRegister(), Operand(ebp, offset));
        } else {
          DCHECK_EQ(MachineRepresentation::kFloat32, op->representation());
          __ movss(i.OutputFloatRegister(), Operand(ebp, offset));
        }
      } else {
        __ mov(i.OutputRegister(), Operand(ebp, offset));
      }
      break;
    }
    case kSSEF32x4Splat: {
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputSimd128Register();
      __ shufps(dst, dst, 0x0);
      break;
    }
    case kAVXF32x4Splat: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src = i.InputFloatRegister(0);
      __ vshufps(i.OutputSimd128Register(), src, src, 0x0);
      break;
    }
    case kSSEF32x4ExtractLane: {
      DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
      XMMRegister dst = i.OutputFloatRegister();
      int8_t lane = i.InputInt8(1);
      if (lane != 0) {
        DCHECK_LT(lane, 4);
        __ shufps(dst, dst, lane);
      }
      break;
    }
    case kAVXF32x4ExtractLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputFloatRegister();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1);
      if (lane == 0) {
        if (dst != src) __ vmovaps(dst, src);
      } else {
        DCHECK_LT(lane, 4);
        __ vshufps(dst, src, src, lane);
      }
      break;
    }
    case kSSEF32x4ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ insertps(i.OutputSimd128Register(), i.InputOperand(2),
                  i.InputInt8(1) << 4);
      break;
    }
    case kAVXF32x4ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vinsertps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(2), i.InputInt8(1) << 4);
      break;
    }
    case kIA32F32x4SConvertI32x4: {
      __ Cvtdq2ps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kSSEF32x4UConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);      // zeros
      __ pblendw(kScratchDoubleReg, dst, 0x55);           // get lo 16 bits
      __ psubd(dst, kScratchDoubleReg);                   // get hi 16 bits
      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
      __ psrld(dst, 1);                  // divide by 2 to get in unsigned range
      __ cvtdq2ps(dst, dst);             // convert hi exactly
      __ addps(dst, dst);                // double hi, exactly
      __ addps(dst, kScratchDoubleReg);  // add hi and lo, may round.
      break;
    }
    case kAVXF32x4UConvertI32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg,
               kScratchDoubleReg);  // zeros
      __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, src,
                  0x55);                                   // get lo 16 bits
      __ vpsubd(dst, src, kScratchDoubleReg);              // get hi 16 bits
      __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // convert lo exactly
      __ vpsrld(dst, dst, 1);    // divide by 2 to get in unsigned range
      __ vcvtdq2ps(dst, dst);    // convert hi exactly
      __ vaddps(dst, dst, dst);  // double hi, exactly
      __ vaddps(dst, dst, kScratchDoubleReg);  // add hi and lo, may round.
      break;
    }
    case kSSEF32x4Abs: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ psrld(kScratchDoubleReg, 1);
        __ andps(dst, kScratchDoubleReg);
      } else {
        __ pcmpeqd(dst, dst);
        __ psrld(dst, 1);
        __ andps(dst, src);
      }
      break;
    }
    case kAVXF32x4Abs: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);
      __ vandps(i.OutputSimd128Register(), kScratchDoubleReg,
                i.InputOperand(0));
      break;
    }
    case kSSEF32x4Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ pslld(kScratchDoubleReg, 31);
        __ xorps(dst, kScratchDoubleReg);
      } else {
        __ pcmpeqd(dst, dst);
        __ pslld(dst, 31);
        __ xorps(dst, src);
      }
      break;
    }
    case kAVXF32x4Neg: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpslld(kScratchDoubleReg, kScratchDoubleReg, 31);
      __ vxorps(i.OutputSimd128Register(), kScratchDoubleReg,
                i.InputOperand(0));
      break;
    }
    case kIA32F32x4RecipApprox: {
      __ Rcpps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32F32x4RecipSqrtApprox: {
      __ Rsqrtps(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kSSEF32x4Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ addps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE3);
      __ haddps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vhaddps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEF32x4Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ subps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vsubps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ mulps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmulps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Min: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ minps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Min: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vminps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Max: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ maxps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Max: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vmaxps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEF32x4Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpeqps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpeqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEF32x4Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpneqps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpneqps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
      break;
    }
    case kSSEF32x4Lt: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpltps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Lt: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpltps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEF32x4Le: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ cmpleps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXF32x4Le: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vcmpleps(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kIA32I32x4Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
      __ Pshufd(dst, dst, 0x0);
      break;
    }
    case kIA32I32x4ExtractLane: {
      __ Pextrd(i.OutputRegister(), i.InputSimd128Register(0), i.InputInt8(1));
      break;
    }
    case kSSEI32x4ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pinsrd(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kAVXI32x4ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpinsrd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kSSEI32x4SConvertF32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      // NAN->0
      __ movaps(kScratchDoubleReg, dst);
      __ cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
      __ pand(dst, kScratchDoubleReg);
      // Set top bit if >= 0 (but not -0.0!)
      __ pxor(kScratchDoubleReg, dst);
      // Convert
      __ cvttps2dq(dst, dst);
      // Set top bit if >=0 is now < 0
      __ pand(kScratchDoubleReg, dst);
      __ psrad(kScratchDoubleReg, 31);
      // Set positive overflow lanes to 0x7FFFFFFF
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI32x4SConvertF32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      // NAN->0
      __ vcmpeqps(kScratchDoubleReg, src, src);
      __ vpand(dst, src, kScratchDoubleReg);
      // Set top bit if >= 0 (but not -0.0!)
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, dst);
      // Convert
      __ vcvttps2dq(dst, dst);
      // Set top bit if >=0 is now < 0
      __ vpand(kScratchDoubleReg, kScratchDoubleReg, dst);
      __ vpsrad(kScratchDoubleReg, kScratchDoubleReg, 31);
      // Set positive overflow lanes to 0x7FFFFFFF
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kIA32I32x4SConvertI16x8Low: {
      __ Pmovsxwd(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I32x4SConvertI16x8High: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovsxwd(dst, dst);
      break;
    }
    case kIA32I32x4Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ Psignd(dst, kScratchDoubleReg);
      } else {
        __ Pxor(dst, dst);
        __ Psubd(dst, src);
      }
      break;
    }
    case kSSEI32x4Shl: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pslld(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI32x4Shl: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpslld(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI32x4ShrS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psrad(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI32x4ShrS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsrad(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI32x4Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI32x4AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      __ phaddd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vphaddd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI32x4Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmulld(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmulld(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MinS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminsd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MinS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MaxS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxsd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MaxS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxsd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI32x4Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
      break;
    }
    case kAVXI32x4Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
               kScratchDoubleReg);
      break;
    }
    case kSSEI32x4GtS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpgtd(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4GtS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpgtd(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI32x4GeS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminsd(dst, src);
      __ pcmpeqd(dst, src);
      break;
    }
    case kAVXI32x4GeS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminsd(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kSSEI32x4UConvertF32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
      // NAN->0, negative->0
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
      __ maxps(dst, kScratchDoubleReg);
      // scratch: float representation of max_signed
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrld(kScratchDoubleReg, 1);                     // 0x7fffffff
      __ cvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
      // tmp: convert (src-max_signed).
      // Positive overflow lanes -> 0x7FFFFFFF
      // Negative lanes -> 0
      __ movaps(tmp, dst);
      __ subps(tmp, kScratchDoubleReg);
      __ cmpleps(kScratchDoubleReg, tmp);
      __ cvttps2dq(tmp, tmp);
      __ pxor(tmp, kScratchDoubleReg);
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
      __ pmaxsd(tmp, kScratchDoubleReg);
      // convert. Overflow lanes above max_signed will be 0x80000000
      __ cvttps2dq(dst, dst);
      // Add (src-max_signed) for overflow lanes.
      __ paddd(dst, tmp);
      break;
    }
    case kAVXI32x4UConvertF32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
      // NAN->0, negative->0
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vmaxps(dst, dst, kScratchDoubleReg);
      // scratch: float representation of max_signed
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);  // 0x7fffffff
      __ vcvtdq2ps(kScratchDoubleReg, kScratchDoubleReg);  // 0x4f000000
      // tmp: convert (src-max_signed).
      // Positive overflow lanes -> 0x7FFFFFFF
      // Negative lanes -> 0
      __ vsubps(tmp, dst, kScratchDoubleReg);
      __ vcmpleps(kScratchDoubleReg, kScratchDoubleReg, tmp);
      __ vcvttps2dq(tmp, tmp);
      __ vpxor(tmp, tmp, kScratchDoubleReg);
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpmaxsd(tmp, tmp, kScratchDoubleReg);
      // convert. Overflow lanes above max_signed will be 0x80000000
      __ vcvttps2dq(dst, dst);
      // Add (src-max_signed) for overflow lanes.
      __ vpaddd(dst, dst, tmp);
      break;
    }
    case kIA32I32x4UConvertI16x8Low: {
      __ Pmovzxwd(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I32x4UConvertI16x8High: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovzxwd(dst, dst);
      break;
    }
    case kSSEI32x4ShrU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psrld(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI32x4ShrU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsrld(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI32x4MinU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminud(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MinU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4MaxU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxud(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI32x4MaxU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxud(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI32x4GtU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pmaxud(dst, src);
      __ pcmpeqd(dst, src);
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI32x4GtU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpmaxud(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(dst, kScratchDoubleReg, src2);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI32x4GeU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminud(dst, src);
      __ pcmpeqd(dst, src);
      break;
    }
    case kAVXI32x4GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminud(kScratchDoubleReg, src1, src2);
      __ vpcmpeqd(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kIA32I16x8Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
      __ Pshuflw(dst, dst, 0x0);
      __ Pshufd(dst, dst, 0x0);
      break;
    }
    case kIA32I16x8ExtractLane: {
      Register dst = i.OutputRegister();
      __ Pextrw(dst, i.InputSimd128Register(0), i.InputInt8(1));
      __ movsx_w(dst, dst);
      break;
    }
    case kSSEI16x8ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pinsrw(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kAVXI16x8ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpinsrw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kIA32I16x8SConvertI8x16Low: {
      __ Pmovsxbw(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I16x8SConvertI8x16High: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovsxbw(dst, dst);
      break;
    }
    case kIA32I16x8Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ Psignw(dst, kScratchDoubleReg);
      } else {
        __ Pxor(dst, dst);
        __ Psubw(dst, src);
      }
      break;
    }
    case kSSEI16x8Shl: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psllw(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI16x8Shl: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsllw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI16x8ShrS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psraw(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI16x8ShrS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsraw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI16x8SConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ packssdw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8SConvertI32x4: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpackssdw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
      break;
    }
    case kSSEI16x8Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI16x8AddSaturateS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddSaturateS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8AddHoriz: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSSE3);
      __ phaddw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddHoriz: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vphaddw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI16x8SubSaturateS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8SubSaturateS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8Mul: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pmullw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmullw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8MinS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pminsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8MinS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8MaxS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pmaxsw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8MaxS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxsw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
      break;
    }
    case kAVXI16x8Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
               kScratchDoubleReg);
      break;
    }
    case kSSEI16x8GtS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpgtw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8GtS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpgtw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8GeS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminsw(dst, src);
      __ pcmpeqw(dst, src);
      break;
    }
    case kAVXI16x8GeS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminsw(kScratchDoubleReg, src1, src2);
      __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kIA32I16x8UConvertI8x16Low: {
      __ Pmovzxbw(i.OutputSimd128Register(), i.InputOperand(0));
      break;
    }
    case kIA32I16x8UConvertI8x16High: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Palignr(dst, i.InputOperand(0), 8);
      __ Pmovzxbw(dst, dst);
      break;
    }
    case kSSEI16x8ShrU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psrlw(i.OutputSimd128Register(), i.InputInt8(1));
      break;
    }
    case kAVXI16x8ShrU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsrlw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputInt8(1));
      break;
    }
    case kSSEI16x8UConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      // Change negative lanes to 0x7FFFFFFF
      __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
      __ psrld(kScratchDoubleReg, 1);
      __ pminud(dst, kScratchDoubleReg);
      __ pminud(kScratchDoubleReg, i.InputOperand(1));
      __ packusdw(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI16x8UConvertI32x4: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      // Change negative lanes to 0x7FFFFFFF
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrld(kScratchDoubleReg, kScratchDoubleReg, 1);
      __ vpminud(dst, kScratchDoubleReg, i.InputSimd128Register(0));
      __ vpminud(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1));
      __ vpackusdw(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI16x8AddSaturateU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddusw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8AddSaturateU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8SubSaturateU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubusw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8SubSaturateU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubusw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI16x8MinU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminuw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8MinU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8MaxU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxuw(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI16x8MaxU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxuw(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI16x8GtU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pmaxuw(dst, src);
      __ pcmpeqw(dst, src);
      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI16x8GtU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpmaxuw(kScratchDoubleReg, src1, src2);
      __ vpcmpeqw(dst, kScratchDoubleReg, src2);
      __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI16x8GeU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminuw(dst, src);
      __ pcmpeqw(dst, src);
      break;
    }
    case kAVXI16x8GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminuw(kScratchDoubleReg, src1, src2);
      __ vpcmpeqw(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kIA32I8x16Splat: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Movd(dst, i.InputOperand(0));
      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
      __ Pshufb(dst, kScratchDoubleReg);
      break;
    }
    case kIA32I8x16ExtractLane: {
      Register dst = i.OutputRegister();
      __ Pextrb(dst, i.InputSimd128Register(0), i.InputInt8(1));
      __ movsx_b(dst, dst);
      break;
    }
    case kSSEI8x16ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pinsrb(i.OutputSimd128Register(), i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kAVXI8x16ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpinsrb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(2), i.InputInt8(1));
      break;
    }
    case kSSEI8x16SConvertI16x8: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ packsswb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16SConvertI16x8: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpacksswb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                   i.InputOperand(1));
      break;
    }
    case kIA32I8x16Neg: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ Psignb(dst, kScratchDoubleReg);
      } else {
        __ Pxor(dst, dst);
        __ Psubb(dst, src);
      }
      break;
    }
    case kSSEI8x16Shl: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      int8_t shift = i.InputInt8(1) & 0x7;
      if (shift < 4) {
        // For small shifts, doubling is faster.
        for (int i = 0; i < shift; ++i) {
          __ paddb(dst, dst);
        }
      } else {
        // Mask off the unwanted bits before word-shifting.
        __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
        __ psrlw(kScratchDoubleReg, 8 + shift);
        __ packuswb(kScratchDoubleReg, kScratchDoubleReg);
        __ pand(dst, kScratchDoubleReg);
        __ psllw(dst, shift);
      }
      break;
    }
    case kAVXI8x16Shl: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t shift = i.InputInt8(1) & 0x7;
      if (shift < 4) {
        // For small shifts, doubling is faster.
        for (int i = 0; i < shift; ++i) {
          __ vpaddb(dst, src, src);
          src = dst;
        }
      } else {
        // Mask off the unwanted bits before word-shifting.
        __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8 + shift);
        __ vpackuswb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
        __ vpand(dst, src, kScratchDoubleReg);
        __ vpsllw(dst, dst, shift);
      }
      break;
    }
    case kIA32I8x16ShrS: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t shift = i.InputInt8(1) & 0x7;
      // Unpack the bytes into words, do arithmetic shifts, and repack.
      __ Punpckhbw(kScratchDoubleReg, src);
      __ Punpcklbw(dst, src);
      __ Psraw(kScratchDoubleReg, 8 + shift);
      __ Psraw(dst, 8 + shift);
      __ Packsswb(dst, kScratchDoubleReg);
      break;
    }
    case kSSEI8x16Add: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16Add: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI8x16AddSaturateS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddsb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16AddSaturateS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16Sub: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16Sub: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                i.InputOperand(1));
      break;
    }
    case kSSEI8x16SubSaturateS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubsb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16SubSaturateS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16Mul: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      XMMRegister right = i.InputSimd128Register(1);
      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));

      // I16x8 view of I8x16
      // left = AAaa AAaa ... AAaa AAaa
      // right= BBbb BBbb ... BBbb BBbb

      // t = 00AA 00AA ... 00AA 00AA
      // s = 00BB 00BB ... 00BB 00BB
      __ movaps(tmp, dst);
      __ movaps(kScratchDoubleReg, right);
      __ psrlw(tmp, 8);
      __ psrlw(kScratchDoubleReg, 8);
      // dst = left * 256
      __ psllw(dst, 8);

      // t = I16x8Mul(t, s)
      //    => __PP __PP ...  __PP  __PP
      __ pmullw(tmp, kScratchDoubleReg);
      // dst = I16x8Mul(left * 256, right)
      //    => pp__ pp__ ...  pp__  pp__
      __ pmullw(dst, right);

      // t = I16x8Shl(t, 8)
      //    => PP00 PP00 ...  PP00  PP00
      __ psllw(tmp, 8);

      // dst = I16x8Shr(dst, 8)
      //    => 00pp 00pp ...  00pp  00pp
      __ psrlw(dst, 8);

      // dst = I16x8Or(dst, t)
      //    => PPpp PPpp ...  PPpp  PPpp
      __ por(dst, tmp);
      break;
    }
    case kAVXI8x16Mul: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister left = i.InputSimd128Register(0);
      XMMRegister right = i.InputSimd128Register(1);
      XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));

      // I16x8 view of I8x16
      // left = AAaa AAaa ... AAaa AAaa
      // right= BBbb BBbb ... BBbb BBbb

      // t = 00AA 00AA ... 00AA 00AA
      // s = 00BB 00BB ... 00BB 00BB
      __ vpsrlw(tmp, left, 8);
      __ vpsrlw(kScratchDoubleReg, right, 8);

      // t = I16x8Mul(t0, t1)
      //    => __PP __PP ...  __PP  __PP
      __ vpmullw(tmp, tmp, kScratchDoubleReg);

      // s = left * 256
      __ vpsllw(kScratchDoubleReg, left, 8);

      // dst = I16x8Mul(left * 256, right)
      //    => pp__ pp__ ...  pp__  pp__
      __ vpmullw(dst, kScratchDoubleReg, right);

      // dst = I16x8Shr(dst, 8)
      //    => 00pp 00pp ...  00pp  00pp
      __ vpsrlw(dst, dst, 8);

      // t = I16x8Shl(t, 8)
      //    => PP00 PP00 ...  PP00  PP00
      __ vpsllw(tmp, tmp, 8);

      // dst = I16x8Or(dst, t)
      //    => PPpp PPpp ...  PPpp  PPpp
      __ vpor(dst, dst, tmp);
      break;
    }
    case kSSEI8x16MinS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pminsb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16MinS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16MaxS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      __ pmaxsb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16MaxS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxsb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16Eq: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16Eq: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI8x16Ne: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
      __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
      break;
    }
    case kAVXI8x16Ne: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(i.OutputSimd128Register(), i.OutputSimd128Register(),
               kScratchDoubleReg);
      break;
    }
    case kSSEI8x16GtS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pcmpgtb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16GtS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpgtb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI8x16GeS: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminsb(dst, src);
      __ pcmpeqb(dst, src);
      break;
    }
    case kAVXI8x16GeS: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminsb(kScratchDoubleReg, src1, src2);
      __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kSSEI8x16UConvertI16x8: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      // Change negative lanes to 0x7FFF
      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
      __ psrlw(kScratchDoubleReg, 1);
      __ pminuw(dst, kScratchDoubleReg);
      __ pminuw(kScratchDoubleReg, i.InputOperand(1));
      __ packuswb(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI8x16UConvertI16x8: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      // Change negative lanes to 0x7FFF
      __ vpcmpeqw(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 1);
      __ vpminuw(dst, kScratchDoubleReg, i.InputSimd128Register(0));
      __ vpminuw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1));
      __ vpackuswb(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI8x16AddSaturateU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ paddusb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16AddSaturateU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpaddusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kSSEI8x16SubSaturateU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ psubusb(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16SubSaturateU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpsubusb(i.OutputSimd128Register(), i.InputSimd128Register(0),
                  i.InputOperand(1));
      break;
    }
    case kIA32I8x16ShrU: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t shift = i.InputInt8(1) & 0x7;
      // Unpack the bytes into words, do logical shifts, and repack.
      __ Punpckhbw(kScratchDoubleReg, src);
      __ Punpcklbw(dst, src);
      __ Psrlw(kScratchDoubleReg, 8 + shift);
      __ Psrlw(dst, 8 + shift);
      __ Packuswb(dst, kScratchDoubleReg);
      break;
    }
    case kSSEI8x16MinU: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ pminub(dst, i.InputOperand(1));
      break;
    }
    case kAVXI8x16MinU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpminub(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16MaxU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pmaxub(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXI8x16MaxU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpmaxub(i.OutputSimd128Register(), i.InputSimd128Register(0),
                 i.InputOperand(1));
      break;
    }
    case kSSEI8x16GtU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pmaxub(dst, src);
      __ pcmpeqb(dst, src);
      __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
      __ pxor(dst, kScratchDoubleReg);
      break;
    }
    case kAVXI8x16GtU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpmaxub(kScratchDoubleReg, src1, src2);
      __ vpcmpeqb(dst, kScratchDoubleReg, src2);
      __ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSEI8x16GeU: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(1);
      __ pminub(dst, src);
      __ pcmpeqb(dst, src);
      break;
    }
    case kAVXI8x16GeU: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister src1 = i.InputSimd128Register(0);
      Operand src2 = i.InputOperand(1);
      __ vpminub(kScratchDoubleReg, src1, src2);
      __ vpcmpeqb(i.OutputSimd128Register(), kScratchDoubleReg, src2);
      break;
    }
    case kIA32S128Zero: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Pxor(dst, dst);
      break;
    }
    case kSSES128Not: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      if (src.is_reg(dst)) {
        __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
        __ pxor(dst, kScratchDoubleReg);
      } else {
        __ pcmpeqd(dst, dst);
        __ pxor(dst, src);
      }
      break;
    }
    case kAVXS128Not: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      __ vpxor(i.OutputSimd128Register(), kScratchDoubleReg, i.InputOperand(0));
      break;
    }
    case kSSES128And: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pand(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXS128And: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpand(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputOperand(1));
      break;
    }
    case kSSES128Or: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ por(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXS128Or: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpor(i.OutputSimd128Register(), i.InputSimd128Register(0),
              i.InputOperand(1));
      break;
    }
    case kSSES128Xor: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      __ pxor(i.OutputSimd128Register(), i.InputOperand(1));
      break;
    }
    case kAVXS128Xor: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      __ vpxor(i.OutputSimd128Register(), i.InputSimd128Register(0),
               i.InputOperand(1));
      break;
    }
    case kSSES128Select: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      // Mask used here is stored in dst.
      XMMRegister dst = i.OutputSimd128Register();
      __ movaps(kScratchDoubleReg, i.InputSimd128Register(1));
      __ xorps(kScratchDoubleReg, i.InputSimd128Register(2));
      __ andps(dst, kScratchDoubleReg);
      __ xorps(dst, i.InputSimd128Register(2));
      break;
    }
    case kAVXS128Select: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      __ vxorps(kScratchDoubleReg, i.InputSimd128Register(2),
                i.InputOperand(1));
      __ vandps(dst, kScratchDoubleReg, i.InputOperand(0));
      __ vxorps(dst, dst, i.InputSimd128Register(2));
      break;
    }
    case kIA32S8x16Shuffle: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src0 = i.InputOperand(0);
      Register tmp = i.TempRegister(0);
      // Prepare 16 byte aligned buffer for shuffle control mask
      __ mov(tmp, esp);
      __ and_(esp, -16);
      if (instr->InputCount() == 5) {  // only one input operand
        DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
        for (int j = 4; j > 0; j--) {
          uint32_t mask = i.InputUint32(j);
          __ push(Immediate(mask));
        }
        __ Pshufb(dst, Operand(esp, 0));
      } else {  // two input operands
        DCHECK_EQ(6, instr->InputCount());
        __ movups(kScratchDoubleReg, src0);
        for (int j = 5; j > 1; j--) {
          uint32_t lanes = i.InputUint32(j);
          uint32_t mask = 0;
          for (int k = 0; k < 32; k += 8) {
            uint8_t lane = lanes >> k;
            mask |= (lane < kSimd128Size ? lane : 0x80) << k;
          }
          __ push(Immediate(mask));
        }
        __ Pshufb(kScratchDoubleReg, Operand(esp, 0));
        Operand src1 = i.InputOperand(1);
        if (!src1.is_reg(dst)) __ movups(dst, src1);
        for (int j = 5; j > 1; j--) {
          uint32_t lanes = i.InputUint32(j);
          uint32_t mask = 0;
          for (int k = 0; k < 32; k += 8) {
            uint8_t lane = lanes >> k;
            mask |= (lane >= kSimd128Size ? (lane & 0xF) : 0x80) << k;
          }
          __ push(Immediate(mask));
        }
        __ Pshufb(dst, Operand(esp, 0));
        __ por(dst, kScratchDoubleReg);
      }
      __ mov(esp, tmp);
      break;
    }
    case kIA32S32x4Swizzle: {
      DCHECK_EQ(2, instr->InputCount());
      __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
      break;
    }
    case kIA32S32x4Shuffle: {
      DCHECK_EQ(4, instr->InputCount());  // Swizzles should be handled above.
      int8_t shuffle = i.InputInt8(2);
      DCHECK_NE(0xe4, shuffle);  // A simple blend should be handled below.
      __ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
      __ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
      __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
      break;
    }
    case kIA32S16x8Blend:
      ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
      break;
    case kIA32S16x8HalfShuffle1: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
      __ Pshufhw(dst, dst, i.InputInt8(2));
      break;
    }
    case kIA32S16x8HalfShuffle2: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
      __ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
      __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
      __ Pshufhw(dst, dst, i.InputInt8(3));
      __ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
      break;
    }
    case kIA32S8x16Alignr:
      ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
      break;
    case kIA32S16x8Dup: {
      XMMRegister dst = i.OutputSimd128Register();
      Operand src = i.InputOperand(0);
      int8_t lane = i.InputInt8(1) & 0x7;
      int8_t lane4 = lane & 0x3;
      int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
      if (lane < 4) {
        __ Pshuflw(dst, src, half_dup);
        __ Pshufd(dst, dst, 0);
      } else {
        __ Pshufhw(dst, src, half_dup);
        __ Pshufd(dst, dst, 0xaa);
      }
      break;
    }
    case kIA32S8x16Dup: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1) & 0xf;
      if (CpuFeatures::IsSupported(AVX)) {
        CpuFeatureScope avx_scope(tasm(), AVX);
        if (lane < 8) {
          __ vpunpcklbw(dst, src, src);
        } else {
          __ vpunpckhbw(dst, src, src);
        }
      } else {
        DCHECK_EQ(dst, src);
        if (lane < 8) {
          __ punpcklbw(dst, dst);
        } else {
          __ punpckhbw(dst, dst);
        }
      }
      lane &= 0x7;
      int8_t lane4 = lane & 0x3;
      int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
      if (lane < 4) {
        __ Pshuflw(dst, dst, half_dup);
        __ Pshufd(dst, dst, 0);
      } else {
        __ Pshufhw(dst, dst, half_dup);
        __ Pshufd(dst, dst, 0xaa);
      }
      break;
    }
    case kIA32S64x2UnpackHigh:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
      break;
    case kIA32S32x4UnpackHigh:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
      break;
    case kIA32S16x8UnpackHigh:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
      break;
    case kIA32S8x16UnpackHigh:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
      break;
    case kIA32S64x2UnpackLow:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
      break;
    case kIA32S32x4UnpackLow:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
      break;
    case kIA32S16x8UnpackLow:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
      break;
    case kIA32S8x16UnpackLow:
      ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
      break;
    case kSSES16x8UnzipHigh: {
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (instr->InputCount() == 2) {
        __ movups(kScratchDoubleReg, i.InputOperand(1));
        __ psrld(kScratchDoubleReg, 16);
        src2 = kScratchDoubleReg;
      }
      __ psrld(dst, 16);
      __ packusdw(dst, src2);
      break;
    }
    case kAVXS16x8UnzipHigh: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      if (instr->InputCount() == 2) {
        __ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
        src2 = kScratchDoubleReg;
      }
      __ vpsrld(dst, i.InputSimd128Register(0), 16);
      __ vpackusdw(dst, dst, src2);
      break;
    }
    case kSSES16x8UnzipLow: {
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
      if (instr->InputCount() == 2) {
        __ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
        src2 = kScratchDoubleReg;
      }
      __ pblendw(dst, kScratchDoubleReg, 0xaa);
      __ packusdw(dst, src2);
      break;
    }
    case kAVXS16x8UnzipLow: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      __ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
      if (instr->InputCount() == 2) {
        __ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
                    0x55);
        src2 = kScratchDoubleReg;
      }
      __ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
      __ vpackusdw(dst, dst, src2);
      break;
    }
    case kSSES8x16UnzipHigh: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (instr->InputCount() == 2) {
        __ movups(kScratchDoubleReg, i.InputOperand(1));
        __ psrlw(kScratchDoubleReg, 8);
        src2 = kScratchDoubleReg;
      }
      __ psrlw(dst, 8);
      __ packuswb(dst, src2);
      break;
    }
    case kAVXS8x16UnzipHigh: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      if (instr->InputCount() == 2) {
        __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
        src2 = kScratchDoubleReg;
      }
      __ vpsrlw(dst, i.InputSimd128Register(0), 8);
      __ vpackuswb(dst, dst, src2);
      break;
    }
    case kSSES8x16UnzipLow: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (instr->InputCount() == 2) {
        __ movups(kScratchDoubleReg, i.InputOperand(1));
        __ psllw(kScratchDoubleReg, 8);
        __ psrlw(kScratchDoubleReg, 8);
        src2 = kScratchDoubleReg;
      }
      __ psllw(dst, 8);
      __ psrlw(dst, 8);
      __ packuswb(dst, src2);
      break;
    }
    case kAVXS8x16UnzipLow: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src2 = dst;
      if (instr->InputCount() == 2) {
        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
        __ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
        src2 = kScratchDoubleReg;
      }
      __ vpsllw(dst, i.InputSimd128Register(0), 8);
      __ vpsrlw(dst, dst, 8);
      __ vpackuswb(dst, dst, src2);
      break;
    }
    case kSSES8x16TransposeLow: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ psllw(dst, 8);
      if (instr->InputCount() == 1) {
        __ movups(kScratchDoubleReg, dst);
      } else {
        DCHECK_EQ(2, instr->InputCount());
        __ movups(kScratchDoubleReg, i.InputOperand(1));
        __ psllw(kScratchDoubleReg, 8);
      }
      __ psrlw(dst, 8);
      __ por(dst, kScratchDoubleReg);
      break;
    }
    case kAVXS8x16TransposeLow: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      if (instr->InputCount() == 1) {
        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
        __ vpsrlw(dst, kScratchDoubleReg, 8);
      } else {
        DCHECK_EQ(2, instr->InputCount());
        __ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
        __ vpsllw(dst, i.InputSimd128Register(0), 8);
        __ vpsrlw(dst, dst, 8);
      }
      __ vpor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSES8x16TransposeHigh: {
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      __ psrlw(dst, 8);
      if (instr->InputCount() == 1) {
        __ movups(kScratchDoubleReg, dst);
      } else {
        DCHECK_EQ(2, instr->InputCount());
        __ movups(kScratchDoubleReg, i.InputOperand(1));
        __ psrlw(kScratchDoubleReg, 8);
      }
      __ psllw(kScratchDoubleReg, 8);
      __ por(dst, kScratchDoubleReg);
      break;
    }
    case kAVXS8x16TransposeHigh: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      if (instr->InputCount() == 1) {
        __ vpsrlw(dst, i.InputSimd128Register(0), 8);
        __ vpsllw(kScratchDoubleReg, dst, 8);
      } else {
        DCHECK_EQ(2, instr->InputCount());
        __ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
        __ vpsrlw(dst, i.InputSimd128Register(0), 8);
        __ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
      }
      __ vpor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kSSES8x8Reverse:
    case kSSES8x4Reverse:
    case kSSES8x2Reverse: {
      DCHECK_EQ(1, instr->InputCount());
      XMMRegister dst = i.OutputSimd128Register();
      DCHECK_EQ(dst, i.InputSimd128Register(0));
      if (arch_opcode != kSSES8x2Reverse) {
        // First shuffle words into position.
        int8_t shuffle_mask = arch_opcode == kSSES8x4Reverse ? 0xB1 : 0x1B;
        __ pshuflw(dst, dst, shuffle_mask);
        __ pshufhw(dst, dst, shuffle_mask);
      }
      __ movaps(kScratchDoubleReg, dst);
      __ psrlw(kScratchDoubleReg, 8);
      __ psllw(dst, 8);
      __ por(dst, kScratchDoubleReg);
      break;
    }
    case kAVXS8x2Reverse:
    case kAVXS8x4Reverse:
    case kAVXS8x8Reverse: {
      DCHECK_EQ(1, instr->InputCount());
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = dst;
      if (arch_opcode != kAVXS8x2Reverse) {
        // First shuffle words into position.
        int8_t shuffle_mask = arch_opcode == kAVXS8x4Reverse ? 0xB1 : 0x1B;
        __ vpshuflw(dst, i.InputOperand(0), shuffle_mask);
        __ vpshufhw(dst, dst, shuffle_mask);
      } else {
        src = i.InputSimd128Register(0);
      }
      // Reverse each 16 bit lane.
      __ vpsrlw(kScratchDoubleReg, src, 8);
      __ vpsllw(dst, src, 8);
      __ vpor(dst, dst, kScratchDoubleReg);
      break;
    }
    case kIA32S1x4AnyTrue:
    case kIA32S1x8AnyTrue:
    case kIA32S1x16AnyTrue: {
      Register dst = i.OutputRegister();
      XMMRegister src = i.InputSimd128Register(0);
      Register tmp = i.TempRegister(0);
      __ xor_(tmp, tmp);
      __ mov(dst, Immediate(-1));
      __ Ptest(src, src);
      __ cmov(zero, dst, tmp);
      break;
    }
    case kIA32S1x4AllTrue:
    case kIA32S1x8AllTrue:
    case kIA32S1x16AllTrue: {
      Register dst = i.OutputRegister();
      Operand src = i.InputOperand(0);
      Register tmp = i.TempRegister(0);
      __ mov(tmp, Immediate(-1));
      __ xor_(dst, dst);
      // Compare all src lanes to false.
      __ Pxor(kScratchDoubleReg, kScratchDoubleReg);
      if (arch_opcode == kIA32S1x4AllTrue) {
        __ Pcmpeqd(kScratchDoubleReg, src);
      } else if (arch_opcode == kIA32S1x8AllTrue) {
        __ Pcmpeqw(kScratchDoubleReg, src);
      } else {
        __ Pcmpeqb(kScratchDoubleReg, src);
      }
      // If kScratchDoubleReg is all zero, none of src lanes are false.
      __ Ptest(kScratchDoubleReg, kScratchDoubleReg);
      __ cmov(zero, dst, tmp);
      break;
    }
    case kIA32StackCheck: {
      ExternalReference const stack_limit =
          ExternalReference::address_of_stack_limit(__ isolate());
      __ VerifyRootRegister();
      __ cmp(esp, tasm()->StaticVariable(stack_limit));
      break;
    }
    case kIA32Word32AtomicPairLoad: {
      XMMRegister tmp = i.ToDoubleRegister(instr->TempAt(0));
      __ movq(tmp, i.MemoryOperand());
      __ Pextrd(i.OutputRegister(0), tmp, 0);
      __ Pextrd(i.OutputRegister(1), tmp, 1);
      break;
    }
    case kIA32Word32AtomicPairStore: {
      __ mov(i.TempRegister(0), i.MemoryOperand(2));
      __ mov(i.TempRegister(1), i.NextMemoryOperand(2));
      __ lock();
      __ cmpxchg8b(i.MemoryOperand(2));
      break;
    }
    case kWord32AtomicExchangeInt8: {
      __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
      __ movsx_b(i.InputRegister(0), i.InputRegister(0));
      break;
    }
    case kWord32AtomicExchangeUint8: {
      __ xchg_b(i.InputRegister(0), i.MemoryOperand(1));
      __ movzx_b(i.InputRegister(0), i.InputRegister(0));
      break;
    }
    case kWord32AtomicExchangeInt16: {
      __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
      __ movsx_w(i.InputRegister(0), i.InputRegister(0));
      break;
    }
    case kWord32AtomicExchangeUint16: {
      __ xchg_w(i.InputRegister(0), i.MemoryOperand(1));
      __ movzx_w(i.InputRegister(0), i.InputRegister(0));
      break;
    }
    case kWord32AtomicExchangeWord32: {
      __ xchg(i.InputRegister(0), i.MemoryOperand(1));
      break;
    }
    // For the narrow Word64 operations below, i.OutputRegister(1) contains
    // the high-order 32 bits for the 64bit operation. As the data exchange
    // fits in one register, the i.OutputRegister(1) needs to be cleared for
    // the correct return value to be propagated back.
    case kIA32Word64AtomicNarrowExchangeUint8: {
      __ xchg_b(i.OutputRegister(0), i.MemoryOperand(1));
      __ movzx_b(i.OutputRegister(0), i.OutputRegister(0));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word64AtomicNarrowExchangeUint16: {
      __ xchg_w(i.OutputRegister(0), i.MemoryOperand(1));
      __ movzx_w(i.OutputRegister(0), i.OutputRegister(0));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word64AtomicNarrowExchangeUint32: {
      __ xchg(i.OutputRegister(0), i.MemoryOperand(1));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word32AtomicPairExchange: {
      __ mov(i.OutputRegister(0), i.MemoryOperand(2));
      __ mov(i.OutputRegister(1), i.NextMemoryOperand(2));
      __ lock();
      __ cmpxchg8b(i.MemoryOperand(2));
      break;
    }
    case kWord32AtomicCompareExchangeInt8: {
      __ lock();
      __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
      __ movsx_b(eax, eax);
      break;
    }
    case kWord32AtomicCompareExchangeUint8: {
      __ lock();
      __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
      __ movzx_b(eax, eax);
      break;
    }
    case kWord32AtomicCompareExchangeInt16: {
      __ lock();
      __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
      __ movsx_w(eax, eax);
      break;
    }
    case kWord32AtomicCompareExchangeUint16: {
      __ lock();
      __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
      __ movzx_w(eax, eax);
      break;
    }
    case kWord32AtomicCompareExchangeWord32: {
      __ lock();
      __ cmpxchg(i.MemoryOperand(2), i.InputRegister(1));
      break;
    }
    case kIA32Word64AtomicNarrowCompareExchangeUint8: {
      __ lock();
      __ cmpxchg_b(i.MemoryOperand(2), i.InputRegister(1));
      __ movzx_b(i.OutputRegister(0), i.OutputRegister(0));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word64AtomicNarrowCompareExchangeUint16: {
      __ lock();
      __ cmpxchg_w(i.MemoryOperand(2), i.InputRegister(1));
      __ movzx_w(i.OutputRegister(0), i.OutputRegister(0));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word64AtomicNarrowCompareExchangeUint32: {
      __ lock();
      __ cmpxchg(i.MemoryOperand(2), i.InputRegister(1));
      __ xor_(i.OutputRegister(1), i.OutputRegister(1));
      break;
    }
    case kIA32Word32AtomicPairCompareExchange: {
      __ lock();
      __ cmpxchg8b(i.MemoryOperand(4));
      break;
    }
#define ATOMIC_BINOP_CASE(op, inst)                       \
  case kWord32Atomic##op##Int8: {                         \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b);        \
    __ movsx_b(eax, eax);                                 \
    break;                                                \
  }                                                       \
  case kIA32Word64AtomicNarrow##op##Uint8: {              \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b);        \
    __ movzx_b(i.OutputRegister(0), i.OutputRegister(0)); \
    __ xor_(i.OutputRegister(1), i.OutputRegister(1));    \
    break;                                                \
  }                                                       \
  case kWord32Atomic##op##Uint8: {                        \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_b, cmpxchg_b);        \
    __ movzx_b(eax, eax);                                 \
    break;                                                \
  }                                                       \
  case kWord32Atomic##op##Int16: {                        \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w);        \
    __ movsx_w(eax, eax);                                 \
    break;                                                \
  }                                                       \
  case kIA32Word64AtomicNarrow##op##Uint16: {             \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w);        \
    __ movzx_w(i.OutputRegister(0), i.OutputRegister(0)); \
    __ xor_(i.OutputRegister(1), i.OutputRegister(1));    \
    break;                                                \
  }                                                       \
  case kWord32Atomic##op##Uint16: {                       \
    ASSEMBLE_ATOMIC_BINOP(inst, mov_w, cmpxchg_w);        \
    __ movzx_w(eax, eax);                                 \
    break;                                                \
  }                                                       \
  case kIA32Word64AtomicNarrow##op##Uint32: {             \
    ASSEMBLE_ATOMIC_BINOP(inst, mov, cmpxchg);            \
    __ xor_(i.OutputRegister(1), i.OutputRegister(1));    \
    break;                                                \
  }                                                       \
  case kWord32Atomic##op##Word32: {                       \
    ASSEMBLE_ATOMIC_BINOP(inst, mov, cmpxchg);            \
    break;                                                \
  }
      ATOMIC_BINOP_CASE(Add, add)
      ATOMIC_BINOP_CASE(Sub, sub)
      ATOMIC_BINOP_CASE(And, and_)
      ATOMIC_BINOP_CASE(Or, or_)
      ATOMIC_BINOP_CASE(Xor, xor_)
#undef ATOMIC_BINOP_CASE
#define ATOMIC_BINOP_CASE(op, instr1, instr2) \
  case kIA32Word32AtomicPair##op: {           \
    ASSEMBLE_I64ATOMIC_BINOP(instr1, instr2)  \
    break;                                    \
  }
      ATOMIC_BINOP_CASE(Add, add, adc)
      ATOMIC_BINOP_CASE(And, and_, and_)
      ATOMIC_BINOP_CASE(Or, or_, or_)
      ATOMIC_BINOP_CASE(Xor, xor_, xor_)
#undef ATOMIC_BINOP_CASE
    case kIA32Word32AtomicPairSub: {
      Label binop;
      __ bind(&binop);
      // Move memory operand into edx:eax
      __ mov(i.OutputRegister(0), i.MemoryOperand(2));
      __ mov(i.OutputRegister(1), i.NextMemoryOperand(2));
      // Save input registers temporarily on the stack.
      __ push(i.InputRegister(0));
      __ push(i.InputRegister(1));
      // Negate input in place
      __ neg(i.InputRegister(0));
      __ adc(i.InputRegister(1), 0);
      __ neg(i.InputRegister(1));
      // Add memory operand, negated input.
      __ add(i.InputRegister(0), i.OutputRegister(0));
      __ adc(i.InputRegister(1), i.OutputRegister(1));
      __ lock();
      __ cmpxchg8b(i.MemoryOperand(2));
      // Restore input registers
      __ pop(i.InputRegister(1));
      __ pop(i.InputRegister(0));
      __ j(not_equal, &binop);
      break;
    }
    case kWord32AtomicLoadInt8:
    case kWord32AtomicLoadUint8:
    case kWord32AtomicLoadInt16:
    case kWord32AtomicLoadUint16:
    case kWord32AtomicLoadWord32:
    case kWord32AtomicStoreWord8:
    case kWord32AtomicStoreWord16:
    case kWord32AtomicStoreWord32:
      UNREACHABLE();  // Won't be generated by instruction selector.
      break;
  }
  return kSuccess;
}  // NOLINT(readability/fn_size)

static Condition FlagsConditionToCondition(FlagsCondition condition) {
  switch (condition) {
    case kUnorderedEqual:
    case kEqual:
      return equal;
      break;
    case kUnorderedNotEqual:
    case kNotEqual:
      return not_equal;
      break;
    case kSignedLessThan:
      return less;
      break;
    case kSignedGreaterThanOrEqual:
      return greater_equal;
      break;
    case kSignedLessThanOrEqual:
      return less_equal;
      break;
    case kSignedGreaterThan:
      return greater;
      break;
    case kUnsignedLessThan:
      return below;
      break;
    case kUnsignedGreaterThanOrEqual:
      return above_equal;
      break;
    case kUnsignedLessThanOrEqual:
      return below_equal;
      break;
    case kUnsignedGreaterThan:
      return above;
      break;
    case kOverflow:
      return overflow;
      break;
    case kNotOverflow:
      return no_overflow;
      break;
    default:
      UNREACHABLE();
      break;
  }
}

// Assembles a branch after an instruction.
void CodeGenerator::AssembleArchBranch(Instruction* instr, BranchInfo* branch) {
  Label::Distance flabel_distance =
      branch->fallthru ? Label::kNear : Label::kFar;
  Label* tlabel = branch->true_label;
  Label* flabel = branch->false_label;
  if (branch->condition == kUnorderedEqual) {
    __ j(parity_even, flabel, flabel_distance);
  } else if (branch->condition == kUnorderedNotEqual) {
    __ j(parity_even, tlabel);
  }
  __ j(FlagsConditionToCondition(branch->condition), tlabel);

  // Add a jump if not falling through to the next block.
  if (!branch->fallthru) __ jmp(flabel);
}

void CodeGenerator::AssembleBranchPoisoning(FlagsCondition condition,
                                            Instruction* instr) {
  // TODO(jarin) Handle float comparisons (kUnordered[Not]Equal).
  if (condition == kUnorderedEqual || condition == kUnorderedNotEqual) {
    return;
  }

  condition = NegateFlagsCondition(condition);
  __ setcc(FlagsConditionToCondition(condition), kSpeculationPoisonRegister);
  __ add(kSpeculationPoisonRegister, Immediate(255));
  __ sar(kSpeculationPoisonRegister, 31u);
}

void CodeGenerator::AssembleArchDeoptBranch(Instruction* instr,
                                            BranchInfo* branch) {
  AssembleArchBranch(instr, branch);
}

void CodeGenerator::AssembleArchJump(RpoNumber target) {
  if (!IsNextInAssemblyOrder(target)) __ jmp(GetLabel(target));
}

void CodeGenerator::AssembleArchTrap(Instruction* instr,
                                     FlagsCondition condition) {
  class OutOfLineTrap final : public OutOfLineCode {
   public:
    OutOfLineTrap(CodeGenerator* gen, Instruction* instr)
        : OutOfLineCode(gen), instr_(instr), gen_(gen) {}

    void Generate() final {
      IA32OperandConverter i(gen_, instr_);
      TrapId trap_id =
          static_cast<TrapId>(i.InputInt32(instr_->InputCount() - 1));
      GenerateCallToTrap(trap_id);
    }

   private:
    void GenerateCallToTrap(TrapId trap_id) {
      if (trap_id == TrapId::kInvalid) {
        // We cannot test calls to the runtime in cctest/test-run-wasm.
        // Therefore we emit a call to C here instead of a call to the runtime.
        __ PrepareCallCFunction(0, esi);
        __ CallCFunction(
            ExternalReference::wasm_call_trap_callback_for_testing(), 0);
        __ LeaveFrame(StackFrame::WASM_COMPILED);
        auto call_descriptor = gen_->linkage()->GetIncomingDescriptor();
        size_t pop_size = call_descriptor->StackParameterCount() * kPointerSize;
        // Use ecx as a scratch register, we return anyways immediately.
        __ Ret(static_cast<int>(pop_size), ecx);
      } else {
        gen_->AssembleSourcePosition(instr_);
        // A direct call to a wasm runtime stub defined in this module.
        // Just encode the stub index. This will be patched at relocation.
        __ wasm_call(static_cast<Address>(trap_id), RelocInfo::WASM_STUB_CALL);
        ReferenceMap* reference_map =
            new (gen_->zone()) ReferenceMap(gen_->zone());
        gen_->RecordSafepoint(reference_map, Safepoint::kSimple, 0,
                              Safepoint::kNoLazyDeopt);
        __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
      }
    }

    Instruction* instr_;
    CodeGenerator* gen_;
  };
  auto ool = new (zone()) OutOfLineTrap(this, instr);
  Label* tlabel = ool->entry();
  Label end;
  if (condition == kUnorderedEqual) {
    __ j(parity_even, &end);
  } else if (condition == kUnorderedNotEqual) {
    __ j(parity_even, tlabel);
  }
  __ j(FlagsConditionToCondition(condition), tlabel);
  __ bind(&end);
}

// Assembles boolean materializations after an instruction.
void CodeGenerator::AssembleArchBoolean(Instruction* instr,
                                        FlagsCondition condition) {
  IA32OperandConverter i(this, instr);
  Label done;

  // Materialize a full 32-bit 1 or 0 value. The result register is always the
  // last output of the instruction.
  Label check;
  DCHECK_NE(0u, instr->OutputCount());
  Register reg = i.OutputRegister(instr->OutputCount() - 1);
  if (condition == kUnorderedEqual) {
    __ j(parity_odd, &check, Label::kNear);
    __ Move(reg, Immediate(0));
    __ jmp(&done, Label::kNear);
  } else if (condition == kUnorderedNotEqual) {
    __ j(parity_odd, &check, Label::kNear);
    __ mov(reg, Immediate(1));
    __ jmp(&done, Label::kNear);
  }
  Condition cc = FlagsConditionToCondition(condition);

  __ bind(&check);
  if (reg.is_byte_register()) {
    // setcc for byte registers (al, bl, cl, dl).
    __ setcc(cc, reg);
    __ movzx_b(reg, reg);
  } else {
    // Emit a branch to set a register to either 1 or 0.
    Label set;
    __ j(cc, &set, Label::kNear);
    __ Move(reg, Immediate(0));
    __ jmp(&done, Label::kNear);
    __ bind(&set);
    __ mov(reg, Immediate(1));
  }
  __ bind(&done);
}

void CodeGenerator::AssembleArchBinarySearchSwitch(Instruction* instr) {
  IA32OperandConverter i(this, instr);
  Register input = i.InputRegister(0);
  std::vector<std::pair<int32_t, Label*>> cases;
  for (size_t index = 2; index < instr->InputCount(); index += 2) {
    cases.push_back({i.InputInt32(index + 0), GetLabel(i.InputRpo(index + 1))});
  }
  AssembleArchBinarySearchSwitchRange(input, i.InputRpo(1), cases.data(),
                                      cases.data() + cases.size());
}

void CodeGenerator::AssembleArchLookupSwitch(Instruction* instr) {
  IA32OperandConverter i(this, instr);
  Register input = i.InputRegister(0);
  for (size_t index = 2; index < instr->InputCount(); index += 2) {
    __ cmp(input, Immediate(i.InputInt32(index + 0)));
    __ j(equal, GetLabel(i.InputRpo(index + 1)));
  }
  AssembleArchJump(i.InputRpo(1));
}


void CodeGenerator::AssembleArchTableSwitch(Instruction* instr) {
  IA32OperandConverter i(this, instr);
  Register input = i.InputRegister(0);
  size_t const case_count = instr->InputCount() - 2;
  Label** cases = zone()->NewArray<Label*>(case_count);
  for (size_t index = 0; index < case_count; ++index) {
    cases[index] = GetLabel(i.InputRpo(index + 2));
  }
  Label* const table = AddJumpTable(cases, case_count);
  __ cmp(input, Immediate(case_count));
  __ j(above_equal, GetLabel(i.InputRpo(1)));
  __ jmp(Operand::JumpTable(input, times_4, table));
}


// The calling convention for JSFunctions on IA32 passes arguments on the
// stack and the JSFunction and context in EDI and ESI, respectively, thus
// the steps of the call look as follows:

// --{ before the call instruction }--------------------------------------------
//                                                         |  caller frame |
//                                                         ^ esp           ^ ebp

// --{ push arguments and setup ESI, EDI }--------------------------------------
//                                       | args + receiver |  caller frame |
//                                       ^ esp                             ^ ebp
//                 [edi = JSFunction, esi = context]

// --{ call [edi + kCodeEntryOffset] }------------------------------------------
//                                 | RET | args + receiver |  caller frame |
//                                 ^ esp                                   ^ ebp

// =={ prologue of called function }============================================
// --{ push ebp }---------------------------------------------------------------
//                            | FP | RET | args + receiver |  caller frame |
//                            ^ esp                                        ^ ebp

// --{ mov ebp, esp }-----------------------------------------------------------
//                            | FP | RET | args + receiver |  caller frame |
//                            ^ ebp,esp

// --{ push esi }---------------------------------------------------------------
//                      | CTX | FP | RET | args + receiver |  caller frame |
//                      ^esp  ^ ebp

// --{ push edi }---------------------------------------------------------------
//                | FNC | CTX | FP | RET | args + receiver |  caller frame |
//                ^esp        ^ ebp

// --{ subi esp, #N }-----------------------------------------------------------
// | callee frame | FNC | CTX | FP | RET | args + receiver |  caller frame |
// ^esp                       ^ ebp

// =={ body of called function }================================================

// =={ epilogue of called function }============================================
// --{ mov esp, ebp }-----------------------------------------------------------
//                            | FP | RET | args + receiver |  caller frame |
//                            ^ esp,ebp

// --{ pop ebp }-----------------------------------------------------------
// |                               | RET | args + receiver |  caller frame |
//                                 ^ esp                                   ^ ebp

// --{ ret #A+1 }-----------------------------------------------------------
// |                                                       |  caller frame |
//                                                         ^ esp           ^ ebp

// Runtime function calls are accomplished by doing a stub call to the
// CEntry (a real code object). On IA32 passes arguments on the
// stack, the number of arguments in EAX, the address of the runtime function
// in EBX, and the context in ESI.

// --{ before the call instruction }--------------------------------------------
//                                                         |  caller frame |
//                                                         ^ esp           ^ ebp

// --{ push arguments and setup EAX, EBX, and ESI }-----------------------------
//                                       | args + receiver |  caller frame |
//                                       ^ esp                             ^ ebp
//              [eax = #args, ebx = runtime function, esi = context]

// --{ call #CEntry }-----------------------------------------------------------
//                                 | RET | args + receiver |  caller frame |
//                                 ^ esp                                   ^ ebp

// =={ body of runtime function }===============================================

// --{ runtime returns }--------------------------------------------------------
//                                                         |  caller frame |
//                                                         ^ esp           ^ ebp

// Other custom linkages (e.g. for calling directly into and out of C++) may
// need to save callee-saved registers on the stack, which is done in the
// function prologue of generated code.

// --{ before the call instruction }--------------------------------------------
//                                                         |  caller frame |
//                                                         ^ esp           ^ ebp

// --{ set up arguments in registers on stack }---------------------------------
//                                                  | args |  caller frame |
//                                                  ^ esp                  ^ ebp
//                  [r0 = arg0, r1 = arg1, ...]

// --{ call code }--------------------------------------------------------------
//                                            | RET | args |  caller frame |
//                                            ^ esp                        ^ ebp

// =={ prologue of called function }============================================
// --{ push ebp }---------------------------------------------------------------
//                                       | FP | RET | args |  caller frame |
//                                       ^ esp                             ^ ebp

// --{ mov ebp, esp }-----------------------------------------------------------
//                                       | FP | RET | args |  caller frame |
//                                       ^ ebp,esp

// --{ save registers }---------------------------------------------------------
//                                | regs | FP | RET | args |  caller frame |
//                                ^ esp  ^ ebp

// --{ subi esp, #N }-----------------------------------------------------------
//                 | callee frame | regs | FP | RET | args |  caller frame |
//                 ^esp                  ^ ebp

// =={ body of called function }================================================

// =={ epilogue of called function }============================================
// --{ restore registers }------------------------------------------------------
//                                | regs | FP | RET | args |  caller frame |
//                                ^ esp  ^ ebp

// --{ mov esp, ebp }-----------------------------------------------------------
//                                       | FP | RET | args |  caller frame |
//                                       ^ esp,ebp

// --{ pop ebp }----------------------------------------------------------------
//                                            | RET | args |  caller frame |
//                                            ^ esp                        ^ ebp

void CodeGenerator::FinishFrame(Frame* frame) {
  auto call_descriptor = linkage()->GetIncomingDescriptor();
  const RegList saves = call_descriptor->CalleeSavedRegisters();
  if (saves != 0) {  // Save callee-saved registers.
    DCHECK(!info()->is_osr());
    int pushed = 0;
    for (int i = Register::kNumRegisters - 1; i >= 0; i--) {
      if (!((1 << i) & saves)) continue;
      ++pushed;
    }
    frame->AllocateSavedCalleeRegisterSlots(pushed);
  }
}

void CodeGenerator::AssembleConstructFrame() {
  auto call_descriptor = linkage()->GetIncomingDescriptor();
  if (frame_access_state()->has_frame()) {
    if (call_descriptor->IsCFunctionCall()) {
      __ push(ebp);
      __ mov(ebp, esp);
    } else if (call_descriptor->IsJSFunctionCall()) {
      __ Prologue();
      if (call_descriptor->PushArgumentCount()) {
        __ push(kJavaScriptCallArgCountRegister);
      }
    } else {
      __ StubPrologue(info()->GetOutputStackFrameType());
      if (call_descriptor->IsWasmFunctionCall()) {
        __ push(kWasmInstanceRegister);
      }
    }
  }

  int shrink_slots = frame()->GetTotalFrameSlotCount() -
                     call_descriptor->CalculateFixedFrameSize();

  if (info()->is_osr()) {
    // TurboFan OSR-compiled functions cannot be entered directly.
    __ Abort(AbortReason::kShouldNotDirectlyEnterOsrFunction);

    // Unoptimized code jumps directly to this entrypoint while the unoptimized
    // frame is still on the stack. Optimized code uses OSR values directly from
    // the unoptimized frame. Thus, all that needs to be done is to allocate the
    // remaining stack slots.
    if (FLAG_code_comments) __ RecordComment("-- OSR entrypoint --");
    osr_pc_offset_ = __ pc_offset();
    shrink_slots -= osr_helper()->UnoptimizedFrameSlots();
    ResetSpeculationPoison();
  }

  const RegList saves = call_descriptor->CalleeSavedRegisters();
  if (shrink_slots > 0) {
    DCHECK(frame_access_state()->has_frame());
    if (info()->IsWasm() && shrink_slots > 128) {
      // For WebAssembly functions with big frames we have to do the stack
      // overflow check before we construct the frame. Otherwise we may not
      // have enough space on the stack to call the runtime for the stack
      // overflow.
      Label done;

      // If the frame is bigger than the stack, we throw the stack overflow
      // exception unconditionally. Thereby we can avoid the integer overflow
      // check in the condition code.
      if (shrink_slots * kPointerSize < FLAG_stack_size * 1024) {
        Register scratch = esi;
        __ push(scratch);
        __ mov(scratch,
               FieldOperand(kWasmInstanceRegister,
                            WasmInstanceObject::kRealStackLimitAddressOffset));
        __ mov(scratch, Operand(scratch, 0));
        __ add(scratch, Immediate(shrink_slots * kPointerSize));
        __ cmp(esp, scratch);
        __ pop(scratch);
        __ j(above_equal, &done);
      }
      __ mov(ecx, FieldOperand(kWasmInstanceRegister,
                               WasmInstanceObject::kCEntryStubOffset));
      __ Move(esi, Smi::kZero);
      __ CallRuntimeWithCEntry(Runtime::kThrowWasmStackOverflow, ecx);
      ReferenceMap* reference_map = new (zone()) ReferenceMap(zone());
      RecordSafepoint(reference_map, Safepoint::kSimple, 0,
                      Safepoint::kNoLazyDeopt);
      __ AssertUnreachable(AbortReason::kUnexpectedReturnFromWasmTrap);
      __ bind(&done);
    }

    // Skip callee-saved and return slots, which are created below.
    shrink_slots -= base::bits::CountPopulation(saves);
    shrink_slots -= frame()->GetReturnSlotCount();
    if (shrink_slots > 0) {
      __ sub(esp, Immediate(shrink_slots * kPointerSize));
    }
  }

  if (saves != 0) {  // Save callee-saved registers.
    DCHECK(!info()->is_osr());
    for (int i = Register::kNumRegisters - 1; i >= 0; i--) {
      if (((1 << i) & saves)) __ push(Register::from_code(i));
    }
  }

  // Allocate return slots (located after callee-saved).
  if (frame()->GetReturnSlotCount() > 0) {
    __ sub(esp, Immediate(frame()->GetReturnSlotCount() * kPointerSize));
  }
}

void CodeGenerator::AssembleReturn(InstructionOperand* pop) {
  auto call_descriptor = linkage()->GetIncomingDescriptor();

  const RegList saves = call_descriptor->CalleeSavedRegisters();
  // Restore registers.
  if (saves != 0) {
    const int returns = frame()->GetReturnSlotCount();
    if (returns != 0) {
      __ add(esp, Immediate(returns * kPointerSize));
    }
    for (int i = 0; i < Register::kNumRegisters; i++) {
      if (!((1 << i) & saves)) continue;
      __ pop(Register::from_code(i));
    }
  }

  // Might need ecx for scratch if pop_size is too big or if there is a variable
  // pop count.
  DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & ecx.bit());
  size_t pop_size = call_descriptor->StackParameterCount() * kPointerSize;
  IA32OperandConverter g(this, nullptr);
  if (call_descriptor->IsCFunctionCall()) {
    AssembleDeconstructFrame();
  } else if (frame_access_state()->has_frame()) {
    // Canonicalize JSFunction return sites for now if they always have the same
    // number of return args.
    if (pop->IsImmediate() && g.ToConstant(pop).ToInt32() == 0) {
      if (return_label_.is_bound()) {
        __ jmp(&return_label_);
        return;
      } else {
        __ bind(&return_label_);
        AssembleDeconstructFrame();
      }
    } else {
      AssembleDeconstructFrame();
    }
  }
  DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & edx.bit());
  DCHECK_EQ(0u, call_descriptor->CalleeSavedRegisters() & ecx.bit());
  if (pop->IsImmediate()) {
    DCHECK_EQ(Constant::kInt32, g.ToConstant(pop).type());
    pop_size += g.ToConstant(pop).ToInt32() * kPointerSize;
    __ Ret(static_cast<int>(pop_size), ecx);
  } else {
    Register pop_reg = g.ToRegister(pop);
    Register scratch_reg = pop_reg == ecx ? edx : ecx;
    __ pop(scratch_reg);
    __ lea(esp, Operand(esp, pop_reg, times_4, static_cast<int>(pop_size)));
    __ jmp(scratch_reg);
  }
}

void CodeGenerator::FinishCode() {}

void CodeGenerator::AssembleMove(InstructionOperand* source,
                                 InstructionOperand* destination) {
  IA32OperandConverter g(this, nullptr);
  // Dispatch on the source and destination operand kinds.
  switch (MoveType::InferMove(source, destination)) {
    case MoveType::kRegisterToRegister:
      if (source->IsRegister()) {
        __ mov(g.ToRegister(destination), g.ToRegister(source));
      } else {
        DCHECK(source->IsFPRegister());
        __ movaps(g.ToDoubleRegister(destination), g.ToDoubleRegister(source));
      }
      return;
    case MoveType::kRegisterToStack: {
      Operand dst = g.ToOperand(destination);
      if (source->IsRegister()) {
        __ mov(dst, g.ToRegister(source));
      } else {
        DCHECK(source->IsFPRegister());
        XMMRegister src = g.ToDoubleRegister(source);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(dst, src);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(dst, src);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(dst, src);
        }
      }
      return;
    }
    case MoveType::kStackToRegister: {
      Operand src = g.ToOperand(source);
      if (source->IsStackSlot()) {
        __ mov(g.ToRegister(destination), src);
      } else {
        DCHECK(source->IsFPStackSlot());
        XMMRegister dst = g.ToDoubleRegister(destination);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(dst, src);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(dst, src);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(dst, src);
        }
      }
      return;
    }
    case MoveType::kStackToStack: {
      Operand src = g.ToOperand(source);
      Operand dst = g.ToOperand(destination);
      if (source->IsStackSlot()) {
        __ push(src);
        __ pop(dst);
      } else {
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(kScratchDoubleReg, src);
          __ movss(dst, kScratchDoubleReg);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(kScratchDoubleReg, src);
          __ movsd(dst, kScratchDoubleReg);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(kScratchDoubleReg, src);
          __ movups(dst, kScratchDoubleReg);
        }
      }
      return;
    }
    case MoveType::kConstantToRegister: {
      Constant src = g.ToConstant(source);
      if (destination->IsRegister()) {
        Register dst = g.ToRegister(destination);
        if (src.type() == Constant::kHeapObject) {
          __ Move(dst, src.ToHeapObject());
        } else {
          __ Move(dst, g.ToImmediate(source));
        }
      } else {
        DCHECK(destination->IsFPRegister());
        XMMRegister dst = g.ToDoubleRegister(destination);
        if (src.type() == Constant::kFloat32) {
          // TODO(turbofan): Can we do better here?
          __ Move(dst, src.ToFloat32AsInt());
        } else {
          DCHECK_EQ(src.type(), Constant::kFloat64);
          __ Move(dst, src.ToFloat64().AsUint64());
        }
      }
      return;
    }
    case MoveType::kConstantToStack: {
      Constant src = g.ToConstant(source);
      Operand dst = g.ToOperand(destination);
      if (destination->IsStackSlot()) {
        if (src.type() == Constant::kHeapObject) {
          __ mov(dst, src.ToHeapObject());
        } else {
          __ Move(dst, g.ToImmediate(source));
        }
      } else {
        DCHECK(destination->IsFPStackSlot());
        if (src.type() == Constant::kFloat32) {
          __ Move(dst, Immediate(src.ToFloat32AsInt()));
        } else {
          DCHECK_EQ(src.type(), Constant::kFloat64);
          uint64_t constant_value = src.ToFloat64().AsUint64();
          uint32_t lower = static_cast<uint32_t>(constant_value);
          uint32_t upper = static_cast<uint32_t>(constant_value >> 32);
          Operand dst0 = dst;
          Operand dst1 = g.ToOperand(destination, kPointerSize);
          __ Move(dst0, Immediate(lower));
          __ Move(dst1, Immediate(upper));
        }
      }
      return;
    }
  }
  UNREACHABLE();
}


void CodeGenerator::AssembleSwap(InstructionOperand* source,
                                 InstructionOperand* destination) {
  IA32OperandConverter g(this, nullptr);
  // Dispatch on the source and destination operand kinds.  Not all
  // combinations are possible.
  switch (MoveType::InferSwap(source, destination)) {
    case MoveType::kRegisterToRegister: {
      if (source->IsRegister()) {
        Register src = g.ToRegister(source);
        Register dst = g.ToRegister(destination);
        __ push(src);
        __ mov(src, dst);
        __ pop(dst);
      } else {
        DCHECK(source->IsFPRegister());
        XMMRegister src = g.ToDoubleRegister(source);
        XMMRegister dst = g.ToDoubleRegister(destination);
        __ movaps(kScratchDoubleReg, src);
        __ movaps(src, dst);
        __ movaps(dst, kScratchDoubleReg);
      }
      return;
    }
    case MoveType::kRegisterToStack: {
      if (source->IsRegister()) {
        Register src = g.ToRegister(source);
        __ push(src);
        frame_access_state()->IncreaseSPDelta(1);
        Operand dst = g.ToOperand(destination);
        __ mov(src, dst);
        frame_access_state()->IncreaseSPDelta(-1);
        dst = g.ToOperand(destination);
        __ pop(dst);
      } else {
        DCHECK(source->IsFPRegister());
        XMMRegister src = g.ToDoubleRegister(source);
        Operand dst = g.ToOperand(destination);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(kScratchDoubleReg, dst);
          __ movss(dst, src);
          __ movaps(src, kScratchDoubleReg);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(kScratchDoubleReg, dst);
          __ movsd(dst, src);
          __ movaps(src, kScratchDoubleReg);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(kScratchDoubleReg, dst);
          __ movups(dst, src);
          __ movups(src, kScratchDoubleReg);
        }
      }
      return;
    }
    case MoveType::kStackToStack: {
      if (source->IsStackSlot()) {
        Operand dst1 = g.ToOperand(destination);
        __ push(dst1);
        frame_access_state()->IncreaseSPDelta(1);
        Operand src1 = g.ToOperand(source);
        __ push(src1);
        Operand dst2 = g.ToOperand(destination);
        __ pop(dst2);
        frame_access_state()->IncreaseSPDelta(-1);
        Operand src2 = g.ToOperand(source);
        __ pop(src2);
      } else {
        DCHECK(source->IsFPStackSlot());
        Operand src0 = g.ToOperand(source);
        Operand dst0 = g.ToOperand(destination);
        MachineRepresentation rep =
            LocationOperand::cast(source)->representation();
        if (rep == MachineRepresentation::kFloat32) {
          __ movss(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ movss(src0, kScratchDoubleReg);
        } else if (rep == MachineRepresentation::kFloat64) {
          __ movsd(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ push(g.ToOperand(source, kPointerSize));
          __ pop(g.ToOperand(destination, kPointerSize));
          __ movsd(src0, kScratchDoubleReg);
        } else {
          DCHECK_EQ(MachineRepresentation::kSimd128, rep);
          __ movups(kScratchDoubleReg, dst0);  // Save dst in scratch register.
          __ push(src0);  // Then use stack to copy src to destination.
          __ pop(dst0);
          __ push(g.ToOperand(source, kPointerSize));
          __ pop(g.ToOperand(destination, kPointerSize));
          __ push(g.ToOperand(source, 2 * kPointerSize));
          __ pop(g.ToOperand(destination, 2 * kPointerSize));
          __ push(g.ToOperand(source, 3 * kPointerSize));
          __ pop(g.ToOperand(destination, 3 * kPointerSize));
          __ movups(src0, kScratchDoubleReg);
        }
      }
      return;
    }
    default:
      UNREACHABLE();
      break;
  }
}


void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
  for (size_t index = 0; index < target_count; ++index) {
    __ dd(targets[index]);
  }
}

#undef __
#undef kScratchDoubleReg
#undef ASSEMBLE_COMPARE
#undef ASSEMBLE_IEEE754_BINOP
#undef ASSEMBLE_IEEE754_UNOP
#undef ASSEMBLE_BINOP
#undef ASSEMBLE_ATOMIC_BINOP
#undef ASSEMBLE_I64ATOMIC_BINOP
#undef ASSEMBLE_MOVX
#undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
#undef ASSEMBLE_SIMD_IMM_SHUFFLE

}  // namespace compiler
}  // namespace internal
}  // namespace v8