winch_codegen/isa/x64/masm.rs

1use super::{
2    abi::X64ABI,
3    address::Address,
4    asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
5    regs::{self, rbp, rsp},
6};
7use anyhow::{anyhow, bail, Result};
8
9use crate::masm::{
10    DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, Imm as I, IntCmpKind, LaneSelector,
11    LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind,
12    RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, V128AbsKind,
13    V128AddKind, V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind,
14    V128MinKind, V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind,
15    VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS,
16};
17use crate::{
18    abi::{self, align_to, calculate_frame_adjustment, LocalSlot},
19    codegen::{ptr_type_from_ptr_size, CodeGenContext, CodeGenError, Emission, FuncEnv},
20    stack::{TypedReg, Val},
21};
22use crate::{
23    abi::{vmctx, ABI},
24    masm::{SPOffset, StackSlot},
25};
26use crate::{
27    isa::{
28        reg::{writable, Reg, RegClass, WritableReg},
29        CallingConvention,
30    },
31    masm::CalleeKind,
32};
33use cranelift_codegen::{
34    binemit::CodeOffset,
35    ir::{MemFlags, RelSourceLoc, SourceLoc},
36    isa::{
37        unwind::UnwindInst,
38        x64::{
39            args::{Avx512Opcode, AvxOpcode, FenceKind, CC},
40            settings as x64_settings, AtomicRmwSeqOp,
41        },
42    },
43    settings, Final, MachBufferFinalized, MachLabel,
44};
45use wasmtime_cranelift::TRAP_UNREACHABLE;
46use wasmtime_environ::{PtrSize, WasmValType};
47
48// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`
49// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
50// need to fix up the bits that migrate from one half of the lane to the
51// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
52// right by 0 (no movement), we want to retain all the bits so we mask with
53// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
54// we mask with `0x7f`; etc.
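// As a worked example (left shift by 1): a 16-bit lane holding the bytes
// [0xff, 0x80] is 0x80ff; shifting the 16-bit lane left by 1 yields 0x01fe,
// i.e. the bytes [0xfe, 0x01], where the 0x01 is a bit that migrated out of
// the low byte. ANDing every byte with the shift-by-1 mask 0xfe clears it,
// giving [0xfe, 0x00], which matches shifting each byte independently.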
55
56#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
57const I8X16_ISHL_MASKS: [u8; 128] = [
58    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
59    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
60    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
61    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
62    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
63    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
64    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
65    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
66];
67
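// The unsigned right-shift masks below follow the same pattern: after a 16x8
// logical right shift by `amt`, the top `amt` bits of each byte may contain
// bits that migrated in from the neighboring byte, so each byte is masked
// with `0xff >> amt` (row `amt` of the table).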
68#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
69const I8X16_USHR_MASKS: [u8; 128] = [
70    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
72    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
73    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
74    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
75    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
76    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
77    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
78];
79
80/// x64 MacroAssembler.
81pub(crate) struct MacroAssembler {
82    /// Stack pointer offset.
83    sp_offset: u32,
84    /// This value represents the maximum stack size seen while compiling the function. While the
85    /// function is still being compiled, its value will not be valid (the stack will grow and
86    /// shrink as space is reserved and freed during compilation), but once all instructions have
87    /// been seen this value will be the maximum stack usage seen.
88    sp_max: u32,
89    /// Patchable `add` instruction used to add the constant stack max to a register.
90    stack_max_use_add: Option<PatchableAddToReg>,
91    /// Low level assembler.
92    asm: Assembler,
93    /// ISA flags.
94    flags: x64_settings::Flags,
95    /// Shared flags.
96    shared_flags: settings::Flags,
97    /// The target pointer size.
98    ptr_size: OperandSize,
99}
100
101impl Masm for MacroAssembler {
102    type Address = Address;
103    type Ptr = u8;
104    type ABI = X64ABI;
105
106    fn frame_setup(&mut self) -> Result<()> {
107        let frame_pointer = rbp();
108        let stack_pointer = rsp();
109
110        self.asm.push_r(frame_pointer);
111
112        if self.shared_flags.unwind_info() {
113            self.asm.unwind_inst(UnwindInst::PushFrameRegs {
114                offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
115            })
116        }
117
118        self.asm
119            .mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);
120
121        Ok(())
122    }
123
124    fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
125        let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();
126        let scratch = regs::scratch();
127
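        // Rough shape of the check emitted below: load the store context
        // pointer out of the vmctx, load its stack limit, add this function's
        // (patched-in) maximum stack usage, and trap with `STACK_OVERFLOW` if
        // `rsp` is below that bound.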
128        self.load_ptr(
129            self.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
130            writable!(scratch),
131        )?;
132
133        self.load_ptr(
134            Address::offset(scratch, ptr_size.vmstore_context_stack_limit().into()),
135            writable!(scratch),
136        )?;
137
138        self.add_stack_max(scratch);
139
140        self.asm.cmp_rr(scratch, regs::rsp(), self.ptr_size);
141        self.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
142
143        // Emit unwind info.
144        if self.shared_flags.unwind_info() {
145            self.asm.unwind_inst(UnwindInst::DefineNewFrame {
146                offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
147
148                // The Winch calling convention has no callee-save registers, so nothing will be
149                // clobbered.
150                offset_downward_to_clobbers: 0,
151            })
152        }
153        Ok(())
154    }
155
156    fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
157        let bytes = match (reg.class(), size) {
158            (RegClass::Int, OperandSize::S64) => {
159                let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
160                self.asm.push_r(reg);
161                self.increment_sp(word_bytes);
162                word_bytes
163            }
164            (RegClass::Int, OperandSize::S32) => {
165                let bytes = size.bytes();
166                self.reserve_stack(bytes)?;
167                let sp_offset = SPOffset::from_u32(self.sp_offset);
168                self.asm
169                    .mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
170                bytes
171            }
172            (RegClass::Float, _) => {
173                let bytes = size.bytes();
174                self.reserve_stack(bytes)?;
175                let sp_offset = SPOffset::from_u32(self.sp_offset);
176                self.asm
177                    .xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
178                bytes
179            }
180            _ => unreachable!(),
181        };
182
183        Ok(StackSlot {
184            offset: SPOffset::from_u32(self.sp_offset),
185            size: bytes,
186        })
187    }
188
189    fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
190        if bytes == 0 {
191            return Ok(());
192        }
193
194        self.asm
195            .sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
196        self.increment_sp(bytes);
197
198        Ok(())
199    }
200
201    fn free_stack(&mut self, bytes: u32) -> Result<()> {
202        if bytes == 0 {
203            return Ok(());
204        }
205        self.asm
206            .add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
207        self.decrement_sp(bytes);
208
209        Ok(())
210    }
211
212    fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
213        self.sp_offset = offset.as_u32();
214
215        Ok(())
216    }
217
218    fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
219        let (reg, offset) = if local.addressed_from_sp() {
220            let offset = self
221                .sp_offset
222                .checked_sub(local.offset)
223                .ok_or_else(|| CodeGenError::invalid_local_offset())?;
224            (rsp(), offset)
225        } else {
226            (rbp(), local.offset)
227        };
228
229        Ok(Address::offset(reg, offset))
230    }
231
232    fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
233        Ok(Address::offset(
234            regs::rsp(),
235            self.sp_offset - offset.as_u32(),
236        ))
237    }
238
239    fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
240        Ok(Address::offset(regs::rsp(), offset.as_u32()))
241    }
242
243    fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
244        Ok(Address::offset(vmctx!(Self), offset))
245    }
246
247    fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
248        self.store(src.into(), dst, self.ptr_size)
249    }
250
251    fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
252        self.store_impl(src, dst, size, TRUSTED_FLAGS)
253    }
254
255    fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
256        match kind {
257            StoreKind::Operand(size) => {
258                self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
259            }
260            StoreKind::Atomic(size) => {
261                if size == OperandSize::S128 {
262                    // TODO: we don't support 128-bit atomic store yet.
263                    bail!(CodeGenError::unexpected_operand_size());
264                }
265                // To stay consistent with Cranelift, we emit a normal store followed by an mfence,
266                // although we could probably just emit an xchg.
267                self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
268                self.asm.fence(FenceKind::MFence);
269            }
270            StoreKind::VectorLane(LaneSelector { lane, size }) => {
271                self.ensure_has_avx()?;
272                self.asm
273                    .xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS)?;
274            }
275        }
276
277        Ok(())
278    }
279
280    fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
281        let current_sp = SPOffset::from_u32(self.sp_offset);
282        let _ = match (dst.to_reg().class(), size) {
283            (RegClass::Int, OperandSize::S32) => {
284                let addr = self.address_from_sp(current_sp)?;
285                self.asm.movzx_mr(
286                    &addr,
287                    dst,
288                    size.extend_to::<Zero>(OperandSize::S64),
289                    TRUSTED_FLAGS,
290                );
291                self.free_stack(size.bytes())?;
292            }
293            (RegClass::Int, OperandSize::S64) => {
294                self.asm.pop_r(dst);
295                self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
296            }
297            (RegClass::Float, _) | (RegClass::Vector, _) => {
298                let addr = self.address_from_sp(current_sp)?;
299                self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
300                self.free_stack(size.bytes())?;
301            }
302            _ => bail!(CodeGenError::invalid_operand_combination()),
303        };
304        Ok(())
305    }
306
307    fn call(
308        &mut self,
309        stack_args_size: u32,
310        mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
311    ) -> Result<u32> {
312        let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
313        let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
314        let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
315        let aligned_args_size = align_to(stack_args_size, alignment);
316        let total_stack = delta + aligned_args_size;
317        self.reserve_stack(total_stack)?;
318        let (callee, cc) = load_callee(self)?;
319        match callee {
320            CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
321            CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
322            CalleeKind::LibCall(lib) => self.asm.call_with_lib(cc, lib, regs::scratch()),
323        };
324        Ok(total_stack)
325    }
326
327    fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
328        self.load(src, dst, self.ptr_size)
329    }
330
331    fn compute_addr(
332        &mut self,
333        src: Self::Address,
334        dst: WritableReg,
335        size: OperandSize,
336    ) -> Result<()> {
337        self.asm.lea(&src, dst, size);
338        Ok(())
339    }
340
341    fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
342        self.load_impl(src, dst, size, TRUSTED_FLAGS)
343    }
344
345    fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
346        let size = kind.derive_operand_size();
347
348        match kind {
349            LoadKind::ScalarExtend(ext) => match ext {
350                ExtendKind::Signed(ext) => {
351                    self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
352                }
353                ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
354            },
355            LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
356                // The guarantees of the x86-64 memory model ensure that `SeqCst`
357                // loads are equivalent to normal loads.
358                if kind.is_atomic() && size == OperandSize::S128 {
359                    bail!(CodeGenError::unexpected_operand_size());
360                }
361
362                self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
363            }
364            LoadKind::VectorExtend(ext) => {
365                self.ensure_has_avx()?;
366                self.asm
367                    .xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
368            }
369            LoadKind::Splat(_) => {
370                self.ensure_has_avx()?;
371
372                if size == OperandSize::S64 {
373                    self.asm
374                        .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
375                    self.asm.xmm_vpshuf_rr(
376                        dst.to_reg(),
377                        dst,
378                        Self::vpshuf_mask_for_64_bit_splats(),
379                        OperandSize::S32,
380                    );
381                } else {
382                    self.asm
383                        .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
384                }
385            }
386            LoadKind::VectorLane(LaneSelector { lane, size }) => {
387                self.ensure_has_avx()?;
388                let byte_tmp = regs::scratch();
389                self.load_impl(src, writable!(byte_tmp), size, UNTRUSTED_FLAGS)?;
390                self.asm
391                    .xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp, lane, size);
392            }
393            LoadKind::VectorZero(size) => {
394                self.ensure_has_avx()?;
395                let scratch = regs::scratch();
396                self.load_impl(src, writable!(scratch), size, UNTRUSTED_FLAGS)?;
397                self.asm.avx_gpr_to_xmm(scratch, dst, size);
398            }
399        }
400
401        Ok(())
402    }
403
404    fn sp_offset(&self) -> Result<SPOffset> {
405        Ok(SPOffset::from_u32(self.sp_offset))
406    }
407
408    fn zero(&mut self, reg: WritableReg) -> Result<()> {
409        self.asm.xor_rr(
410            reg.to_reg(),
411            reg,
412            OperandSize::from_bytes(<Self::ABI>::word_bytes()),
413        );
414        Ok(())
415    }
416
417    fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
418        match (src, dst.to_reg()) {
419            (RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
420                (RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
421                (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
422                _ => bail!(CodeGenError::invalid_operand_combination()),
423            },
424            (RegImm::Imm(imm), _) => match imm {
425                I::I32(v) => Ok(self.asm.mov_ir(v as u64, dst, size)),
426                I::I64(v) => Ok(self.asm.mov_ir(v, dst, size)),
427                I::F32(v) => {
428                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
429                    self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
430                    Ok(())
431                }
432                I::F64(v) => {
433                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
434                    self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
435                    Ok(())
436                }
437                I::V128(v) => {
438                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
439                    self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
440                    Ok(())
441                }
442            },
443        }
444    }
445
446    fn cmov(
447        &mut self,
448        dst: WritableReg,
449        src: Reg,
450        cc: IntCmpKind,
451        size: OperandSize,
452    ) -> Result<()> {
453        match (src.class(), dst.to_reg().class()) {
454            (RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
455            (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
456            _ => Err(anyhow!(CodeGenError::invalid_operand_combination())),
457        }
458    }
459
460    fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
461        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
462        match (rhs, dst) {
463            (RegImm::Imm(imm), _) => {
464                if let Some(v) = imm.to_i32() {
465                    self.asm.add_ir(v, dst, size);
466                } else {
467                    let scratch = regs::scratch();
468                    self.load_constant(&imm, writable!(scratch), size)?;
469                    self.asm.add_rr(scratch, dst, size);
470                }
471            }
472
473            (RegImm::Reg(src), dst) => {
474                self.asm.add_rr(src, dst, size);
475            }
476        }
477
478        Ok(())
479    }
480
481    fn checked_uadd(
482        &mut self,
483        dst: WritableReg,
484        lhs: Reg,
485        rhs: RegImm,
486        size: OperandSize,
487        trap: TrapCode,
488    ) -> Result<()> {
489        self.add(dst, lhs, rhs, size)?;
490        self.asm.trapif(CC::B, trap);
491        Ok(())
492    }
493
494    fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
495        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
496        match (rhs, dst) {
497            (RegImm::Imm(imm), reg) => {
498                if let Some(v) = imm.to_i32() {
499                    self.asm.sub_ir(v, reg, size);
500                } else {
501                    let scratch = regs::scratch();
502                    self.load_constant(&imm, writable!(scratch), size)?;
503                    self.asm.sub_rr(scratch, reg, size);
504                }
505            }
506
507            (RegImm::Reg(src), dst) => {
508                self.asm.sub_rr(src, dst, size);
509            }
510        }
511
512        Ok(())
513    }
514
515    fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
516        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
517        match (rhs, dst) {
518            (RegImm::Imm(imm), _) => {
519                if let Some(v) = imm.to_i32() {
520                    self.asm.mul_ir(v, dst, size);
521                } else {
522                    let scratch = regs::scratch();
523                    self.load_constant(&imm, writable!(scratch), size)?;
524                    self.asm.mul_rr(scratch, dst, size);
525                }
526            }
527
528            (RegImm::Reg(src), dst) => {
529                self.asm.mul_rr(src, dst, size);
530            }
531        }
532
533        Ok(())
534    }
535
536    fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
537        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
538        self.asm.xmm_add_rr(rhs, dst, size);
539        Ok(())
540    }
541
542    fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
543        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
544        self.asm.xmm_sub_rr(rhs, dst, size);
545        Ok(())
546    }
547
548    fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
549        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
550        self.asm.xmm_mul_rr(rhs, dst, size);
551        Ok(())
552    }
553
554    fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
555        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
556        self.asm.xmm_div_rr(rhs, dst, size);
557        Ok(())
558    }
559
560    fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
561        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
562        self.asm.xmm_min_seq(rhs, dst, size);
563        Ok(())
564    }
565
566    fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
567        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
568        self.asm.xmm_max_seq(rhs, dst, size);
569        Ok(())
570    }
571
572    fn float_copysign(
573        &mut self,
574        dst: WritableReg,
575        lhs: Reg,
576        rhs: Reg,
577        size: OperandSize,
578    ) -> Result<()> {
579        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
580        let scratch_gpr = regs::scratch();
581        let scratch_xmm = regs::scratch_xmm();
582        let sign_mask = match size {
583            OperandSize::S32 => I::I32(0x80000000),
584            OperandSize::S64 => I::I64(0x8000000000000000),
585            OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
586                bail!(CodeGenError::unexpected_operand_size())
587            }
588        };
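        // Net effect of the sequence below:
        // dst = (lhs & !sign_mask) | (rhs & sign_mask),
        // i.e. the magnitude of `lhs` (already in `dst`) with the sign of `rhs`.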
589        self.load_constant(&sign_mask, writable!(scratch_gpr), size)?;
590        self.asm
591            .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
592
593        // Clear everything except the sign bit in `rhs`.
594        self.asm.xmm_and_rr(scratch_xmm, writable!(rhs), size);
595
596        // Clear sign bit in dst using scratch to store result. Then copy the
597        // result back to dst.
598        self.asm
599            .xmm_andn_rr(dst.to_reg(), writable!(scratch_xmm), size);
600        self.asm.xmm_mov_rr(scratch_xmm, dst, size);
601
602        // Copy the sign bit from `rhs` to `dst`.
603        self.asm.xmm_or_rr(rhs, dst, size);
604        Ok(())
605    }
606
607    fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
608        debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
609        let mask = match size {
610            OperandSize::S32 => I::I32(0x80000000),
611            OperandSize::S64 => I::I64(0x8000000000000000),
612            OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
613                bail!(CodeGenError::unexpected_operand_size())
614            }
615        };
616        let scratch_gpr = regs::scratch();
617        self.load_constant(&mask, writable!(scratch_gpr), size)?;
618        let scratch_xmm = regs::scratch_xmm();
619        self.asm
620            .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
621        self.asm.xmm_xor_rr(scratch_xmm, dst, size);
622        Ok(())
623    }
624
625    fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
626        debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
627        let mask = match size {
628            OperandSize::S32 => I::I32(0x7fffffff),
629            OperandSize::S64 => I::I64(0x7fffffffffffffff),
630            OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
631                bail!(CodeGenError::unexpected_operand_size())
632            }
633        };
634        let scratch_gpr = regs::scratch();
635        self.load_constant(&mask, writable!(scratch_gpr), size)?;
636        let scratch_xmm = regs::scratch_xmm();
637        self.asm
638            .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
639        self.asm.xmm_and_rr(scratch_xmm, dst, size);
640        Ok(())
641    }
642
643    fn float_round<
644        F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
645    >(
646        &mut self,
647        mode: RoundingMode,
648        env: &mut FuncEnv<Self::Ptr>,
649        context: &mut CodeGenContext<Emission>,
650        size: OperandSize,
651        mut fallback: F,
652    ) -> Result<()> {
653        if self.flags.has_sse41() {
654            let src = context.pop_to_reg(self, None)?;
655            self.asm
656                .xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
657            context.stack.push(src.into());
658            Ok(())
659        } else {
660            fallback(env, context, self)
661        }
662    }
663
664    fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
665        self.asm.sqrt(src, dst, size);
666        Ok(())
667    }
668
669    fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
670        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
671        match (rhs, dst) {
672            (RegImm::Imm(imm), _) => {
673                if let Some(v) = imm.to_i32() {
674                    self.asm.and_ir(v, dst, size);
675                } else {
676                    let scratch = regs::scratch();
677                    self.load_constant(&imm, writable!(scratch), size)?;
678                    self.asm.and_rr(scratch, dst, size);
679                }
680            }
681
682            (RegImm::Reg(src), dst) => {
683                self.asm.and_rr(src, dst, size);
684            }
685        }
686
687        Ok(())
688    }
689
690    fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
691        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
692        match (rhs, dst) {
693            (RegImm::Imm(imm), _) => {
694                if let Some(v) = imm.to_i32() {
695                    self.asm.or_ir(v, dst, size);
696                } else {
697                    let scratch = regs::scratch();
698                    self.load_constant(&imm, writable!(scratch), size)?;
699                    self.asm.or_rr(scratch, dst, size);
700                }
701            }
702
703            (RegImm::Reg(src), dst) => {
704                self.asm.or_rr(src, dst, size);
705            }
706        }
707
708        Ok(())
709    }
710
711    fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
712        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
713        match (rhs, dst) {
714            (RegImm::Imm(imm), _) => {
715                if let Some(v) = imm.to_i32() {
716                    self.asm.xor_ir(v, dst, size);
717                } else {
718                    let scratch = regs::scratch();
719                    self.load_constant(&imm, writable!(scratch), size)?;
720                    self.asm.xor_rr(scratch, dst, size);
721                }
722            }
723
724            (RegImm::Reg(src), _) => {
725                self.asm.xor_rr(src, dst, size);
726            }
727        }
728
729        Ok(())
730    }
731
732    fn shift_ir(
733        &mut self,
734        dst: WritableReg,
735        imm: u64,
736        lhs: Reg,
737        kind: ShiftKind,
738        size: OperandSize,
739    ) -> Result<()> {
740        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
741        self.asm.shift_ir(imm as u8, dst, kind, size);
742        Ok(())
743    }
744
745    fn shift(
746        &mut self,
747        context: &mut CodeGenContext<Emission>,
748        kind: ShiftKind,
749        size: OperandSize,
750    ) -> Result<()> {
751        // Number of bits to shift must be in the CL register.
752        let src = context.pop_to_reg(self, Some(regs::rcx()))?;
753        let dst = context.pop_to_reg(self, None)?;
754
755        self.asm
756            .shift_rr(src.into(), writable!(dst.into()), kind, size);
757
758        context.free_reg(src);
759        context.stack.push(dst.into());
760
761        Ok(())
762    }
763
764    fn div(
765        &mut self,
766        context: &mut CodeGenContext<Emission>,
767        kind: DivKind,
768        size: OperandSize,
769    ) -> Result<()> {
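        // x86's `div`/`idiv` take the dividend in rdx:rax and leave the
        // quotient in rax and the remainder in rdx, hence the fixed register
        // choices below.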
770        // Allocate rdx:rax.
771        let rdx = context.reg(regs::rdx(), self)?;
772        let rax = context.reg(regs::rax(), self)?;
773
774        // Allocate the divisor, which can be any gpr.
775        let divisor = context.pop_to_reg(self, None)?;
776
777        // Mark rax as allocatable.
778        context.free_reg(rax);
779        // Move the top value to rax.
780        let rax = context.pop_to_reg(self, Some(rax))?;
781        self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);
782
783        // Free the divisor and rdx.
784        context.free_reg(divisor);
785        context.free_reg(rdx);
786
787        // Push the quotient.
788        context.stack.push(rax.into());
789        Ok(())
790    }
791
792    fn rem(
793        &mut self,
794        context: &mut CodeGenContext<Emission>,
795        kind: RemKind,
796        size: OperandSize,
797    ) -> Result<()> {
798        // Allocate rdx:rax.
799        let rdx = context.reg(regs::rdx(), self)?;
800        let rax = context.reg(regs::rax(), self)?;
801
802        // Allocate the divisor, which can be any gpr.
803        let divisor = context.pop_to_reg(self, None)?;
804
805        // Mark rax as allocatable.
806        context.free_reg(rax);
807        // Move the top value to rax.
808        let rax = context.pop_to_reg(self, Some(rax))?;
809        self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);
810
811        // Free the divisor and rax.
812        context.free_reg(divisor);
813        context.free_reg(rax);
814
815        // Push the remainder.
816        context.stack.push(Val::reg(rdx, divisor.ty));
817
818        Ok(())
819    }
820
821    fn frame_restore(&mut self) -> Result<()> {
822        debug_assert_eq!(self.sp_offset, 0);
823        self.asm.pop_r(writable!(rbp()));
824        self.asm.ret();
825        Ok(())
826    }
827
828    fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
829        if let Some(patch) = self.stack_max_use_add {
830            patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
831        }
832
833        Ok(self.asm.finalize(base))
834    }
835
836    fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
837        Ok(Address::offset(reg, offset))
838    }
839
840    fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
841        match src2 {
842            RegImm::Imm(imm) => {
843                if let Some(v) = imm.to_i32() {
844                    self.asm.cmp_ir(src1, v, size);
845                } else {
846                    let scratch = regs::scratch();
847                    self.load_constant(&imm, writable!(scratch), size)?;
848                    self.asm.cmp_rr(src1, scratch, size);
849                }
850            }
851            RegImm::Reg(src2) => {
852                self.asm.cmp_rr(src1, src2, size);
853            }
854        }
855
856        Ok(())
857    }
858
859    fn cmp_with_set(
860        &mut self,
861        dst: WritableReg,
862        src: RegImm,
863        kind: IntCmpKind,
864        size: OperandSize,
865    ) -> Result<()> {
866        self.cmp(dst.to_reg(), src, size)?;
867        self.asm.setcc(kind, dst);
868        Ok(())
869    }
870
871    fn float_cmp_with_set(
872        &mut self,
873        dst: WritableReg,
874        src1: Reg,
875        src2: Reg,
876        kind: FloatCmpKind,
877        size: OperandSize,
878    ) -> Result<()> {
879        // Float comparisons need to be ordered (that is, comparing with a NaN
880        // should return 0) except for not equal which needs to be unordered.
881        // We use ucomis{s, d} because comis{s, d} has an undefined result if
882        // either operand is NaN. Since ucomis{s, d} is unordered, we need to
883        // compensate to make the comparison ordered.  Ucomis{s, d} sets the
884        // ZF, PF, and CF flags if there is an unordered result.
885        let (src1, src2, set_kind) = match kind {
886            FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
887            FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
888            FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
889            FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
890            // Reversing the operands and using the complementary comparison
891            // avoids needing to perform an additional SETNP and AND
892            // instruction.
893            // SETNB and SETNBE check if the carry flag is unset (i.e., not
894            // less than and not unordered) so we get the intended result
895            // without having to look at the parity flag.
896            FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
897            FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
898        };
899        self.asm.ucomis(src1, src2, size);
900        self.asm.setcc(set_kind, dst);
901        let _ = match kind {
902            FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
903                // Return false if either operand is NaN by ensuring PF is
904                // unset.
905                let scratch = regs::scratch();
906                self.asm.setnp(writable!(scratch));
907                self.asm.and_rr(scratch, dst, size);
908            }
909            FloatCmpKind::Ne => {
910                // Return true if either operand is NaN by checking if PF is
911                // set.
912                let scratch = regs::scratch();
913                self.asm.setp(writable!(scratch));
914                self.asm.or_rr(scratch, dst, size);
915            }
916            FloatCmpKind::Lt | FloatCmpKind::Le => (),
917        };
918        Ok(())
919    }
920
921    fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
922        if self.flags.has_lzcnt() {
923            self.asm.lzcnt(src, dst, size);
924        } else {
925            let scratch = regs::scratch();
926
927            // Use the following approach:
928            // dst = size.num_bits() - bsr(src) - is_not_zero
929        //     = size.num_bits() + -bsr(src) - is_not_zero.
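            // For example, clz(0x1) with a 32-bit operand: bsr returns 0 and
            // the value is non-zero, so dst = 32 - 0 - 1 = 31.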
930            self.asm.bsr(src.into(), dst, size);
931            self.asm.setcc(IntCmpKind::Ne, writable!(scratch.into()));
932            self.asm.neg(dst.to_reg(), dst, size);
933            self.asm.add_ir(size.num_bits() as i32, dst, size);
934            self.asm.sub_rr(scratch, dst, size);
935        }
936
937        Ok(())
938    }
939
940    fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
941        if self.flags.has_bmi1() {
942            self.asm.tzcnt(src, dst, size);
943        } else {
944            let scratch = regs::scratch();
945
946            // Use the following approach:
947            // dst = bsf(src) + (is_zero * size.num_bits())
948            //     = bsf(src) + (is_zero << size.log2()).
949            // BSF outputs the correct value for every value except 0.
950            // When the value is 0, BSF outputs 0, but the correct output for
951            // ctz is the number of bits.
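            // For example, ctz(0x8) with a 32-bit operand: bsf returns 3 and
            // the zero-correction term is 0, so dst = 3; for an input of 0 the
            // correction adds 1 << 5 = 32.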
952            self.asm.bsf(src.into(), dst.into(), size);
953            self.asm.setcc(IntCmpKind::Eq, writable!(scratch.into()));
954            self.asm
955                .shift_ir(size.log2(), writable!(scratch), ShiftKind::Shl, size);
956            self.asm.add_rr(scratch, dst, size);
957        }
958
959        Ok(())
960    }
961
962    fn get_label(&mut self) -> Result<MachLabel> {
963        let buffer = self.asm.buffer_mut();
964        Ok(buffer.get_label())
965    }
966
967    fn bind(&mut self, label: MachLabel) -> Result<()> {
968        let buffer = self.asm.buffer_mut();
969        buffer.bind_label(label, &mut Default::default());
970        Ok(())
971    }
972
973    fn branch(
974        &mut self,
975        kind: IntCmpKind,
976        lhs: Reg,
977        rhs: RegImm,
978        taken: MachLabel,
979        size: OperandSize,
980    ) -> Result<()> {
981        use IntCmpKind::*;
982
983        match &(lhs, rhs) {
984            (rlhs, RegImm::Reg(rrhs)) => {
985                // If the comparison kind is zero or not zero and both operands
986                // are the same register, emit a test instruction. Else we emit
987                // a normal comparison.
988                if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
989                    self.asm.test_rr(*rlhs, *rrhs, size);
990                } else {
991                    self.cmp(lhs, rhs, size)?;
992                }
993            }
994            _ => self.cmp(lhs, rhs, size)?,
995        }
996        self.asm.jmp_if(kind, taken);
997        Ok(())
998    }
999
1000    fn jmp(&mut self, target: MachLabel) -> Result<()> {
1001        self.asm.jmp(target);
1002        Ok(())
1003    }
1004
1005    fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
1006        let src = context.pop_to_reg(self, None)?;
1007        if self.flags.has_popcnt() && self.flags.has_sse42() {
1008            self.asm.popcnt(src.into(), size);
1009            context.stack.push(src.into());
1010            Ok(())
1011        } else {
1012            // The fallback functionality here is based on `MacroAssembler::popcnt64` in:
1013            // https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495
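            // This is the classic SWAR popcount: fold pairs of bits, then 2-bit
            // fields, then nibbles, and finally multiply by the `h01` mask so the
            // per-byte counts accumulate into the most significant byte, which is
            // then shifted down.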
1014
1015            let tmp = writable!(context.any_gpr(self)?);
1016            let dst = writable!(src.into());
1017            let (masks, shift_amt) = match size {
1018                OperandSize::S64 => (
1019                    [
1020                        0x5555555555555555, // m1
1021                        0x3333333333333333, // m2
1022                        0x0f0f0f0f0f0f0f0f, // m4
1023                        0x0101010101010101, // h01
1024                    ],
1025                    56u8,
1026                ),
1027                // 32-bit popcount is the same, except the masks are half as
1028                // wide and we shift by 24 at the end rather than 56.
1029                OperandSize::S32 => (
1030                    [0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
1031                    24u8,
1032                ),
1033                _ => bail!(CodeGenError::unexpected_operand_size()),
1034            };
1035            self.asm.mov_rr(src.into(), tmp, size);
1036
1037            // x -= (x >> 1) & m1;
1038            self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
1039            let lhs = dst.to_reg();
1040            self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
1041            self.asm.sub_rr(dst.to_reg(), tmp, size);
1042
1043            // x = (x & m2) + ((x >> 2) & m2);
1044            self.asm.mov_rr(tmp.to_reg(), dst, size);
1045            // Load `0x3333...` into the scratch reg once, allowing us to use
1046            // `and_rr` and avoid inadvertently loading it twice as with `and`
1047            let scratch = regs::scratch();
1048            self.load_constant(&I::i64(masks[1]), writable!(scratch), size)?;
1049            self.asm.and_rr(scratch, dst, size);
1050            self.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
1051            self.asm.and_rr(scratch, tmp, size);
1052            self.asm.add_rr(dst.to_reg(), tmp, size);
1053
1054            // x = (x + (x >> 4)) & m4;
1055            self.asm.mov_rr(tmp.to_reg(), dst.into(), size);
1056            self.asm.shift_ir(4u8, dst.into(), ShiftKind::ShrU, size);
1057            self.asm.add_rr(tmp.to_reg(), dst, size);
1058            let lhs = dst.to_reg();
1059            self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;
1060
1061            // (x * h01) >> shift_amt
1062            let lhs = dst.to_reg();
1063            self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
1064            self.asm
1065                .shift_ir(shift_amt, dst.into(), ShiftKind::ShrU, size);
1066
1067            context.stack.push(src.into());
1068            context.free_reg(tmp.to_reg());
1069
1070            Ok(())
1071        }
1072    }
1073
1074    fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1075        self.asm.mov_rr(src.into(), dst, OperandSize::S32);
1076        Ok(())
1077    }
1078
1079    fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
1080        match kind {
1081            ExtendKind::Signed(ext) => {
1082                self.asm.movsx_rr(src, dst, ext);
1083            }
1084            ExtendKind::Unsigned(ext) => {
1085                self.asm.movzx_rr(src, dst, ext);
1086            }
1087        }
1088
1089        Ok(())
1090    }
1091
1092    fn signed_truncate(
1093        &mut self,
1094        dst: WritableReg,
1095        src: Reg,
1096        src_size: OperandSize,
1097        dst_size: OperandSize,
1098        kind: TruncKind,
1099    ) -> Result<()> {
1100        self.asm.cvt_float_to_sint_seq(
1101            src,
1102            dst,
1103            regs::scratch(),
1104            regs::scratch_xmm(),
1105            src_size,
1106            dst_size,
1107            kind.is_checked(),
1108        );
1109        Ok(())
1110    }
1111
1112    fn unsigned_truncate(
1113        &mut self,
1114        ctx: &mut CodeGenContext<Emission>,
1115        src_size: OperandSize,
1116        dst_size: OperandSize,
1117        kind: TruncKind,
1118    ) -> Result<()> {
1119        let dst_ty = match dst_size {
1120            OperandSize::S32 => WasmValType::I32,
1121            OperandSize::S64 => WasmValType::I64,
1122            _ => bail!(CodeGenError::unexpected_operand_size()),
1123        };
1124
1125        ctx.convert_op_with_tmp_reg(
1126            self,
1127            dst_ty,
1128            RegClass::Float,
1129            |masm, dst, src, tmp_fpr, dst_size| {
1130                masm.asm.cvt_float_to_uint_seq(
1131                    src,
1132                    writable!(dst),
1133                    regs::scratch(),
1134                    regs::scratch_xmm(),
1135                    tmp_fpr,
1136                    src_size,
1137                    dst_size,
1138                    kind.is_checked(),
1139                );
1140
1141                Ok(())
1142            },
1143        )
1144    }
1145
1146    fn signed_convert(
1147        &mut self,
1148        dst: WritableReg,
1149        src: Reg,
1150        src_size: OperandSize,
1151        dst_size: OperandSize,
1152    ) -> Result<()> {
1153        self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
1154        Ok(())
1155    }
1156
1157    fn unsigned_convert(
1158        &mut self,
1159        dst: WritableReg,
1160        src: Reg,
1161        tmp_gpr: Reg,
1162        src_size: OperandSize,
1163        dst_size: OperandSize,
1164    ) -> Result<()> {
1165        // Need to zero-extend a u32 to u64 for the conversion instruction sequence.
1166        if let OperandSize::S32 = src_size {
1167            self.extend(
1168                writable!(src),
1169                src,
1170                ExtendKind::Unsigned(Extend::I64Extend32),
1171            )?;
1172        }
1173
1174        self.asm
1175            .cvt_uint64_to_float_seq(src, dst, regs::scratch(), tmp_gpr, dst_size);
1176        Ok(())
1177    }
1178
1179    fn reinterpret_float_as_int(
1180        &mut self,
1181        dst: WritableReg,
1182        src: Reg,
1183        size: OperandSize,
1184    ) -> Result<()> {
1185        self.asm.xmm_to_gpr(src, dst, size);
1186        Ok(())
1187    }
1188
1189    fn reinterpret_int_as_float(
1190        &mut self,
1191        dst: WritableReg,
1192        src: Reg,
1193        size: OperandSize,
1194    ) -> Result<()> {
1195        self.asm.gpr_to_xmm(src.into(), dst, size);
1196        Ok(())
1197    }
1198
1199    fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1200        self.asm
1201            .cvt_float_to_float(src.into(), dst.into(), OperandSize::S64, OperandSize::S32);
1202        Ok(())
1203    }
1204
1205    fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1206        self.asm
1207            .cvt_float_to_float(src.into(), dst, OperandSize::S32, OperandSize::S64);
1208        Ok(())
1209    }
1210
1211    fn unreachable(&mut self) -> Result<()> {
1212        self.asm.trap(TRAP_UNREACHABLE);
1213        Ok(())
1214    }
1215
1216    fn trap(&mut self, code: TrapCode) -> Result<()> {
1217        self.asm.trap(code);
1218        Ok(())
1219    }
1220
1221    fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
1222        self.asm.trapif(cc, code);
1223        Ok(())
1224    }
1225
1226    fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
1227        self.asm.test_rr(src, src, self.ptr_size);
1228        self.asm.trapif(IntCmpKind::Eq, code);
1229        Ok(())
1230    }
1231
1232    fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
1233        // At least one default target.
1234        debug_assert!(targets.len() >= 1);
1235        let default_index = targets.len() - 1;
1236        // Emit a bounds check by conditionally moving the maximum case index
1237        // into the given index reg if the contents of the index reg
1238        // are greater.
1239        let max = default_index;
1240        let size = OperandSize::S32;
1241        self.asm.mov_ir(max as u64, writable!(tmp), size);
1242        self.asm.cmp_rr(tmp, index, size);
1243        self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);
1244
1245        let default = targets[default_index];
1246        let rest = &targets[0..default_index];
1247        let tmp1 = regs::scratch();
1248        self.asm.jmp_table(rest.into(), default, index, tmp1, tmp);
1249        Ok(())
1250    }
1251
1252    fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
1253        Ok(self.asm.buffer_mut().start_srcloc(loc))
1254    }
1255
1256    fn end_source_loc(&mut self) -> Result<()> {
1257        self.asm.buffer_mut().end_srcloc();
1258        Ok(())
1259    }
1260
1261    fn current_code_offset(&self) -> Result<CodeOffset> {
1262        Ok(self.asm.buffer().cur_offset())
1263    }
1264
1265    fn add128(
1266        &mut self,
1267        dst_lo: WritableReg,
1268        dst_hi: WritableReg,
1269        lhs_lo: Reg,
1270        lhs_hi: Reg,
1271        rhs_lo: Reg,
1272        rhs_hi: Reg,
1273    ) -> Result<()> {
1274        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1275        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1276        self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
1277        self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
1278        Ok(())
1279    }
1280
1281    fn sub128(
1282        &mut self,
1283        dst_lo: WritableReg,
1284        dst_hi: WritableReg,
1285        lhs_lo: Reg,
1286        lhs_hi: Reg,
1287        rhs_lo: Reg,
1288        rhs_hi: Reg,
1289    ) -> Result<()> {
1290        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1291        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1292        self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
1293        self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
1294        Ok(())
1295    }
1296
1297    fn mul_wide(
1298        &mut self,
1299        context: &mut CodeGenContext<Emission>,
1300        kind: MulWideKind,
1301    ) -> Result<()> {
1302        // Reserve rax/rdx since they're required by the `mul_wide` instruction
1303        // being used here.
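        // (The single-operand forms of `mul`/`imul` produce a 128-bit result
        // split across rdx:rax.)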
1304        let rax = context.reg(regs::rax(), self)?;
1305        let rdx = context.reg(regs::rdx(), self)?;
1306
1307        // The rhs of this binop can be in any register
1308        let rhs = context.pop_to_reg(self, None)?;
1309        // Mark rax as allocatable, and then force the lhs operand to be placed
1310        // in `rax`.
1311        context.free_reg(rax);
1312        let lhs = context.pop_to_reg(self, Some(rax))?;
1313
1314        self.asm.mul_wide(
1315            writable!(rax),
1316            writable!(rdx),
1317            lhs.reg,
1318            rhs.reg,
1319            kind,
1320            OperandSize::S64,
1321        );
1322
1323        // No longer using the rhs register after the multiplication has been
1324        // executed.
1325        context.free_reg(rhs);
1326
1327        // The low bits of the result are in rax, where `lhs` was allocated.
1328        context.stack.push(lhs.into());
1329        // The high bits of the result are in rdx, which we previously reserved.
1330        context.stack.push(Val::Reg(TypedReg::i64(rdx)));
1331
1332        Ok(())
1333    }
1334
1335    fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
1336        // Get the source and destination operands set up first.
1337        let (src, dst) = match size {
1338            // Floats can use the same register for `src` and `dst`.
1339            SplatKind::F32x4 | SplatKind::F64x2 => {
1340                let reg = context.pop_to_reg(self, None)?.reg;
1341                (RegImm::reg(reg), writable!(reg))
1342            }
1343            // For ints, we need to load the operand into a vector register if
1344            // it's not a constant.
1345            SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
1346                let dst = writable!(context.any_fpr(self)?);
1347                let src = if size == SplatKind::I64x2 {
1348                    context.pop_i64_const().map(RegImm::i64)
1349                } else {
1350                    context.pop_i32_const().map(RegImm::i32)
1351                }
1352                .map_or_else(
1353                    || -> Result<RegImm> {
1354                        let reg = context.pop_to_reg(self, None)?.reg;
1355                        self.reinterpret_int_as_float(
1356                            dst,
1357                            reg,
1358                            match size {
1359                                SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
1360                                    OperandSize::S32
1361                                }
1362                                SplatKind::I64x2 => OperandSize::S64,
1363                                SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
1364                            },
1365                        )?;
1366                        context.free_reg(reg);
1367                        Ok(RegImm::Reg(dst.to_reg()))
1368                    },
1369                    Ok,
1370                )?;
1371                (src, dst)
1372            }
1373        };
1374
1375        // Perform the splat on the operands.
1376        if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
1377            self.ensure_has_avx()?;
1378            let mask = Self::vpshuf_mask_for_64_bit_splats();
1379            match src {
1380                RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
1381                RegImm::Imm(imm) => {
1382                    let src = self.asm.add_constant(&imm.to_bytes());
1383                    self.asm
1384                        .xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
1385                }
1386            }
1387        } else {
1388            self.ensure_has_avx2()?;
1389
1390            match src {
1391                RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
1392                RegImm::Imm(imm) => {
1393                    let src = self.asm.add_constant(&imm.to_bytes());
1394                    self.asm
1395                        .xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
1396                }
1397            }
1398        }
1399
1400        context
1401            .stack
1402            .push(Val::reg(dst.to_reg(), WasmValType::V128));
1403        Ok(())
1404    }
1405
1406    fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1407        self.ensure_has_avx()?;
1408
1409        // Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
1410        // separately to either the selected index or 0.
1411        // Then use `vpor` to combine `lhs` and `rhs` into `dst`.
1412        // Setting the most significant bit in the mask's lane to 1 will
1413        // result in corresponding lane in the destination register being
1414        // set to 0. 0x80 sets the most significant bit to 1.
1415        let mut mask_lhs: [u8; 16] = [0x80; 16];
1416        let mut mask_rhs: [u8; 16] = [0x80; 16];
1417        for i in 0..lanes.len() {
1418            if lanes[i] < 16 {
1419                mask_lhs[i] = lanes[i];
1420            } else {
1421                mask_rhs[i] = lanes[i] - 16;
1422            }
1423        }
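        // For example, if lanes[i] is 20, mask_lhs[i] stays 0x80 (zero that lane
        // when shuffling `lhs`) and mask_rhs[i] becomes 4 (take byte 4 of `rhs`);
        // the final `vpor` merges the two halves.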
1424        let mask_lhs = self.asm.add_constant(&mask_lhs);
1425        let mask_rhs = self.asm.add_constant(&mask_rhs);
1426
1427        self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1428        let scratch = writable!(regs::scratch_xmm());
1429        self.asm.xmm_vpshufb_rrm(scratch, rhs, &mask_rhs);
1430        self.asm.vpor(dst, dst.to_reg(), scratch.to_reg());
1431        Ok(())
1432    }
1433
1434    fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
1435        self.ensure_has_avx()?;
1436
1437        // Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
1438        // outside that range.
1439        // `vpshufb` zeroes a lane when the selector byte's most significant bit is
1440        // set. Adding 0x70 with unsigned saturation leaves 0x0..=0xF with the MSB
1441        // clear, while anything above 0xF ends up with the MSB set (saturating at 0xFF).
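        // For example, index 0x03 becomes 0x73 (low nibble still 3), while index
        // 0x10 becomes 0x80, which `vpshufb` below treats as "write 0".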
1442        let clamp = self.asm.add_constant(&[0x70; 16]);
1443        self.asm.xmm_vpaddusb_rrm(writable!(rhs), rhs, &clamp);
1444
1445        // Don't need to subtract 0x70 since `vpshufb` uses the least
1446        // significant 4 bits which are the same after adding 0x70.
1447        self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
1448        Ok(())
1449    }
1450
1451    fn atomic_rmw(
1452        &mut self,
1453        context: &mut CodeGenContext<Emission>,
1454        addr: Self::Address,
1455        size: OperandSize,
1456        op: RmwOp,
1457        flags: MemFlags,
1458        extend: Option<Extend<Zero>>,
1459    ) -> Result<()> {
1460        let res = match op {
1461            RmwOp::Add => {
1462                let operand = context.pop_to_reg(self, None)?;
1463                self.asm
1464                    .lock_xadd(addr, operand.reg, writable!(operand.reg), size, flags);
1465                operand.reg
1466            }
1467            RmwOp::Sub => {
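                // Negate the operand and reuse the `lock xadd` sequence:
                // `x + (-y) == x - y`, and `xadd` still returns the old value.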
1468                let operand = context.pop_to_reg(self, None)?;
1469                self.asm.neg(operand.reg, writable!(operand.reg), size);
1470                self.asm
1471                    .lock_xadd(addr, operand.reg, writable!(operand.reg), size, flags);
1472                operand.reg
1473            }
1474            RmwOp::Xchg => {
1475                let operand = context.pop_to_reg(self, None)?;
1476                self.asm
1477                    .xchg(addr, operand.reg, writable!(operand.reg), size, flags);
1478                operand.reg
1479            }
1480            RmwOp::And | RmwOp::Or | RmwOp::Xor => {
1481                let op = match op {
1482                    RmwOp::And => AtomicRmwSeqOp::And,
1483                    RmwOp::Or => AtomicRmwSeqOp::Or,
1484                    RmwOp::Xor => AtomicRmwSeqOp::Xor,
1485                    _ => unreachable!(
1486                        "invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
1487                    ),
1488                };
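                // Reserve `rax` as the destination: the emitted
                // read-modify-write sequence is a `cmpxchg` loop, and `cmpxchg`
                // uses `rax` for the expected (old) value.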
1489                let dst = context.reg(regs::rax(), self)?;
1490                let operand = context.pop_to_reg(self, None)?;
1491
1492                self.asm
1493                    .atomic_rmw_seq(addr, operand.reg, writable!(dst), size, flags, op);
1494
1495                context.free_reg(operand.reg);
1496                dst
1497            }
1498        };
1499
1500        let dst_ty = match extend {
1501            Some(ext) => {
1502                // Zero-extending from 32 to 64 bits is implicit on x64, so skip it.
1503                if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
1504                    self.asm.movzx_rr(res, writable!(res), ext.into());
1505                }
1506
1507                WasmValType::int_from_bits(ext.to_bits())
1508            }
1509            None => WasmValType::int_from_bits(size.num_bits()),
1510        };
1511
1512        context.stack.push(TypedReg::new(dst_ty, res).into());
1513
1514        Ok(())
1515    }
1516
1517    fn extract_lane(
1518        &mut self,
1519        src: Reg,
1520        dst: WritableReg,
1521        lane: u8,
1522        kind: ExtractLaneKind,
1523    ) -> Result<()> {
1524        self.ensure_has_avx()?;
1525
1526        match kind {
1527            ExtractLaneKind::I8x16S
1528            | ExtractLaneKind::I8x16U
1529            | ExtractLaneKind::I16x8S
1530            | ExtractLaneKind::I16x8U
1531            | ExtractLaneKind::I32x4
1532            | ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
1533            ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
1534                // For lane 0, the value is already in the low bits of the
1535                // register, so as long as `src` and `dst` are the same
1536                // register there is nothing to do.
1537                assert!(src == dst.to_reg());
1538            }
1539            ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
1540            ExtractLaneKind::F64x2 => {
1541                // `0b11_10` selects the high and low 32 bits of the second
1542                // 64-bit lane, so `0b11_10_11_10` splats that 64-bit value
1543                // across both lanes. Since we put an `f64` on the stack, the
1544                // splatted value is what we use.
1545                // Sanity check that `lane == 0` was handled by the branch above.
1546                assert!(lane == 1);
1547                self.asm
1548                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
1549            }
1550        }
1551
1552        // Sign-extend to 32-bits for sign extended kinds.
1553        match kind {
1554            ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
1555                self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
1556            }
1557            _ => (),
1558        }
1559
1560        Ok(())
1561    }
1562
1563    fn replace_lane(
1564        &mut self,
1565        src: RegImm,
1566        dst: WritableReg,
1567        lane: u8,
1568        kind: ReplaceLaneKind,
1569    ) -> Result<()> {
1570        self.ensure_has_avx()?;
1571
1572        match kind {
1573            ReplaceLaneKind::I8x16
1574            | ReplaceLaneKind::I16x8
1575            | ReplaceLaneKind::I32x4
1576            | ReplaceLaneKind::I64x2 => match src {
1577                RegImm::Reg(reg) => {
1578                    self.asm
1579                        .xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
1580                }
1581                RegImm::Imm(imm) => {
1582                    let address = self.asm.add_constant(&imm.to_bytes());
1583                    self.asm
1584                        .xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
1585                }
1586            },
1587            ReplaceLaneKind::F32x4 => {
1588                // In the `vinsertps` immediate, the low 4 bits are a mask of
1589                // destination elements to set to 0, and the next 2 bits
1590                // specify which element of the destination will be
1591                // overwritten.
1592                let imm = lane << 4;
1593                match src {
1594                    RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
1595                    RegImm::Imm(val) => {
1596                        let address = self.asm.add_constant(&val.to_bytes());
1597                        self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
1598                    }
1599                }
1600            }
1601            ReplaceLaneKind::F64x2 => match src {
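                // Lane 0 is replaced with `vmovsd`, which merges the source's
                // low 64 bits into the low half of `dst`; lane 1 uses
                // `vmovlhps`, which copies the source's low 64 bits into the
                // high half of `dst`.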
1602                RegImm::Reg(reg) => match lane {
1603                    0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
1604                    1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
1605                    _ => unreachable!(),
1606                },
1607                RegImm::Imm(imm) => {
1608                    let address = self.asm.add_constant(&imm.to_bytes());
1609                    match lane {
1610                        0 => {
1611                            // The memory-load variant of `vmovsd` zeroes the
1612                            // upper 64 bits of the register, so we load the
1613                            // immediate into a register first and use the
1614                            // register variant of `vmovsd` to perform the merge.
1615                            let scratch = writable!(regs::scratch_xmm());
1616                            self.asm.xmm_vmovsd_rm(scratch, &address);
1617                            self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.to_reg());
1618                        }
1619                        1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
1620                        _ => unreachable!(),
1621                    }
1622                }
1623            },
1624        }
1625        Ok(())
1626    }
1627
1628    fn atomic_cas(
1629        &mut self,
1630        context: &mut CodeGenContext<Emission>,
1631        addr: Self::Address,
1632        size: OperandSize,
1633        flags: MemFlags,
1634        extend: Option<Extend<Zero>>,
1635    ) -> Result<()> {
1636        // `cmpxchg` expects `expected` to be in the accumulator register
1637        // (`rax`), so reserve `rax` for the expected argument.
1638        let rax = context.reg(regs::rax(), self)?;
1639
1640        let replacement = context.pop_to_reg(self, None)?;
1641
1642        // Mark `rax` as allocatable again.
1643        context.free_reg(rax);
1644        let expected = context.pop_to_reg(self, Some(regs::rax()))?;
1645
1646        self.asm.cmpxchg(
1647            addr,
1648            expected.reg,
1649            replacement.reg,
1650            writable!(expected.reg),
1651            size,
1652            flags,
1653        );
1654
1655        if let Some(extend) = extend {
1656            // Zero-extending from 32 to 64 bits is implicit on x64, so skip it.
1657            if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
1658                self.asm
1659                    .movzx_rr(expected.reg.into(), writable!(expected.reg.into()), extend);
1660            }
1661        }
1662
1663        context.stack.push(expected.into());
1664        context.free_reg(replacement);
1665
1666        Ok(())
1667    }
1668
1669    fn v128_eq(
1670        &mut self,
1671        dst: WritableReg,
1672        lhs: Reg,
1673        rhs: Reg,
1674        kind: VectorEqualityKind,
1675    ) -> Result<()> {
1676        self.ensure_has_avx()?;
1677
1678        match kind {
1679            VectorEqualityKind::I8x16
1680            | VectorEqualityKind::I16x8
1681            | VectorEqualityKind::I32x4
1682            | VectorEqualityKind::I64x2 => {
1683                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
1684            }
1685            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1686                self.asm
1687                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
1688            }
1689        }
1690        Ok(())
1691    }
1692
1693    fn v128_ne(
1694        &mut self,
1695        dst: WritableReg,
1696        lhs: Reg,
1697        rhs: Reg,
1698        kind: VectorEqualityKind,
1699    ) -> Result<()> {
1700        self.ensure_has_avx()?;
1701
1702        match kind {
1703            VectorEqualityKind::I8x16
1704            | VectorEqualityKind::I16x8
1705            | VectorEqualityKind::I32x4
1706            | VectorEqualityKind::I64x2 => {
1707                // Check for equality and invert the results.
1708                self.asm
1709                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1710                self.asm
1711                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1712                self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1713            }
1714            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1715                self.asm
1716                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
1717            }
1718        }
1719        Ok(())
1720    }
1721
1722    fn v128_lt(
1723        &mut self,
1724        dst: WritableReg,
1725        lhs: Reg,
1726        rhs: Reg,
1727        kind: VectorCompareKind,
1728    ) -> Result<()> {
1729        self.ensure_has_avx()?;
1730
1731        match kind {
1732            VectorCompareKind::I8x16S
1733            | VectorCompareKind::I16x8S
1734            | VectorCompareKind::I32x4S
1735            | VectorCompareKind::I64x2S => {
1736                // Perform a greater than check with reversed parameters.
1737                self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
1738            }
1739            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1740                // Set `lhs` to the per-lane unsigned minimum, check for
1741                // equality with `rhs`, then invert the result.
1742                // If `lhs` is smaller, the equality check fails and the
1743                // result is inverted to true. Otherwise the equality check
1744                // passes and is inverted to false.
1745                self.asm
1746                    .xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1747                self.asm
1748                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1749                self.asm
1750                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1751                self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1752            }
1753            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1754                self.asm
1755                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
1756            }
1757        }
1758        Ok(())
1759    }
1760
1761    fn v128_le(
1762        &mut self,
1763        dst: WritableReg,
1764        lhs: Reg,
1765        rhs: Reg,
1766        kind: VectorCompareKind,
1767    ) -> Result<()> {
1768        self.ensure_has_avx()?;
1769
1770        match kind {
1771            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1772                // Set each lane of `rhs` to the signed minimum of the two
1773                // operands and then compare it with `lhs` for equality.
1774                self.asm
1775                    .xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1776                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1777            }
1778            VectorCompareKind::I64x2S => {
1779                // Do a greater than check and invert the results.
1780                self.asm
1781                    .xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1782                self.asm
1783                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1784                self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1785            }
1786            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1787                // Set each lane of `rhs` to the unsigned minimum of the two
1788                // operands and then compare it with `lhs` for equality.
1789                self.asm
1790                    .xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1791                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1792            }
1793            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1794                self.asm
1795                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
1796            }
1797        }
1798        Ok(())
1799    }
1800
1801    fn v128_gt(
1802        &mut self,
1803        dst: WritableReg,
1804        lhs: Reg,
1805        rhs: Reg,
1806        kind: VectorCompareKind,
1807    ) -> Result<()> {
1808        self.ensure_has_avx()?;
1809
1810        match kind {
1811            VectorCompareKind::I8x16S
1812            | VectorCompareKind::I16x8S
1813            | VectorCompareKind::I32x4S
1814            | VectorCompareKind::I64x2S => {
1815                self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
1816            }
1817            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1818                // Set `lhs` to the per-lane unsigned maximum, check for
1819                // equality with `rhs`, then invert the result.
1820                // If `lhs` is larger, the equality check fails and the
1821                // result is inverted to true. Otherwise the equality check
1822                // passes and is inverted to false.
1823                self.asm
1824                    .xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1825                self.asm
1826                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1827                self.asm
1828                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1829                self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1830            }
1831            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1832                // Do a less than comparison with the operands swapped.
1833                self.asm
1834                    .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
1835            }
1836        }
1837        Ok(())
1838    }
1839
1840    fn v128_ge(
1841        &mut self,
1842        dst: WritableReg,
1843        lhs: Reg,
1844        rhs: Reg,
1845        kind: VectorCompareKind,
1846    ) -> Result<()> {
1847        self.ensure_has_avx()?;
1848
1849        match kind {
1850            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1851                // Set each lane of `rhs` to the signed maximum and then compare with `lhs` for equality.
1852                self.asm
1853                    .xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1854                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1855            }
1856            VectorCompareKind::I64x2S => {
1857                // Perform a greater than comparison with operands swapped,
1858                // then invert the results.
1859                self.asm
1860                    .xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
1861                self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
1862                self.asm
1863                    .xmm_vex_rr(AvxOpcode::Vpxor, dst.to_reg(), rhs, dst);
1864            }
1865            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1866                // Set each lane of `rhs` to the unsigned maximum and compare with `lhs` for equality.
1867                self.asm
1868                    .xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1869                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1870            }
1871            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1872                // Perform a less than or equal comparison on swapped operands.
1873                self.asm
1874                    .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
1875            }
1876        }
1877
1878        Ok(())
1879    }
1880
1881    fn fence(&mut self) -> Result<()> {
1882        self.asm.fence(FenceKind::MFence);
1883        Ok(())
1884    }
1885
1886    fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
1887        self.ensure_has_avx()?;
1888
1889        let tmp = regs::scratch_xmm();
1890        // First, initialize `tmp` with all ones by comparing it with itself.
1891        self.asm
1892            .xmm_vex_rr(AvxOpcode::Vpcmpeqd, tmp, tmp, writable!(tmp));
1893        // Then `xor` `tmp` and `dst` together, yielding `!dst`.
1894        self.asm
1895            .xmm_vex_rr(AvxOpcode::Vpxor, tmp, dst.to_reg(), dst);
1896        Ok(())
1897    }
1898
1899    fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1900        self.ensure_has_avx()?;
1901        self.asm.xmm_vex_rr(AvxOpcode::Vpand, src1, src2, dst);
1902        Ok(())
1903    }
1904
1905    fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1906        self.ensure_has_avx()?;
1907        self.asm.xmm_vex_rr(AvxOpcode::Vpandn, src1, src2, dst);
1908        Ok(())
1909    }
1910
1911    fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1912        self.ensure_has_avx()?;
1913        self.asm.xmm_vex_rr(AvxOpcode::Vpor, src1, src2, dst);
1914        Ok(())
1915    }
1916
1917    fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1918        self.ensure_has_avx()?;
1919        self.asm.xmm_vex_rr(AvxOpcode::Vpxor, src1, src2, dst);
1920        Ok(())
1921    }
1922
1923    fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
1924        self.ensure_has_avx()?;
1925        let tmp = regs::scratch_xmm();
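        // Compute `(src1 & mask) | (src2 & !mask)`: bits come from `src1`
        // where `mask` is set and from `src2` where it is clear.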
1926        self.v128_and(src1, mask, writable!(tmp))?;
1927        self.v128_and_not(mask, src2, dst)?;
1928        self.v128_or(dst.to_reg(), tmp, dst)?;
1929
1930        Ok(())
1931    }
1932
1933    fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
1934        self.ensure_has_avx()?;
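        // `vptest` sets ZF only when `src & src` is all zeros, so any non-zero
        // lane clears ZF and `setne` produces 1.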
1935        self.asm.xmm_vptest(src, src);
1936        self.asm.setcc(IntCmpKind::Ne, dst);
1937        Ok(())
1938    }
1939
1940    fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
1941        self.ensure_has_avx()?;
1942        match kind {
1943            V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
1944            V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
1945            V128ConvertKind::I32x4U => {
1946                let scratch = writable!(regs::scratch_xmm());
1947
1948                // Split each 32-bit integer into 16-bit parts.
1949                // `scratch` will contain the low bits and `dst` will contain
1950                // the high bits.
1951                self.asm
1952                    .xmm_vpsll_rr(src, scratch, 0x10, kind.src_lane_size());
1953                self.asm
1954                    .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x10, kind.src_lane_size());
1955                self.asm
1956                    .xmm_vpsub_rrr(src, scratch.to_reg(), dst, kind.src_lane_size());
1957
1958                // Convert the low bits in `scratch` to floating point numbers.
1959                self.asm
1960                    .xmm_vcvt_rr(scratch.to_reg(), scratch, VcvtKind::I32ToF32);
1961
1962                // Halve the high bits first: the conversion is signed, so they must stay in the positive `i32` range.
1963                self.asm
1964                    .xmm_vpsrl_rr(dst.to_reg(), dst, 1, kind.src_lane_size());
1965                // Convert high bits in `dst` to floating point numbers.
1966                self.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
1967                // Double high bits in `dst` to reverse right shift.
1968                self.asm
1969                    .xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
1970                // Add high bits in `dst` to low bits in `scratch`.
1971                self.asm
1972                    .xmm_vaddp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.src_lane_size());
1973            }
1974            V128ConvertKind::I32x4LowU => {
1975                // See
1976                // https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
1977                // for details on the Cranelift AVX implementation.
1978                // Use `vunpcklp` to create doubles from the integers.
1979                // Interleaving 0x1.0p52 (i.e., a high word of 0x43300000)
1980                // with each integer produces the bit pattern of a double
1981                // whose mantissa holds the original integer value.
1982                let conversion_constant = self
1983                    .asm
1984                    .add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
1985                self.asm
1986                    .xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
1987                // Subtract the 0x1.0p52 added above.
1988                let conversion_constant = self.asm.add_constant(&[
1989                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
1990                    0x00, 0x30, 0x43,
1991                ]);
1992                self.asm.xmm_vsub_rrm(
1993                    dst.to_reg(),
1994                    &conversion_constant,
1995                    dst,
1996                    kind.dst_lane_size(),
1997                );
1998            }
1999        }
2000        Ok(())
2001    }
2002
2003    fn v128_narrow(
2004        &mut self,
2005        src1: Reg,
2006        src2: Reg,
2007        dst: WritableReg,
2008        kind: V128NarrowKind,
2009    ) -> Result<()> {
2010        self.ensure_has_avx()?;
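        // `vpackss*` narrows with signed saturation and `vpackus*` narrows to
        // the unsigned range with saturation, matching the wasm `narrow`
        // semantics.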
2011        match kind {
2012            V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
2013                self.asm
2014                    .xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
2015            }
2016            V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
2017                self.asm
2018                    .xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
2019            }
2020        }
2021        Ok(())
2022    }
2023
2024    fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2025        self.ensure_has_avx()?;
2026        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
2027        Ok(())
2028    }
2029
2030    fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2031        self.ensure_has_avx()?;
2032        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
2033        Ok(())
2034    }
2035
2036    fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
2037        self.ensure_has_avx()?;
2038        match kind {
2039            V128ExtendKind::LowI8x16S
2040            | V128ExtendKind::LowI8x16U
2041            | V128ExtendKind::LowI16x8S
2042            | V128ExtendKind::LowI16x8U
2043            | V128ExtendKind::LowI32x4S
2044            | V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
2045            V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
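                // `vpalignr` with an immediate of 8 moves the upper 8 bytes of
                // `src` into the lower half of `dst`; the sign-extending move
                // below then widens those lanes.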
2046                self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
2047                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2048            }
2049            V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
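                // Zero `scratch`, then interleave the high lanes of `src` with
                // zeros so that each of those lanes is zero-extended.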
2050                let scratch = regs::scratch_xmm();
2051                self.asm
2052                    .xmm_vex_rr(AvxOpcode::Vpxor, scratch, scratch, writable!(scratch));
2053                self.asm
2054                    .xmm_vpunpckh_rrr(src, scratch, dst, kind.src_lane_size());
2055            }
2056            V128ExtendKind::HighI32x4S => {
2057                // Move the 3rd element (i.e., 0b10) to the 1st (rightmost)
2058                // position and the 4th element (i.e., 0b11) to the 2nd (second
2059                // from the right) position and then perform the extend.
2060                self.asm
2061                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
2062                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2063            }
2064            V128ExtendKind::HighI32x4U => {
2065                // Set `scratch` to a vector of 0s.
2066                let scratch = regs::scratch_xmm();
2067                self.asm
2068                    .xmm_vxorp_rrr(scratch, scratch, writable!(scratch), kind.src_lane_size());
2069                // Interleave the 0 bits into the two 32-bit integers to zero extend them.
2070                self.asm
2071                    .xmm_vunpckhp_rrr(src, scratch, dst, kind.src_lane_size());
2072            }
2073        }
2074        Ok(())
2075    }
2076
2077    fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
2078        self.ensure_has_avx()?;
2079
2080        let op = match kind {
2081            V128AddKind::F32x4 => AvxOpcode::Vaddps,
2082            V128AddKind::F64x2 => AvxOpcode::Vaddpd,
2083            V128AddKind::I8x16 => AvxOpcode::Vpaddb,
2084            V128AddKind::I8x16SatS => AvxOpcode::Vpaddsb,
2085            V128AddKind::I8x16SatU => AvxOpcode::Vpaddusb,
2086            V128AddKind::I16x8 => AvxOpcode::Vpaddw,
2087            V128AddKind::I16x8SatS => AvxOpcode::Vpaddsw,
2088            V128AddKind::I16x8SatU => AvxOpcode::Vpaddusw,
2089            V128AddKind::I32x4 => AvxOpcode::Vpaddd,
2090            V128AddKind::I64x2 => AvxOpcode::Vpaddq,
2091        };
2092        self.asm.xmm_vex_rr(op, lhs, rhs, dst);
2093        Ok(())
2094    }
2095
2096    fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
2097        self.ensure_has_avx()?;
2098
2099        let op = match kind {
2100            V128SubKind::F32x4 => AvxOpcode::Vsubps,
2101            V128SubKind::F64x2 => AvxOpcode::Vsubpd,
2102            V128SubKind::I8x16 => AvxOpcode::Vpsubb,
2103            V128SubKind::I8x16SatS => AvxOpcode::Vpsubsb,
2104            V128SubKind::I8x16SatU => AvxOpcode::Vpsubusb,
2105            V128SubKind::I16x8 => AvxOpcode::Vpsubw,
2106            V128SubKind::I16x8SatS => AvxOpcode::Vpsubsw,
2107            V128SubKind::I16x8SatU => AvxOpcode::Vpsubusw,
2108            V128SubKind::I32x4 => AvxOpcode::Vpsubd,
2109            V128SubKind::I64x2 => AvxOpcode::Vpsubq,
2110        };
2111        self.asm.xmm_vex_rr(op, lhs, rhs, dst);
2112        Ok(())
2113    }
2114
2115    fn v128_mul(
2116        &mut self,
2117        context: &mut CodeGenContext<Emission>,
2118        kind: V128MulKind,
2119    ) -> Result<()> {
2120        self.ensure_has_avx()?;
2121
2122        let rhs = context.pop_to_reg(self, None)?;
2123        let lhs = context.pop_to_reg(self, None)?;
2124
2125        let mul_avx = |this: &mut Self, op| {
2126            this.asm
2127                .xmm_vex_rr(op, lhs.reg, rhs.reg, writable!(lhs.reg));
2128        };
2129
2130        let mul_i64x2_avx512 = |this: &mut Self| {
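            // AVX-512 (VL + DQ) provides `vpmullq`, a single-instruction
            // lane-wise 64-bit multiply.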
2131            this.asm
2132                .xmm_rm_rvex3(Avx512Opcode::Vpmullq, lhs.reg, rhs.reg, writable!(lhs.reg));
2133        };
2134
2135        let mul_i64x2_fallback =
2136            |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2137                // Standard AVX doesn't have an instruction for i64x2 multiplication, so we fall back
2138                // to an instruction sequence built from 32-bit multiplications (taken from the cranelift
2139                // implementation, in `isa/x64/lower.isle`):
2140                //
2141                // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
2142                // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
2143                // > multiplication can then be written as:
2144                //
2145                // >    Ah Al
2146                // > *  Bh Bl
2147                // >    -----
2148                // >    Al * Bl
2149                // > + (Ah * Bl) << 32
2150                // > + (Al * Bh) << 32
2151                //
2152                // > So for each lane we will compute:
2153                //
2154                // >   A * B  = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
2155                //
2156                // > Note, the algorithm will use `pmuludq` which operates directly on the lower
2157                // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
2158                // > the lane of the destination. For this reason we don't need shifts to isolate
2159                // > the lower 32-bits, however, we will need to use shifts to isolate the high
2160                // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.
2161
2162                let tmp1 = regs::scratch_xmm();
2163                let tmp2 = context.any_fpr(this)?;
2164
2165                // tmp1 = lhs_hi = (lhs >> 32)
2166                this.asm
2167                    .xmm_vex_ri(AvxOpcode::Vpsrlq, lhs.reg, 32, writable!(tmp1));
2168                // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
2169                this.asm
2170                    .xmm_vex_rr(AvxOpcode::Vpmuldq, tmp1, rhs.reg, writable!(tmp2));
2171
2172                // tmp1 = rhs_hi = rhs >> 32
2173                this.asm
2174                    .xmm_vex_ri(AvxOpcode::Vpsrlq, rhs.reg, 32, writable!(tmp1));
2175
2176                // tmp1 = lhs_low * rhs_high = tmp1 * lhs
2177                this.asm
2178                    .xmm_vex_rr(AvxOpcode::Vpmuludq, tmp1, lhs.reg, writable!(tmp1));
2179
2180                // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2181                this.asm
2182                    .xmm_vex_rr(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(tmp1));
2183
2184                // tmp1 = tmp1 << 32
2185                this.asm
2186                    .xmm_vex_ri(AvxOpcode::Vpsllq, tmp1, 32, writable!(tmp1));
2187
2188                // tmp2 = lhs_lo * rhs_lo
2189                this.asm
2190                    .xmm_vex_rr(AvxOpcode::Vpmuludq, lhs.reg, rhs.reg, writable!(tmp2));
2191
2192                // Finally, with `lhs` as the destination:
2193                // lhs = (lhs_lo * rhs_lo) + (((lhs_hi * rhs_lo) + (lhs_lo * rhs_hi)) << 32) = tmp1 + tmp2
2194                this.asm
2195                    .xmm_vex_rr(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(lhs.reg));
2196
2197                context.free_reg(tmp2);
2198
2199                Ok(())
2200            };
2201
2202        match kind {
2203            V128MulKind::F32x4 => mul_avx(self, AvxOpcode::Vmulps),
2204            V128MulKind::F64x2 => mul_avx(self, AvxOpcode::Vmulpd),
2205            V128MulKind::I16x8 => mul_avx(self, AvxOpcode::Vpmullw),
2206            V128MulKind::I32x4 => mul_avx(self, AvxOpcode::Vpmulld),
2207            // This is the fast path when AVX512 is available.
2208            V128MulKind::I64x2
2209                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
2210            {
2211                mul_i64x2_avx512(self)
2212            }
2213            // Otherwise, we emit the AVX fallback sequence.
2214            V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
2215        }
2216
2217        context.stack.push(lhs.into());
2218        context.free_reg(rhs);
2219
2220        Ok(())
2221    }
2222
2223    fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {
2224        self.ensure_has_avx()?;
2225
2226        match kind {
2227            V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {
2228                self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())
2229            }
2230            V128AbsKind::I64x2 => {
2231                let scratch = writable!(regs::scratch_xmm());
2232                // Perform an arithmetic right shift of 31 bits. If the number
2233                // is positive, this will result in all zeroes in the upper
2234                // 32-bits. If the number is negative, this will result in all
2235                // ones in the upper 32-bits.
2236                self.asm.xmm_vpsra_rri(src, scratch, 0x1f, OperandSize::S32);
2237                // Copy the ones and zeroes in the high bits of each 64-bit
2238                // lane to the low bits of each 64-bit lane.
2239                self.asm
2240                    .xmm_vpshuf_rr(scratch.to_reg(), scratch, 0b11_11_01_01, OperandSize::S32);
2241                // Flip the bits in lanes that were negative in `src` and leave
2242                // the positive lanes as they are. Positive lanes will have a
2243                // zero mask in `scratch` so xor doesn't affect them.
2244                self.asm
2245                    .xmm_vex_rr(AvxOpcode::Vpxor, src, scratch.to_reg(), dst);
2246                // Subtract the mask from the results of xor which will
2247                // complete the two's complement for lanes which were negative.
2248                self.asm
2249                    .xmm_vpsub_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
2250            }
2251            V128AbsKind::F32x4 | V128AbsKind::F64x2 => {
2252                let scratch = writable!(regs::scratch_xmm());
2253                // Create a mask of all ones.
2254                self.asm.xmm_vpcmpeq_rrr(
2255                    scratch,
2256                    scratch.to_reg(),
2257                    scratch.to_reg(),
2258                    kind.lane_size(),
2259                );
2260                // Right shift the mask so each lane is a single zero followed
2261                // by all ones.
2262                self.asm
2263                    .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x1, kind.lane_size());
2264                // Use the mask to zero the sign bit in each lane which will
2265                // make the float value positive.
2266                self.asm
2267                    .xmm_vandp_rrr(src, scratch.to_reg(), dst, kind.lane_size());
2268            }
2269        }
2270        Ok(())
2271    }
2272
2273    fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {
2274        self.ensure_has_avx()?;
2275
2276        let tmp = regs::scratch_xmm();
2277        match kind {
2278            V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {
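                // Negate by computing `0 - op`: zero `tmp` with an xor and
                // subtract the operand from it.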
2279                self.v128_xor(tmp, tmp, writable!(tmp))?;
2280                self.v128_sub(tmp, op.to_reg(), op, kind.into())?;
2281            }
2282            V128NegKind::F32x4 | V128NegKind::F64x2 => {
2283                // Create a mask of all 1s.
2284                self.asm
2285                    .xmm_vpcmpeq_rrr(writable!(tmp), tmp, tmp, kind.lane_size());
2286                // Left shift the lanes in the mask so only the sign bit in the
2287                // mask is set to 1.
2288                self.asm.xmm_vpsll_rr(
2289                    tmp,
2290                    writable!(tmp),
2291                    (kind.lane_size().num_bits() - 1) as u32,
2292                    kind.lane_size(),
2293                );
2294                // Use the mask to flip the sign bit.
2295                self.asm
2296                    .xmm_vxorp_rrr(op.to_reg(), tmp, op, kind.lane_size());
2297            }
2298        }
2299        Ok(())
2300    }
2301
2302    fn v128_shift(
2303        &mut self,
2304        context: &mut CodeGenContext<Emission>,
2305        lane_width: OperandSize,
2306        kind: ShiftKind,
2307    ) -> Result<()> {
2308        self.ensure_has_avx()?;
2309        let shift_amount = context.pop_to_reg(self, None)?.reg;
2310        let operand = context.pop_to_reg(self, None)?.reg;
2311
2312        let tmp_xmm = regs::scratch_xmm();
2313        let tmp = regs::scratch();
2314        let amount_mask = lane_width.num_bits() - 1;
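        // Wasm defines the shift amount modulo the lane width in bits, so mask
        // it down before use.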
2315        self.and(
2316            writable!(shift_amount),
2317            shift_amount,
2318            RegImm::i32(amount_mask as i32),
2319            OperandSize::S32,
2320        )?;
2321
2322        let shl_normal = |this: &mut Self, op: AvxOpcode| {
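            // The vector shift instructions take their count from the low
            // 64 bits of an XMM register, so move the masked amount out of the
            // GPR first. This helper is also reused for the plain right shifts
            // below.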
2323            this.asm
2324                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);
2325            this.asm
2326                .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand));
2327        };
2328
2329        let shift_i8x16 = |this: &mut Self, masks: &'static [u8], op: AvxOpcode| {
2330            // The i8x16 case is a little trickier because x64 doesn't provide an 8-bit
2331            // shift instruction. Instead, we shift as 16-bit lanes and then mask off the bits
2332            // that migrated into each 8-bit lane, for example (with two 8-bit lanes):
2333            // - Before shifting:
2334            // 01001101 11101110
2335            // - shifting left by 2:
2336            // 00110111 10111000
2337            //       ^^_ these bits came from the neighboring byte and need to be masked off.
2338            // - The mask (the same mask byte is applied to every lane):
2339            // 11111100 11111100
2340            // - After masking:
2341            // 00110100 10111000
2342            //
2343            // The mask is loaded from a constant table (`I8X16_ISHL_MASKS`/`I8X16_USHR_MASKS`), indexed by the shift amount.
2344
2345            this.asm
2346                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);
2347
2348            // Perform the 16-bit shift.
2349            this.asm
2350                .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand));
2351
2352            // Get a handle to the masks array constant.
2353            let masks_addr = this.asm.add_constant(masks);
2354
2355            // Load the masks array effective address into the tmp register.
2356            this.asm.lea(&masks_addr, writable!(tmp), OperandSize::S64);
2357
2358            // Compute the offset of the mask that we need to use. This is shift_amount * 16 ==
2359            // shift_amount << 4.
2360            this.asm
2361                .shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);
2362
2363            // Load the mask to tmp_xmm.
2364            this.asm.xmm_vmovdqu_mr(
2365                &Address::ImmRegRegShift {
2366                    simm32: 0,
2367                    base: tmp,
2368                    index: shift_amount,
2369                    shift: 0,
2370                },
2371                writable!(tmp_xmm),
2372                MemFlags::trusted(),
2373            );
2374
2375            // Mask unwanted bits from operand.
2376            this.asm
2377                .xmm_vex_rr(AvxOpcode::Vpand, tmp_xmm, operand, writable!(operand));
2378        };
2379
2380        let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2381            const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;
2382
2383            // AVX doesn't have an instruction for i64x2 signed right shift. Instead, we use the
2384            // following formula (from Hacker's Delight, section 2-7), where x is the value and n
2385            // the shift amount, for each lane:
2386            // t = (1 << 63) >> n; ((x >> n) ^ t) - t
2387
2388            // We need an extra scratch register.
2389            let tmp_xmm2 = context.any_fpr(this)?;
2390
2391            this.asm
2392                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);
2393
2394            let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());
2395
2396            this.asm
2397                .xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
2398            this.asm
2399                .xmm_vex_rr(AvxOpcode::Vpsrlq, tmp_xmm2, tmp_xmm, writable!(tmp_xmm2));
2400            this.asm
2401                .xmm_vex_rr(AvxOpcode::Vpsrlq, operand, tmp_xmm, writable!(operand));
2402            this.asm
2403                .xmm_vex_rr(AvxOpcode::Vpxor, operand, tmp_xmm2, writable!(operand));
2404            this.asm
2405                .xmm_vex_rr(AvxOpcode::Vpsubq, operand, tmp_xmm2, writable!(operand));
2406
2407            context.free_reg(tmp_xmm2);
2408
2409            Ok(())
2410        };
2411
2412        let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2413            // Since the x86 instruction set does not have an 8x16 shift instruction and the
2414            // approach used for `ishl` and `ushr` cannot be easily used (the masks do not
2415            // preserve the sign), we use a different approach here: separate the low and
2416            // high lanes, shift them separately, and merge them into the final result.
2417            //
2418            // Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
2419            // s15]`:
2420            //
2421            //   lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
2422            //   shifted_lo.i16x8 = shift each lane of `low`
2423            //   hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
2424            //   shifted_hi.i16x8 = shift each lane of `high`
2425            //   result = [s0'', s1'', ..., s15'']
2426
2427            // In order for `packsswb` later to only use the high byte of each
2428            // 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
2429            // fill in the upper bits appropriately.
2430            this.asm
2431                .add_ir(8, writable!(shift_amount), OperandSize::S32);
2432            this.asm
2433                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);
2434
2435            let tmp_lo = context.any_fpr(this)?;
2436            let tmp_hi = context.any_fpr(this)?;
2437
2438            // Extract lower and upper bytes.
2439            this.asm
2440                .xmm_vex_rr(AvxOpcode::Vpunpcklbw, operand, operand, writable!(tmp_lo));
2441            this.asm
2442                .xmm_vex_rr(AvxOpcode::Vpunpckhbw, operand, operand, writable!(tmp_hi));
2443
2444            // Perform 16bit right shift of upper and lower bytes.
2445            this.asm
2446                .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_lo, tmp_xmm, writable!(tmp_lo));
2447            this.asm
2448                .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_hi, tmp_xmm, writable!(tmp_hi));
2449
2450            // Merge lower and upper bytes back.
2451            this.asm
2452                .xmm_vex_rr(AvxOpcode::Vpacksswb, tmp_lo, tmp_hi, writable!(operand));
2453
2454            context.free_reg(tmp_lo);
2455            context.free_reg(tmp_hi);
2456
2457            Ok(())
2458        };
2459
2460        match (lane_width, kind) {
2461            // shl
2462            (OperandSize::S8, ShiftKind::Shl) => {
2463                shift_i8x16(self, &I8X16_ISHL_MASKS, AvxOpcode::Vpsllw)
2464            }
2465            (OperandSize::S16, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllw),
2466            (OperandSize::S32, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpslld),
2467            (OperandSize::S64, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllq),
2468            // shr_u
2469            (OperandSize::S8, ShiftKind::ShrU) => {
2470                shift_i8x16(self, &I8X16_USHR_MASKS, AvxOpcode::Vpsrlw)
2471            }
2472            (OperandSize::S16, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlw),
2473            (OperandSize::S32, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrld),
2474            (OperandSize::S64, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlq),
2475            // shr_s
2476            (OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,
2477            (OperandSize::S16, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsraw),
2478            (OperandSize::S32, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsrad),
2479            (OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,
2480
2481            _ => bail!(CodeGenError::invalid_operand_combination()),
2482        }
2483
2484        context.free_reg(shift_amount);
2485        context
2486            .stack
2487            .push(TypedReg::new(WasmValType::V128, operand).into());
2488        Ok(())
2489    }
2490
2491    fn v128_q15mulr_sat_s(
2492        &mut self,
2493        lhs: Reg,
2494        rhs: Reg,
2495        dst: WritableReg,
2496        size: OperandSize,
2497    ) -> Result<()> {
2498        self.ensure_has_avx()?;
2499
2500        self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);
2501
2502        // We need to handle the edge case of multiplying -1 by -1 (0x8000 in
2503        // Q15 format) because of how `vpmulhrs` handles rounding. `vpmulhrs`
2504        // produces 0x8000 in that case when the correct (saturated) result is
2505        // 0x7FFF, so we check whether the result is 0x8000 and flip its bits
2506        // if it is.
2507        let address = self.asm.add_constant(&[
2508            0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
2509            0x00, 0x80,
2510        ]);
2511        self.asm
2512            .xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);
2513        self.asm
2514            .xmm_vex_rr(AvxOpcode::Vpxor, dst.to_reg(), rhs, dst);
2515        Ok(())
2516    }
2517
2518    fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2519        self.ensure_has_avx()?;
2520
2521        let scratch = regs::scratch_xmm();
2522        // Create a mask of all 0s.
2523        self.asm
2524            .xmm_vex_rr(AvxOpcode::Vpxor, scratch, scratch, writable!(scratch));
2525        // Overwrite each lane of `src` with all ones if the lane was zero,
2526        // and with zero if the lane was non-zero.
2527        self.asm.xmm_vpcmpeq_rrr(writable!(src), src, scratch, size);
2528        // Sets ZF if all values are zero (i.e., if all original values were not zero).
2529        self.asm.xmm_vptest(src, src);
2530        // Set byte if ZF=1.
2531        self.asm.setcc(IntCmpKind::Eq, dst);
2532        Ok(())
2533    }
2534
2535    fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2536        self.ensure_has_avx()?;
2537
2538        match size {
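            // For i8x16, `vpmovmskb` gathers the most significant bit of each
            // byte directly into the destination GPR.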
2539            OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),
2540            OperandSize::S16 => {
2541                // Narrow the 16-bit lanes to 8 bits with signed saturation.
2542                self.asm
2543                    .xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);
2544                // Creates a mask from each byte in `src`.
2545                self.asm
2546                    .xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);
2547                // Shift out the 8 duplicate mask bits introduced by packing `src` with itself.
2548                self.asm
2549                    .shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);
2550            }
2551            OperandSize::S32 | OperandSize::S64 => self.asm.xmm_vmovskp_rr(src, dst, size, size),
2552            _ => unimplemented!(),
2553        }
2554
2555        Ok(())
2556    }
2557
2558    fn v128_trunc(
2559        &mut self,
2560        context: &mut CodeGenContext<Emission>,
2561        kind: V128TruncKind,
2562    ) -> Result<()> {
2563        self.ensure_has_avx()?;
2564
2565        let reg = writable!(context.pop_to_reg(self, None)?.reg);
2566        match kind {
2567            V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(
2568                reg.to_reg(),
2569                reg,
2570                VroundMode::TowardZero,
2571                kind.dst_lane_size(),
2572            ),
2573            V128TruncKind::I32x4FromF32x4S => {
2574                self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size());
2575            }
2576            V128TruncKind::I32x4FromF32x4U => {
2577                let temp_reg = writable!(context.any_fpr(self)?);
2578                self.v128_trunc_sat_f32x4_u(
2579                    reg,
2580                    temp_reg,
2581                    kind.src_lane_size(),
2582                    kind.dst_lane_size(),
2583                );
2584                context.free_reg(temp_reg.to_reg());
2585            }
2586            V128TruncKind::I32x4FromF64x2SZero => {
2587                self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size());
2588            }
2589            V128TruncKind::I32x4FromF64x2UZero => {
2590                self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size());
2591            }
2592        }
2593
2594        context.stack.push(TypedReg::v128(reg.to_reg()).into());
2595        Ok(())
2596    }
2597
2598    fn v128_min(
2599        &mut self,
2600        src1: Reg,
2601        src2: Reg,
2602        dst: WritableReg,
2603        kind: V128MinKind,
2604    ) -> Result<()> {
2605        self.ensure_has_avx()?;
2606
2607        match kind {
2608            V128MinKind::I8x16S
2609            | V128MinKind::I8x16U
2610            | V128MinKind::I16x8S
2611            | V128MinKind::I16x8U
2612            | V128MinKind::I32x4S
2613            | V128MinKind::I32x4U => {
2614                let op = match kind {
2615                    V128MinKind::I8x16S => AvxOpcode::Vpminsb,
2616                    V128MinKind::I8x16U => AvxOpcode::Vpminub,
2617                    V128MinKind::I16x8S => AvxOpcode::Vpminsw,
2618                    V128MinKind::I16x8U => AvxOpcode::Vpminuw,
2619                    V128MinKind::I32x4S => AvxOpcode::Vpminsd,
2620                    V128MinKind::I32x4U => AvxOpcode::Vpminud,
2621                    _ => unreachable!(),
2622                };
2623                self.asm.xmm_vex_rr(op, src1, src2, dst);
2624            }
2625            V128MinKind::F32x4 | V128MinKind::F64x2 => {
2626                // `vminp` is not commutative for +0/-0 and NaN inputs, so we
2627                // have to compensate.
2628                let scratch = writable!(regs::scratch_xmm());
2629                // Perform two comparison operations with the operands swapped
2630                // and OR the result to propagate 0 (positive and negative) and
2631                // NaN.
2632                self.asm
2633                    .xmm_vminp_rrr(src1, src2, scratch, kind.lane_size());
2634                self.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());
2635                // Use a single OR instruction to set the sign bit if either
2636                // result has the sign bit set to correctly propagate -0.
2637                self.asm
2638                    .xmm_vorp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
2639                // Set lanes with NaN to all 1s.
2640                self.asm.xmm_vcmpp_rrr(
2641                    writable!(src2),
2642                    src2,
2643                    dst.to_reg(),
2644                    kind.lane_size(),
2645                    VcmpKind::Unord,
2646                );
2647                // Doesn't change non-NaN values. For NaN values, sets all bits.
2648                self.asm
2649                    .xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2650                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2651            }
2652        }
2653
2654        Ok(())
2655    }
2656
2657    fn v128_max(
2658        &mut self,
2659        src1: Reg,
2660        src2: Reg,
2661        dst: WritableReg,
2662        kind: V128MaxKind,
2663    ) -> Result<()> {
2664        self.ensure_has_avx()?;
2665
2666        match kind {
2667            V128MaxKind::I8x16S
2668            | V128MaxKind::I8x16U
2669            | V128MaxKind::I16x8S
2670            | V128MaxKind::I16x8U
2671            | V128MaxKind::I32x4S
2672            | V128MaxKind::I32x4U => {
2673                let op = match kind {
2674                    V128MaxKind::I8x16S => AvxOpcode::Vpmaxsb,
2675                    V128MaxKind::I8x16U => AvxOpcode::Vpmaxub,
2676                    V128MaxKind::I16x8S => AvxOpcode::Vpmaxsw,
2677                    V128MaxKind::I16x8U => AvxOpcode::Vpmaxuw,
2678                    V128MaxKind::I32x4S => AvxOpcode::Vpmaxsd,
2679                    V128MaxKind::I32x4U => AvxOpcode::Vpmaxud,
2680                    _ => unreachable!(),
2681                };
2682                self.asm.xmm_vex_rr(op, src1, src2, dst);
2683            }
2684            V128MaxKind::F32x4 | V128MaxKind::F64x2 => {
2685                // `vmaxp` is not commutative for +0/-0 and NaN inputs, so we
2686                // have to compensate.
2687                let scratch = writable!(regs::scratch_xmm());
2688                // Perform two comparison operations with the operands swapped
2689                // so we can propagate 0 (positive and negative) and NaNs
2690                // correctly.
2691                self.asm
2692                    .xmm_vmaxp_rrr(src1, src2, scratch, kind.lane_size());
2693                self.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());
2694                // This combination of XOR, OR, and SUB will set the sign bit
2695                // on a 0 result to the correct value for a max operation.
2696                self.asm
2697                    .xmm_vxorp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
2698                self.asm.xmm_vorp_rrr(
2699                    dst.to_reg(),
2700                    scratch.to_reg(),
2701                    writable!(src2),
2702                    kind.lane_size(),
2703                );
2704                self.asm
2705                    .xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2706                // Set lanes with NaN values to all 1s.
2707                self.asm.xmm_vcmpp_rrr(
2708                    writable!(src2),
2709                    src2,
2710                    src2,
2711                    kind.lane_size(),
2712                    VcmpKind::Unord,
2713                );
2714                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2715            }
2716        }
2717        Ok(())
2718    }
2719
2720    fn v128_extmul(
2721        &mut self,
2722        context: &mut CodeGenContext<Emission>,
2723        kind: V128ExtMulKind,
2724    ) -> Result<()> {
2725        self.ensure_has_avx()?;
2726
2727        // The implementation for extmul is not optimized; for simplicity's sake, we simply perform
2728        // an extension followed by a multiplication using already implemented primitives.
2729
2730        let src1 = context.pop_to_reg(self, None)?;
2731        let src2 = context.pop_to_reg(self, None)?;
2732
2733        let ext_kind = kind.into();
2734        self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
2735        self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;
2736
2737        context.stack.push(src2.into());
2738        context.stack.push(src1.into());
2739
2740        self.v128_mul(context, kind.into())
2741    }
2742
2743    fn v128_extadd_pairwise(
2744        &mut self,
2745        src: Reg,
2746        dst: WritableReg,
2747        kind: V128ExtAddKind,
2748    ) -> Result<()> {
2749        self.ensure_has_avx()?;
2750
2751        match kind {
2752            V128ExtAddKind::I8x16S => {
2753                let scratch = regs::scratch_xmm();
2754                // Use `vpmaddubsw` with a vector of 16 8-bit 1s: each signed byte of
2755                // `src` is multiplied by 1 and adjacent products are summed into words,
2756                // effectively a sign-extending pairwise add. The constant must be the
2757                // first operand since it is treated as unsigned and `src` as signed.
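                // E.g. with the all-ones vector as the unsigned operand, each
                // result word is s[2i] + s[2i + 1]: the bytes [-1, 3] produce
                // 1 * (-1) + 1 * 3 = 2 in the corresponding 16-bit lane.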
2758                let mask = self.asm.add_constant(&[1; 16]);
2759                self.asm.xmm_mov_mr(
2760                    &mask,
2761                    writable!(scratch),
2762                    OperandSize::S128,
2763                    MemFlags::trusted(),
2764                );
2765                self.asm
2766                    .xmm_vex_rr(AvxOpcode::Vpmaddubsw, scratch, src, dst);
2767            }
2768            V128ExtAddKind::I8x16U => {
2769                // Same approach as the signed variant but treat `src` as
2770                // unsigned instead of signed by passing it as the first
2771                // operand.
2772                let mask = self.asm.add_constant(&[1; 16]);
2773                self.asm
2774                    .xmm_vpmaddubs_rmr(src, &mask, dst, OperandSize::S16);
2775            }
2776            V128ExtAddKind::I16x8S => {
2777                // Similar approach to the two variants above. The vector is 8
2778                // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
2779                // signed.
2780                let mask = self
2781                    .asm
2782                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
2783                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
2784            }
2785            V128ExtAddKind::I16x8U => {
2786                // Similar approach to the signed variant.
2787                // `vpmaddwd` operates on signed integers and the operand is
2788                // unsigned, so the operand needs to be converted to a signed
2789                // format and then that process needs to be reversed after
2790                // `vpmaddwd`.
2791                // Flip the sign bit for 8 16-bit lanes.
2792                let xor_mask = self.asm.add_constant(&[
2793                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
2794                    0x80, 0x00, 0x80,
2795                ]);
2796                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
2797
2798                let madd_mask = self
2799                    .asm
2800                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
2801                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
2802
2803                // Reverse the XOR. The XOR effectively subtracted 32,768 from each
2804                // of the two 16-bit lanes that were added together, so 65,536
2805                // (0x10000) needs to be added back to each of the 4 32-bit lanes.
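                // E.g. for unsigned lanes a = 65,535 and b = 1: after the XOR the
                // signed words are 32,767 and -32,767, `vpmaddwd` sums them to 0,
                // and adding 0x10000 restores a + b = 65,536 in the 32-bit lane.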
2806                let add_mask = self
2807                    .asm
2808                    .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
2809                self.asm
2810                    .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
2811            }
2812        }
2813        Ok(())
2814    }
2815
2816    fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
2817        self.ensure_has_avx()?;
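        // `vpmaddwd` computes dst[i] = a[2i] * b[2i] + a[2i + 1] * b[2i + 1] on
        // signed 16-bit lanes with 32-bit results, which matches the semantics of
        // `i32x4.dot_i16x8_s`, so a single instruction suffices.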
2818        self.asm.xmm_vex_rr(AvxOpcode::Vpmaddwd, lhs, rhs, dst);
2819        Ok(())
2820    }
2821
2822    fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
2823        self.ensure_has_avx()?;
2824
2825        let reg = writable!(context.pop_to_reg(self, None)?.reg);
2826        let scratch = writable!(regs::scratch_xmm());
2827
2828        // This works by using a lookup table to determine the count of bits
2829        // set in the upper 4 bits and lower 4 bits separately and then adding
2830        // the counts.
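        // E.g. for the byte 0xB7 (0b1011_0111): the table yields 3 for the high
        // nibble (0xB) and 3 for the low nibble (0x7), and 3 + 3 = 6 set bits.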
2831
2832        // A mask to zero out the upper 4 bits in each lane.
2833        let address = self.asm.add_constant(&[
2834            0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
2835            0x0F, 0x0F,
2836        ]);
2837        // Extract the low 4 bits of each lane into the scratch register.
2838        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
2839        // Right shift bytes in input by 4 bits to put the upper 4 bits in the
2840        // lower 4 bits. A 16-bit shift is used since x86 has no 8-bit vector shift.
2841        self.asm
2842            .xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
2843        // Zero out the upper 4 bits of each shifted lane.
2844        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
2845
2846        // Write a lookup table mapping each 4-bit value to its number of set bits
2847        // into a register so we only perform the memory read once.
2848        // Index (hex) | Value (binary) | Population Count
2849        // 0x0         | 0000          | 0
2850        // 0x1         | 0001          | 1
2851        // 0x2         | 0010          | 1
2852        // 0x3         | 0011          | 2
2853        // 0x4         | 0100          | 1
2854        // 0x5         | 0101          | 2
2855        // 0x6         | 0110          | 2
2856        // 0x7         | 0111          | 3
2857        // 0x8         | 1000          | 1
2858        // 0x9         | 1001          | 2
2859        // 0xA         | 1010          | 2
2860        // 0xB         | 1011          | 3
2861        // 0xC         | 1100          | 2
2862        // 0xD         | 1101          | 3
2863        // 0xE         | 1110          | 3
2864        // 0xF         | 1111          | 4
2865        let address = self.asm.add_constant(&[
2866            0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
2867        ]);
2868        let reg2 = writable!(context.any_fpr(self)?);
2869        self.asm
2870            .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
2871        // Use the upper 4 bits as an index into the lookup table.
2872        self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
2873        // Use the lower 4 bits as an index into the lookup table.
2874        self.asm
2875            .xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg());
2876        context.free_reg(reg2.to_reg());
2877
2878        // Add the counts of the upper 4 bits and the lower 4 bits to get the
2879        // total number of bits set.
2880        self.asm
2881            .xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8);
2882
2883        context.stack.push(TypedReg::v128(reg.to_reg()).into());
2884        Ok(())
2885    }
2886
2887    fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2888        self.ensure_has_avx()?;
2889        self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
2890        Ok(())
2891    }
2892
2893    fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2894        self.ensure_has_avx()?;
2895        self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);
2896        Ok(())
2897    }
2898
2899    fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2900        self.ensure_has_avx()?;
2901        self.asm.xmm_vsqrtp_rr(src, dst, size);
2902        Ok(())
2903    }
2904
2905    fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2906        self.ensure_has_avx()?;
2907        self.asm
2908            .xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);
2909        Ok(())
2910    }
2911
2912    fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2913        self.ensure_has_avx()?;
2914        self.asm
2915            .xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);
2916        Ok(())
2917    }
2918
2919    fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2920        self.ensure_has_avx()?;
2921        self.asm
2922            .xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);
2923        Ok(())
2924    }
2925
2926    fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2927        self.ensure_has_avx()?;
2928        // Reverse operands since Wasm specifies returning the first operand if
2929        // either operand is NaN while x86 returns the second operand.
2930        self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);
2931        Ok(())
2932    }
2933
2934    fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2935        self.ensure_has_avx()?;
2936        // Reverse operands since Wasm specifies returning the first operand if
2937        // either operand is NaN while x86 returns the second operand.
2938        self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);
2939        Ok(())
2940    }
2941}
2942
2943impl MacroAssembler {
2944    /// Create an x64 MacroAssembler.
2945    pub fn new(
2946        ptr_size: impl PtrSize,
2947        shared_flags: settings::Flags,
2948        isa_flags: x64_settings::Flags,
2949    ) -> Result<Self> {
2950        let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size()).into();
2951
2952        Ok(Self {
2953            sp_offset: 0,
2954            sp_max: 0,
2955            stack_max_use_add: None,
2956            asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),
2957            flags: isa_flags,
2958            shared_flags,
2959            ptr_size: ptr_type.try_into()?,
2960        })
2961    }
2962
2963    /// Emit an add-with-immediate of the maximum stack usage to a register,
2964    /// recording an obligation to patch the immediate with the real stack
2965    /// maximum when the masm is finalized.
2966    fn add_stack_max(&mut self, reg: Reg) {
2967        assert!(self.stack_max_use_add.is_none());
2968        let patch = PatchableAddToReg::new(reg, OperandSize::S64, self.asm.buffer_mut());
2969        self.stack_max_use_add.replace(patch);
2970    }
2971
2972    fn ensure_has_avx(&self) -> Result<()> {
2973        anyhow::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);
2974        Ok(())
2975    }
2976
2977    fn ensure_has_avx2(&self) -> Result<()> {
2978        anyhow::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);
2979        Ok(())
2980    }
2981
2982    fn ensure_has_avx512vl(&self) -> Result<()> {
2983        anyhow::ensure!(
2984            self.flags.has_avx512vl(),
2985            CodeGenError::UnimplementedForNoAvx512VL
2986        );
2987        Ok(())
2988    }
2989
2990    fn ensure_has_avx512dq(&self) -> Result<()> {
2991        anyhow::ensure!(
2992            self.flags.has_avx512dq(),
2993            CodeGenError::UnimplementedForNoAvx512DQ
2994        );
2995        Ok(())
2996    }
2997
2998    fn increment_sp(&mut self, bytes: u32) {
2999        self.sp_offset += bytes;
3000
3001        // NOTE: we use `max` here to track the largest stack allocation in `sp_max`. Once we have
3002        // seen the entire function, this value will represent the maximum size for the stack
3003        // frame.
3004        self.sp_max = self.sp_max.max(self.sp_offset);
3005    }
3006
3007    fn decrement_sp(&mut self, bytes: u32) {
3008        assert!(
3009            self.sp_offset >= bytes,
3010            "sp offset = {}; bytes = {}",
3011            self.sp_offset,
3012            bytes
3013        );
3014        self.sp_offset -= bytes;
3015    }
3016
3017    fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {
3018        match constant {
3019            I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),
3020            I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),
3021            _ => Err(anyhow!(CodeGenError::unsupported_imm())),
3022        }
3023    }
3024
3025    /// A common implementation for stack loads; integer loads are zero-extended.
3026    fn load_impl(
3027        &mut self,
3028        src: Address,
3029        dst: WritableReg,
3030        size: OperandSize,
3031        flags: MemFlags,
3032    ) -> Result<()> {
3033        if dst.to_reg().is_int() {
3034            let ext = size.extend_to::<Zero>(OperandSize::S64);
3035            self.asm.movzx_mr(&src, dst, ext, flags);
3036        } else {
3037            self.asm.xmm_mov_mr(&src, dst, size, flags);
3038        }
3039
3040        Ok(())
3041    }
3042
3043    /// A common implementation for stack stores.
3044    fn store_impl(
3045        &mut self,
3046        src: RegImm,
3047        dst: Address,
3048        size: OperandSize,
3049        flags: MemFlags,
3050    ) -> Result<()> {
3051        let _ = match src {
3052            RegImm::Imm(imm) => match imm {
3053                I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),
3054                I::I64(v) => match v.try_into() {
3055                    Ok(v) => self.asm.mov_im(v, &dst, size, flags),
3056                    Err(_) => {
3057                        // If the immediate doesn't fit in a sign-extended 32-bit
3058                        // value, move it through a scratch register.
3059                        let scratch = regs::scratch();
3060                        self.asm.mov_ir(v, writable!(scratch), size);
3061                        self.asm.mov_rm(scratch, &dst, size, flags);
3062                    }
3063                },
3064                I::F32(v) => {
3065                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3066                    let float_scratch = regs::scratch_xmm();
3067                    // Always trusted, since we are loading the constant from
3068                    // the constant pool.
3069                    self.asm
3070                        .xmm_mov_mr(&addr, writable!(float_scratch), size, MemFlags::trusted());
3071                    self.asm.xmm_mov_rm(float_scratch, &dst, size, flags);
3072                }
3073                I::F64(v) => {
3074                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3075                    let float_scratch = regs::scratch_xmm();
3076                    // Similar to above, always trusted since we are loading the
3077                    // constant from the constant pool.
3078                    self.asm
3079                        .xmm_mov_mr(&addr, writable!(float_scratch), size, MemFlags::trusted());
3080                    self.asm.xmm_mov_rm(float_scratch, &dst, size, flags);
3081                }
3082                I::V128(v) => {
3083                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3084                    let vector_scratch = regs::scratch_xmm();
3085                    // Always trusted, since we are loading the constant from
3086                    // the constant pool.
3087                    self.asm.xmm_mov_mr(
3088                        &addr,
3089                        writable!(vector_scratch),
3090                        size,
3091                        MemFlags::trusted(),
3092                    );
3093                    self.asm.xmm_mov_rm(vector_scratch, &dst, size, flags);
3094                }
3095            },
3096            RegImm::Reg(reg) => {
3097                if reg.is_int() {
3098                    self.asm.mov_rm(reg, &dst, size, flags);
3099                } else {
3100                    self.asm.xmm_mov_rm(reg, &dst, size, flags);
3101                }
3102            }
3103        };
3104        Ok(())
3105    }
3106
3107    fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {
3108        if dst != lhs {
3109            Err(anyhow!(CodeGenError::invalid_two_arg_form()))
3110        } else {
3111            Ok(())
3112        }
3113    }
3114
3115    /// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
3116    fn vpshuf_mask_for_64_bit_splats() -> u8 {
3117        // Selects the first doubleword for destination positions 0 and 2 and the
3118        // second doubleword for positions 1 and 3, duplicating the low 8 bytes:
3119        // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
3120        // [d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7].
3121        0b01_00_01_00
3122    }
3123
3124    fn v128_trunc_sat_f32x4_s(
3125        &mut self,
3126        reg: WritableReg,
3127        src_lane_size: OperandSize,
3128        dst_lane_size: OperandSize,
3129    ) {
3130        let scratch = writable!(regs::scratch_xmm());
3131        // Create a mask to handle NaN values (1 for not NaN, 0 for
3132        // NaN).
3133        self.asm.xmm_vcmpp_rrr(
3134            scratch,
3135            reg.to_reg(),
3136            reg.to_reg(),
3137            src_lane_size,
3138            VcmpKind::Eq,
3139        );
3140        // Zero out any NaN values.
3141        self.asm
3142            .xmm_vandp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
3143        // Create a mask for the sign bits.
3144        self.asm
3145            .xmm_vex_rr(AvxOpcode::Vpxor, scratch.to_reg(), reg.to_reg(), scratch);
3146        // Convert floats to integers.
3147        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3148        // Apply sign mask to the converted integers.
3149        self.asm
3150            .xmm_vex_rr(AvxOpcode::Vpand, reg.to_reg(), scratch.to_reg(), scratch);
3151        // Create a saturation mask of all 1s for negative numbers,
3152        // all 0s for positive numbers. The arithmetic shift right by 31
3153        // copies the sign bit across the whole lane.
3154        self.asm
3155            .xmm_vpsra_rri(scratch.to_reg(), scratch, 0x1F, dst_lane_size);
3156        // Combine converted integers with saturation mask.
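        // E.g. a lane holding 3.0e9 overflows i32: the truncating conversion yields
        // the integer-indefinite value 0x8000_0000 and, since the input was
        // positive, the mask is all 1s, so the XOR produces 0x7FFF_FFFF (i32::MAX).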
3157        self.asm
3158            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), scratch.to_reg(), reg);
3159    }
3160
3161    fn v128_trunc_sat_f32x4_u(
3162        &mut self,
3163        reg: WritableReg,
3164        temp_reg: WritableReg,
3165        src_lane_size: OperandSize,
3166        dst_lane_size: OperandSize,
3167    ) {
3168        let scratch = writable!(regs::scratch_xmm());
3169        // Set scratch to all zeros.
3170        self.asm
3171            .xmm_vxorp_rrr(reg.to_reg(), reg.to_reg(), scratch, src_lane_size);
3172        // Clamp negative numbers to 0.
3173        self.asm
3174            .xmm_vmaxp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
3175        // Create a vector of all 1s.
3176        self.asm
3177            .xmm_vpcmpeq_rrr(scratch, scratch.to_reg(), scratch.to_reg(), src_lane_size);
3178        // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
3179        // performing a logical shift right.
3180        self.asm
3181            .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x1, src_lane_size);
3182        // Convert max signed int to float as a reference point for saturation.
3183        self.asm
3184            .xmm_vcvt_rr(scratch.to_reg(), scratch, VcvtKind::I32ToF32);
3185        // Convert the floats to integers and put the results in `temp_reg`.
3186        // This conversion is signed, not unsigned, so values that only fit in
3187        // the unsigned range (high bit set) still need to be handled below.
3188        self.asm
3189            .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
3190        // Set `reg` lanes to the amount that the value in the lane
3191        // exceeds the maximum signed 32-bit integer.
3192        self.asm
3193            .xmm_vsub_rrr(reg.to_reg(), scratch.to_reg(), reg, dst_lane_size);
3194        // Create mask in `scratch` for numbers that are larger than
3195        // the maximum signed 32-bit integer. Lanes that don't fit
3196        // in 32-bits ints will be 1.
3197        self.asm.xmm_vcmpp_rrr(
3198            scratch,
3199            scratch.to_reg(),
3200            reg.to_reg(),
3201            dst_lane_size,
3202            VcmpKind::Le,
3203        );
3204        // Convert the excess over signed 32-bits from floats to integers.
3205        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3206        // Apply large number mask to excess values which will flip the
3207        // bits in any lanes that exceed signed 32-bits. Adding this
3208        // flipped value to the signed value will set the high bit and
3209        // the carry behavior will update the other bits correctly.
3210        self.asm
3211            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), scratch.to_reg(), scratch);
3212        // Set `reg` to all 0s.
3213        self.asm
3214            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), reg.to_reg(), reg);
3215        // Ensure excess values are not negative by taking the max between the
3216        // excess values and zero.
3217        self.asm
3218            .xmm_vpmaxs_rrr(reg, scratch.to_reg(), reg.to_reg(), dst_lane_size);
3219        // Perform the addition between the signed conversion value (in
3220        // `temp_reg`) and the flipped excess value (in `reg`) to get the
3221        // unsigned value.
3222        self.asm
3223            .xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);
3224    }
3225
3226    fn v128_trunc_sat_f64x2_s_zero(&mut self, reg: WritableReg, src_lane_size: OperandSize) {
3227        let scratch = writable!(regs::scratch_xmm());
3228        // Create a NaN mask (1s for non-NaN, 0s for NaN).
3229        self.asm.xmm_vcmpp_rrr(
3230            scratch,
3231            reg.to_reg(),
3232            reg.to_reg(),
3233            src_lane_size,
3234            VcmpKind::Eq,
3235        );
3236        // Build the clamp value: 2147483647.0 (0x41DFFFFFFFC00000, i32::MAX as
3237        // f64) for non-NaN lanes and 0.0 for NaN lanes.
3238        let address = self.asm.add_constant(&[
3239            0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,
3240            0xDF, 0x41,
3241        ]);
3242        self.asm
3243            .xmm_vandp_rrm(scratch.to_reg(), &address, scratch, src_lane_size);
3244        // Handle the saturation for values too large to fit in an i32.
3245        self.asm
3246            .xmm_vminp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
3247        // Convert the floats to integers.
3248        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);
3249    }
3250
3251    fn v128_trunc_sat_f64x2_u_zero(
3252        &mut self,
3253        reg: WritableReg,
3254        src_lane_size: OperandSize,
3255        dst_lane_size: OperandSize,
3256    ) {
3257        let scratch = writable!(regs::scratch_xmm());
3258        // Zero out the scratch register.
3259        self.asm
3260            .xmm_vxorp_rrr(scratch.to_reg(), scratch.to_reg(), scratch, src_lane_size);
3261        // Clamp negative values to zero.
3262        self.asm
3263            .xmm_vmaxp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
3264        // Clamp value to the maximum unsigned 32-bit integer value,
3265        // 4294967295.0 (0x41EFFFFFFFE00000).
3266        let address = self.asm.add_constant(&[
3267            0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
3268            0xEF, 0x41,
3269        ]);
3270        self.asm
3271            .xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3272        // Truncate floating point values.
3273        self.asm
3274            .xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);
3275        // Add 2^52 (doubles store 52 bits in their mantissa) to each
3276        // lane causing values in the lower bits to be shifted into
3277        // position for integer conversion.
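        // E.g. 7.0 + 2^52 has the bit pattern 0x4330_0000_0000_0007: the integer
        // value ends up in the low 32 bits of each 64-bit lane, which the shuffle
        // below then extracts.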
3278        let address = self.asm.add_constant(&[
3279            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3280            0x30, 0x43,
3281        ]);
3282        self.asm
3283            .xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3284        // Takes lanes 0 and 2 from `reg` (converted values) and lanes
3285        // 0 and 2 from `scratch` (zeroes) to put the converted ints in
3286        // the lower lanes and zeroes in the upper lanes.
3287        self.asm.xmm_vshufp_rrri(
3288            reg.to_reg(),
3289            scratch.to_reg(),
3290            reg,
3291            0b10_00_10_00,
3292            dst_lane_size,
3293        );
3294    }
3295
3296    /// Given a mask in `mask` whose lanes are all 1s for NaN values, and a
3297    /// vector register `dst` with a mix of non-NaN values and possibly
3298    /// non-canonical NaN values, this canonicalizes any NaNs in `dst`.
3299    fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
3300        // Canonical NaNs do not preserve the sign bit, have the exponent bits
3301        // all set, and have only the high bit of the mantissa set, so shift the
3302        // all-ones NaN lanes right accordingly.
3303        // The mask we're producing in this step will be inverted in the next
3304        // step.
3305        let amount_to_shift = 1 + size.mantissa_bits() + 1;
3306        self.asm
3307            .xmm_vpsrl_rr(mask.to_reg(), mask, amount_to_shift as u32, size);
3308        // The mask will be inverted by the ANDN so non-NaN values will be all
3309        // 1s and NaN values will set the sign bit, exponent bits, and zero out
3310        // almost all of the mantissa.
3311        self.asm
3312            .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
3313    }
3314}