1use super::{
2 abi::X64ABI,
3 address::Address,
4 asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
5 regs::{self, rbp, rsp},
6};
7use anyhow::{anyhow, bail, Result};
8
9use crate::masm::{
10 DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, Imm as I, IntCmpKind, LaneSelector,
11 LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm, RemKind, ReplaceLaneKind,
12 RmwOp, RoundingMode, ShiftKind, SplatKind, StoreKind, TrapCode, TruncKind, V128AbsKind,
13 V128AddKind, V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind,
14 V128MinKind, V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind,
15 VectorCompareKind, VectorEqualityKind, Zero, TRUSTED_FLAGS, UNTRUSTED_FLAGS,
16};
17use crate::{
18 abi::{self, align_to, calculate_frame_adjustment, LocalSlot},
19 codegen::{ptr_type_from_ptr_size, CodeGenContext, CodeGenError, Emission, FuncEnv},
20 stack::{TypedReg, Val},
21};
22use crate::{
23 abi::{vmctx, ABI},
24 masm::{SPOffset, StackSlot},
25};
26use crate::{
27 isa::{
28 reg::{writable, Reg, RegClass, WritableReg},
29 CallingConvention,
30 },
31 masm::CalleeKind,
32};
33use cranelift_codegen::{
34 binemit::CodeOffset,
35 ir::{MemFlags, RelSourceLoc, SourceLoc},
36 isa::{
37 unwind::UnwindInst,
38 x64::{
39 args::{Avx512Opcode, AvxOpcode, FenceKind, CC},
40 settings as x64_settings, AtomicRmwSeqOp,
41 },
42 },
43 settings, Final, MachBufferFinalized, MachLabel,
44};
45use wasmtime_cranelift::TRAP_UNREACHABLE;
46use wasmtime_environ::{PtrSize, WasmValType};
47
48#[rustfmt::skip] const I8X16_ISHL_MASKS: [u8; 128] = [
58 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
59 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
60 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
61 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
62 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
63 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
64 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
65 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
66];
67
68#[rustfmt::skip] const I8X16_USHR_MASKS: [u8; 128] = [
70 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
72 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
73 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
74 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
75 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
76 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
77 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
78];
79
80pub(crate) struct MacroAssembler {
82 sp_offset: u32,
84 sp_max: u32,
89 stack_max_use_add: Option<PatchableAddToReg>,
91 asm: Assembler,
93 flags: x64_settings::Flags,
95 shared_flags: settings::Flags,
97 ptr_size: OperandSize,
99}
100
101impl Masm for MacroAssembler {
102 type Address = Address;
103 type Ptr = u8;
104 type ABI = X64ABI;
105
106 fn frame_setup(&mut self) -> Result<()> {
107 let frame_pointer = rbp();
108 let stack_pointer = rsp();
109
110 self.asm.push_r(frame_pointer);
111
112 if self.shared_flags.unwind_info() {
113 self.asm.unwind_inst(UnwindInst::PushFrameRegs {
114 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
115 })
116 }
117
118 self.asm
119 .mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);
120
121 Ok(())
122 }
123
124 fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
125 let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();
126 let scratch = regs::scratch();
127
128 self.load_ptr(
129 self.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
130 writable!(scratch),
131 )?;
132
133 self.load_ptr(
134 Address::offset(scratch, ptr_size.vmstore_context_stack_limit().into()),
135 writable!(scratch),
136 )?;
137
138 self.add_stack_max(scratch);
139
140 self.asm.cmp_rr(scratch, regs::rsp(), self.ptr_size);
141 self.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
142
143 if self.shared_flags.unwind_info() {
145 self.asm.unwind_inst(UnwindInst::DefineNewFrame {
146 offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
147
148 offset_downward_to_clobbers: 0,
151 })
152 }
153 Ok(())
154 }
155
156 fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
157 let bytes = match (reg.class(), size) {
158 (RegClass::Int, OperandSize::S64) => {
159 let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
160 self.asm.push_r(reg);
161 self.increment_sp(word_bytes);
162 word_bytes
163 }
164 (RegClass::Int, OperandSize::S32) => {
165 let bytes = size.bytes();
166 self.reserve_stack(bytes)?;
167 let sp_offset = SPOffset::from_u32(self.sp_offset);
168 self.asm
169 .mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
170 bytes
171 }
172 (RegClass::Float, _) => {
173 let bytes = size.bytes();
174 self.reserve_stack(bytes)?;
175 let sp_offset = SPOffset::from_u32(self.sp_offset);
176 self.asm
177 .xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
178 bytes
179 }
180 _ => unreachable!(),
181 };
182
183 Ok(StackSlot {
184 offset: SPOffset::from_u32(self.sp_offset),
185 size: bytes,
186 })
187 }
188
189 fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
190 if bytes == 0 {
191 return Ok(());
192 }
193
194 self.asm
195 .sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
196 self.increment_sp(bytes);
197
198 Ok(())
199 }
200
201 fn free_stack(&mut self, bytes: u32) -> Result<()> {
202 if bytes == 0 {
203 return Ok(());
204 }
205 self.asm
206 .add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
207 self.decrement_sp(bytes);
208
209 Ok(())
210 }
211
212 fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
213 self.sp_offset = offset.as_u32();
214
215 Ok(())
216 }
217
218 fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
219 let (reg, offset) = if local.addressed_from_sp() {
220 let offset = self
221 .sp_offset
222 .checked_sub(local.offset)
223 .ok_or_else(|| CodeGenError::invalid_local_offset())?;
224 (rsp(), offset)
225 } else {
226 (rbp(), local.offset)
227 };
228
229 Ok(Address::offset(reg, offset))
230 }
231
232 fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
233 Ok(Address::offset(
234 regs::rsp(),
235 self.sp_offset - offset.as_u32(),
236 ))
237 }
238
239 fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
240 Ok(Address::offset(regs::rsp(), offset.as_u32()))
241 }
242
243 fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
244 Ok(Address::offset(vmctx!(Self), offset))
245 }
246
247 fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
248 self.store(src.into(), dst, self.ptr_size)
249 }
250
251 fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
252 self.store_impl(src, dst, size, TRUSTED_FLAGS)
253 }
254
255 fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
256 match kind {
257 StoreKind::Operand(size) => {
258 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
259 }
260 StoreKind::Atomic(size) => {
261 if size == OperandSize::S128 {
262 bail!(CodeGenError::unexpected_operand_size());
264 }
265 self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
268 self.asm.fence(FenceKind::MFence);
269 }
270 StoreKind::VectorLane(LaneSelector { lane, size }) => {
271 self.ensure_has_avx()?;
272 self.asm
273 .xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS)?;
274 }
275 }
276
277 Ok(())
278 }
279
280 fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
281 let current_sp = SPOffset::from_u32(self.sp_offset);
282 let _ = match (dst.to_reg().class(), size) {
283 (RegClass::Int, OperandSize::S32) => {
284 let addr = self.address_from_sp(current_sp)?;
285 self.asm.movzx_mr(
286 &addr,
287 dst,
288 size.extend_to::<Zero>(OperandSize::S64),
289 TRUSTED_FLAGS,
290 );
291 self.free_stack(size.bytes())?;
292 }
293 (RegClass::Int, OperandSize::S64) => {
294 self.asm.pop_r(dst);
295 self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
296 }
297 (RegClass::Float, _) | (RegClass::Vector, _) => {
298 let addr = self.address_from_sp(current_sp)?;
299 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
300 self.free_stack(size.bytes())?;
301 }
302 _ => bail!(CodeGenError::invalid_operand_combination()),
303 };
304 Ok(())
305 }
306
307 fn call(
308 &mut self,
309 stack_args_size: u32,
310 mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
311 ) -> Result<u32> {
312 let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
313 let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
314 let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
315 let aligned_args_size = align_to(stack_args_size, alignment);
316 let total_stack = delta + aligned_args_size;
317 self.reserve_stack(total_stack)?;
318 let (callee, cc) = load_callee(self)?;
319 match callee {
320 CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
321 CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
322 CalleeKind::LibCall(lib) => self.asm.call_with_lib(cc, lib, regs::scratch()),
323 };
324 Ok(total_stack)
325 }
326
327 fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
328 self.load(src, dst, self.ptr_size)
329 }
330
331 fn compute_addr(
332 &mut self,
333 src: Self::Address,
334 dst: WritableReg,
335 size: OperandSize,
336 ) -> Result<()> {
337 self.asm.lea(&src, dst, size);
338 Ok(())
339 }
340
341 fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
342 self.load_impl(src, dst, size, TRUSTED_FLAGS)
343 }
344
345 fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
346 let size = kind.derive_operand_size();
347
348 match kind {
349 LoadKind::ScalarExtend(ext) => match ext {
350 ExtendKind::Signed(ext) => {
351 self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
352 }
353 ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
354 },
355 LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
356 if kind.is_atomic() && size == OperandSize::S128 {
359 bail!(CodeGenError::unexpected_operand_size());
360 }
361
362 self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
363 }
364 LoadKind::VectorExtend(ext) => {
365 self.ensure_has_avx()?;
366 self.asm
367 .xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
368 }
369 LoadKind::Splat(_) => {
370 self.ensure_has_avx()?;
371
372 if size == OperandSize::S64 {
373 self.asm
374 .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
375 self.asm.xmm_vpshuf_rr(
376 dst.to_reg(),
377 dst,
378 Self::vpshuf_mask_for_64_bit_splats(),
379 OperandSize::S32,
380 );
381 } else {
382 self.asm
383 .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
384 }
385 }
386 LoadKind::VectorLane(LaneSelector { lane, size }) => {
387 self.ensure_has_avx()?;
388 let byte_tmp = regs::scratch();
389 self.load_impl(src, writable!(byte_tmp), size, UNTRUSTED_FLAGS)?;
390 self.asm
391 .xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp, lane, size);
392 }
393 LoadKind::VectorZero(size) => {
394 self.ensure_has_avx()?;
395 let scratch = regs::scratch();
396 self.load_impl(src, writable!(scratch), size, UNTRUSTED_FLAGS)?;
397 self.asm.avx_gpr_to_xmm(scratch, dst, size);
398 }
399 }
400
401 Ok(())
402 }
403
404 fn sp_offset(&self) -> Result<SPOffset> {
405 Ok(SPOffset::from_u32(self.sp_offset))
406 }
407
408 fn zero(&mut self, reg: WritableReg) -> Result<()> {
409 self.asm.xor_rr(
410 reg.to_reg(),
411 reg,
412 OperandSize::from_bytes(<Self::ABI>::word_bytes()),
413 );
414 Ok(())
415 }
416
417 fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
418 match (src, dst.to_reg()) {
419 (RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
420 (RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
421 (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
422 _ => bail!(CodeGenError::invalid_operand_combination()),
423 },
424 (RegImm::Imm(imm), _) => match imm {
425 I::I32(v) => Ok(self.asm.mov_ir(v as u64, dst, size)),
426 I::I64(v) => Ok(self.asm.mov_ir(v, dst, size)),
427 I::F32(v) => {
428 let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
429 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
430 Ok(())
431 }
432 I::F64(v) => {
433 let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
434 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
435 Ok(())
436 }
437 I::V128(v) => {
438 let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
439 self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
440 Ok(())
441 }
442 },
443 }
444 }
445
446 fn cmov(
447 &mut self,
448 dst: WritableReg,
449 src: Reg,
450 cc: IntCmpKind,
451 size: OperandSize,
452 ) -> Result<()> {
453 match (src.class(), dst.to_reg().class()) {
454 (RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
455 (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
456 _ => Err(anyhow!(CodeGenError::invalid_operand_combination())),
457 }
458 }
459
460 fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
461 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
462 match (rhs, dst) {
463 (RegImm::Imm(imm), _) => {
464 if let Some(v) = imm.to_i32() {
465 self.asm.add_ir(v, dst, size);
466 } else {
467 let scratch = regs::scratch();
468 self.load_constant(&imm, writable!(scratch), size)?;
469 self.asm.add_rr(scratch, dst, size);
470 }
471 }
472
473 (RegImm::Reg(src), dst) => {
474 self.asm.add_rr(src, dst, size);
475 }
476 }
477
478 Ok(())
479 }
480
481 fn checked_uadd(
482 &mut self,
483 dst: WritableReg,
484 lhs: Reg,
485 rhs: RegImm,
486 size: OperandSize,
487 trap: TrapCode,
488 ) -> Result<()> {
489 self.add(dst, lhs, rhs, size)?;
490 self.asm.trapif(CC::B, trap);
491 Ok(())
492 }
493
494 fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
495 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
496 match (rhs, dst) {
497 (RegImm::Imm(imm), reg) => {
498 if let Some(v) = imm.to_i32() {
499 self.asm.sub_ir(v, reg, size);
500 } else {
501 let scratch = regs::scratch();
502 self.load_constant(&imm, writable!(scratch), size)?;
503 self.asm.sub_rr(scratch, reg, size);
504 }
505 }
506
507 (RegImm::Reg(src), dst) => {
508 self.asm.sub_rr(src, dst, size);
509 }
510 }
511
512 Ok(())
513 }
514
515 fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
516 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
517 match (rhs, dst) {
518 (RegImm::Imm(imm), _) => {
519 if let Some(v) = imm.to_i32() {
520 self.asm.mul_ir(v, dst, size);
521 } else {
522 let scratch = regs::scratch();
523 self.load_constant(&imm, writable!(scratch), size)?;
524 self.asm.mul_rr(scratch, dst, size);
525 }
526 }
527
528 (RegImm::Reg(src), dst) => {
529 self.asm.mul_rr(src, dst, size);
530 }
531 }
532
533 Ok(())
534 }
535
536 fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
537 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
538 self.asm.xmm_add_rr(rhs, dst, size);
539 Ok(())
540 }
541
542 fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
543 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
544 self.asm.xmm_sub_rr(rhs, dst, size);
545 Ok(())
546 }
547
548 fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
549 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
550 self.asm.xmm_mul_rr(rhs, dst, size);
551 Ok(())
552 }
553
554 fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
555 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
556 self.asm.xmm_div_rr(rhs, dst, size);
557 Ok(())
558 }
559
560 fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
561 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
562 self.asm.xmm_min_seq(rhs, dst, size);
563 Ok(())
564 }
565
566 fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
567 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
568 self.asm.xmm_max_seq(rhs, dst, size);
569 Ok(())
570 }
571
572 fn float_copysign(
573 &mut self,
574 dst: WritableReg,
575 lhs: Reg,
576 rhs: Reg,
577 size: OperandSize,
578 ) -> Result<()> {
579 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
580 let scratch_gpr = regs::scratch();
581 let scratch_xmm = regs::scratch_xmm();
582 let sign_mask = match size {
583 OperandSize::S32 => I::I32(0x80000000),
584 OperandSize::S64 => I::I64(0x8000000000000000),
585 OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
586 bail!(CodeGenError::unexpected_operand_size())
587 }
588 };
589 self.load_constant(&sign_mask, writable!(scratch_gpr), size)?;
590 self.asm
591 .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
592
593 self.asm.xmm_and_rr(scratch_xmm, writable!(rhs), size);
595
596 self.asm
599 .xmm_andn_rr(dst.to_reg(), writable!(scratch_xmm), size);
600 self.asm.xmm_mov_rr(scratch_xmm, dst, size);
601
602 self.asm.xmm_or_rr(rhs, dst, size);
604 Ok(())
605 }
606
607 fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
608 debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
609 let mask = match size {
610 OperandSize::S32 => I::I32(0x80000000),
611 OperandSize::S64 => I::I64(0x8000000000000000),
612 OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
613 bail!(CodeGenError::unexpected_operand_size())
614 }
615 };
616 let scratch_gpr = regs::scratch();
617 self.load_constant(&mask, writable!(scratch_gpr), size)?;
618 let scratch_xmm = regs::scratch_xmm();
619 self.asm
620 .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
621 self.asm.xmm_xor_rr(scratch_xmm, dst, size);
622 Ok(())
623 }
624
625 fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
626 debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
627 let mask = match size {
628 OperandSize::S32 => I::I32(0x7fffffff),
629 OperandSize::S64 => I::I64(0x7fffffffffffffff),
630 OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
631 bail!(CodeGenError::unexpected_operand_size())
632 }
633 };
634 let scratch_gpr = regs::scratch();
635 self.load_constant(&mask, writable!(scratch_gpr), size)?;
636 let scratch_xmm = regs::scratch_xmm();
637 self.asm
638 .gpr_to_xmm(scratch_gpr, writable!(scratch_xmm), size);
639 self.asm.xmm_and_rr(scratch_xmm, dst, size);
640 Ok(())
641 }
642
643 fn float_round<
644 F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
645 >(
646 &mut self,
647 mode: RoundingMode,
648 env: &mut FuncEnv<Self::Ptr>,
649 context: &mut CodeGenContext<Emission>,
650 size: OperandSize,
651 mut fallback: F,
652 ) -> Result<()> {
653 if self.flags.has_sse41() {
654 let src = context.pop_to_reg(self, None)?;
655 self.asm
656 .xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
657 context.stack.push(src.into());
658 Ok(())
659 } else {
660 fallback(env, context, self)
661 }
662 }
663
664 fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
665 self.asm.sqrt(src, dst, size);
666 Ok(())
667 }
668
669 fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
670 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
671 match (rhs, dst) {
672 (RegImm::Imm(imm), _) => {
673 if let Some(v) = imm.to_i32() {
674 self.asm.and_ir(v, dst, size);
675 } else {
676 let scratch = regs::scratch();
677 self.load_constant(&imm, writable!(scratch), size)?;
678 self.asm.and_rr(scratch, dst, size);
679 }
680 }
681
682 (RegImm::Reg(src), dst) => {
683 self.asm.and_rr(src, dst, size);
684 }
685 }
686
687 Ok(())
688 }
689
690 fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
691 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
692 match (rhs, dst) {
693 (RegImm::Imm(imm), _) => {
694 if let Some(v) = imm.to_i32() {
695 self.asm.or_ir(v, dst, size);
696 } else {
697 let scratch = regs::scratch();
698 self.load_constant(&imm, writable!(scratch), size)?;
699 self.asm.or_rr(scratch, dst, size);
700 }
701 }
702
703 (RegImm::Reg(src), dst) => {
704 self.asm.or_rr(src, dst, size);
705 }
706 }
707
708 Ok(())
709 }
710
711 fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
712 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
713 match (rhs, dst) {
714 (RegImm::Imm(imm), _) => {
715 if let Some(v) = imm.to_i32() {
716 self.asm.xor_ir(v, dst, size);
717 } else {
718 let scratch = regs::scratch();
719 self.load_constant(&imm, writable!(scratch), size)?;
720 self.asm.xor_rr(scratch, dst, size);
721 }
722 }
723
724 (RegImm::Reg(src), _) => {
725 self.asm.xor_rr(src, dst, size);
726 }
727 }
728
729 Ok(())
730 }
731
732 fn shift_ir(
733 &mut self,
734 dst: WritableReg,
735 imm: u64,
736 lhs: Reg,
737 kind: ShiftKind,
738 size: OperandSize,
739 ) -> Result<()> {
740 Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
741 self.asm.shift_ir(imm as u8, dst, kind, size);
742 Ok(())
743 }
744
745 fn shift(
746 &mut self,
747 context: &mut CodeGenContext<Emission>,
748 kind: ShiftKind,
749 size: OperandSize,
750 ) -> Result<()> {
751 let src = context.pop_to_reg(self, Some(regs::rcx()))?;
753 let dst = context.pop_to_reg(self, None)?;
754
755 self.asm
756 .shift_rr(src.into(), writable!(dst.into()), kind, size);
757
758 context.free_reg(src);
759 context.stack.push(dst.into());
760
761 Ok(())
762 }
763
764 fn div(
765 &mut self,
766 context: &mut CodeGenContext<Emission>,
767 kind: DivKind,
768 size: OperandSize,
769 ) -> Result<()> {
770 let rdx = context.reg(regs::rdx(), self)?;
772 let rax = context.reg(regs::rax(), self)?;
773
774 let divisor = context.pop_to_reg(self, None)?;
776
777 context.free_reg(rax);
779 let rax = context.pop_to_reg(self, Some(rax))?;
781 self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);
782
783 context.free_reg(divisor);
785 context.free_reg(rdx);
786
787 context.stack.push(rax.into());
789 Ok(())
790 }
791
792 fn rem(
793 &mut self,
794 context: &mut CodeGenContext<Emission>,
795 kind: RemKind,
796 size: OperandSize,
797 ) -> Result<()> {
798 let rdx = context.reg(regs::rdx(), self)?;
800 let rax = context.reg(regs::rax(), self)?;
801
802 let divisor = context.pop_to_reg(self, None)?;
804
805 context.free_reg(rax);
807 let rax = context.pop_to_reg(self, Some(rax))?;
809 self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);
810
811 context.free_reg(divisor);
813 context.free_reg(rax);
814
815 context.stack.push(Val::reg(rdx, divisor.ty));
817
818 Ok(())
819 }
820
821 fn frame_restore(&mut self) -> Result<()> {
822 debug_assert_eq!(self.sp_offset, 0);
823 self.asm.pop_r(writable!(rbp()));
824 self.asm.ret();
825 Ok(())
826 }
827
828 fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
829 if let Some(patch) = self.stack_max_use_add {
830 patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
831 }
832
833 Ok(self.asm.finalize(base))
834 }
835
836 fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
837 Ok(Address::offset(reg, offset))
838 }
839
840 fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
841 match src2 {
842 RegImm::Imm(imm) => {
843 if let Some(v) = imm.to_i32() {
844 self.asm.cmp_ir(src1, v, size);
845 } else {
846 let scratch = regs::scratch();
847 self.load_constant(&imm, writable!(scratch), size)?;
848 self.asm.cmp_rr(src1, scratch, size);
849 }
850 }
851 RegImm::Reg(src2) => {
852 self.asm.cmp_rr(src1, src2, size);
853 }
854 }
855
856 Ok(())
857 }
858
859 fn cmp_with_set(
860 &mut self,
861 dst: WritableReg,
862 src: RegImm,
863 kind: IntCmpKind,
864 size: OperandSize,
865 ) -> Result<()> {
866 self.cmp(dst.to_reg(), src, size)?;
867 self.asm.setcc(kind, dst);
868 Ok(())
869 }
870
871 fn float_cmp_with_set(
872 &mut self,
873 dst: WritableReg,
874 src1: Reg,
875 src2: Reg,
876 kind: FloatCmpKind,
877 size: OperandSize,
878 ) -> Result<()> {
879 let (src1, src2, set_kind) = match kind {
886 FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
887 FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
888 FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
889 FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
890 FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
897 FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
898 };
899 self.asm.ucomis(src1, src2, size);
900 self.asm.setcc(set_kind, dst);
901 let _ = match kind {
902 FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
903 let scratch = regs::scratch();
906 self.asm.setnp(writable!(scratch));
907 self.asm.and_rr(scratch, dst, size);
908 }
909 FloatCmpKind::Ne => {
910 let scratch = regs::scratch();
913 self.asm.setp(writable!(scratch));
914 self.asm.or_rr(scratch, dst, size);
915 }
916 FloatCmpKind::Lt | FloatCmpKind::Le => (),
917 };
918 Ok(())
919 }
920
921 fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
922 if self.flags.has_lzcnt() {
923 self.asm.lzcnt(src, dst, size);
924 } else {
925 let scratch = regs::scratch();
926
927 self.asm.bsr(src.into(), dst, size);
931 self.asm.setcc(IntCmpKind::Ne, writable!(scratch.into()));
932 self.asm.neg(dst.to_reg(), dst, size);
933 self.asm.add_ir(size.num_bits() as i32, dst, size);
934 self.asm.sub_rr(scratch, dst, size);
935 }
936
937 Ok(())
938 }
939
940 fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
941 if self.flags.has_bmi1() {
942 self.asm.tzcnt(src, dst, size);
943 } else {
944 let scratch = regs::scratch();
945
946 self.asm.bsf(src.into(), dst.into(), size);
953 self.asm.setcc(IntCmpKind::Eq, writable!(scratch.into()));
954 self.asm
955 .shift_ir(size.log2(), writable!(scratch), ShiftKind::Shl, size);
956 self.asm.add_rr(scratch, dst, size);
957 }
958
959 Ok(())
960 }
961
962 fn get_label(&mut self) -> Result<MachLabel> {
963 let buffer = self.asm.buffer_mut();
964 Ok(buffer.get_label())
965 }
966
967 fn bind(&mut self, label: MachLabel) -> Result<()> {
968 let buffer = self.asm.buffer_mut();
969 buffer.bind_label(label, &mut Default::default());
970 Ok(())
971 }
972
973 fn branch(
974 &mut self,
975 kind: IntCmpKind,
976 lhs: Reg,
977 rhs: RegImm,
978 taken: MachLabel,
979 size: OperandSize,
980 ) -> Result<()> {
981 use IntCmpKind::*;
982
983 match &(lhs, rhs) {
984 (rlhs, RegImm::Reg(rrhs)) => {
985 if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
989 self.asm.test_rr(*rlhs, *rrhs, size);
990 } else {
991 self.cmp(lhs, rhs, size)?;
992 }
993 }
994 _ => self.cmp(lhs, rhs, size)?,
995 }
996 self.asm.jmp_if(kind, taken);
997 Ok(())
998 }
999
1000 fn jmp(&mut self, target: MachLabel) -> Result<()> {
1001 self.asm.jmp(target);
1002 Ok(())
1003 }
1004
1005 fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
1006 let src = context.pop_to_reg(self, None)?;
1007 if self.flags.has_popcnt() && self.flags.has_sse42() {
1008 self.asm.popcnt(src.into(), size);
1009 context.stack.push(src.into());
1010 Ok(())
1011 } else {
1012 let tmp = writable!(context.any_gpr(self)?);
1016 let dst = writable!(src.into());
1017 let (masks, shift_amt) = match size {
1018 OperandSize::S64 => (
1019 [
1020 0x5555555555555555, 0x3333333333333333, 0x0f0f0f0f0f0f0f0f, 0x0101010101010101, ],
1025 56u8,
1026 ),
1027 OperandSize::S32 => (
1030 [0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
1031 24u8,
1032 ),
1033 _ => bail!(CodeGenError::unexpected_operand_size()),
1034 };
1035 self.asm.mov_rr(src.into(), tmp, size);
1036
1037 self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
1039 let lhs = dst.to_reg();
1040 self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
1041 self.asm.sub_rr(dst.to_reg(), tmp, size);
1042
1043 self.asm.mov_rr(tmp.to_reg(), dst, size);
1045 let scratch = regs::scratch();
1048 self.load_constant(&I::i64(masks[1]), writable!(scratch), size)?;
1049 self.asm.and_rr(scratch, dst, size);
1050 self.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
1051 self.asm.and_rr(scratch, tmp, size);
1052 self.asm.add_rr(dst.to_reg(), tmp, size);
1053
1054 self.asm.mov_rr(tmp.to_reg(), dst.into(), size);
1056 self.asm.shift_ir(4u8, dst.into(), ShiftKind::ShrU, size);
1057 self.asm.add_rr(tmp.to_reg(), dst, size);
1058 let lhs = dst.to_reg();
1059 self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;
1060
1061 let lhs = dst.to_reg();
1063 self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
1064 self.asm
1065 .shift_ir(shift_amt, dst.into(), ShiftKind::ShrU, size);
1066
1067 context.stack.push(src.into());
1068 context.free_reg(tmp.to_reg());
1069
1070 Ok(())
1071 }
1072 }
1073
1074 fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1075 self.asm.mov_rr(src.into(), dst, OperandSize::S32);
1076 Ok(())
1077 }
1078
1079 fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
1080 match kind {
1081 ExtendKind::Signed(ext) => {
1082 self.asm.movsx_rr(src, dst, ext);
1083 }
1084 ExtendKind::Unsigned(ext) => {
1085 self.asm.movzx_rr(src, dst, ext);
1086 }
1087 }
1088
1089 Ok(())
1090 }
1091
1092 fn signed_truncate(
1093 &mut self,
1094 dst: WritableReg,
1095 src: Reg,
1096 src_size: OperandSize,
1097 dst_size: OperandSize,
1098 kind: TruncKind,
1099 ) -> Result<()> {
1100 self.asm.cvt_float_to_sint_seq(
1101 src,
1102 dst,
1103 regs::scratch(),
1104 regs::scratch_xmm(),
1105 src_size,
1106 dst_size,
1107 kind.is_checked(),
1108 );
1109 Ok(())
1110 }
1111
1112 fn unsigned_truncate(
1113 &mut self,
1114 ctx: &mut CodeGenContext<Emission>,
1115 src_size: OperandSize,
1116 dst_size: OperandSize,
1117 kind: TruncKind,
1118 ) -> Result<()> {
1119 let dst_ty = match dst_size {
1120 OperandSize::S32 => WasmValType::I32,
1121 OperandSize::S64 => WasmValType::I64,
1122 _ => bail!(CodeGenError::unexpected_operand_size()),
1123 };
1124
1125 ctx.convert_op_with_tmp_reg(
1126 self,
1127 dst_ty,
1128 RegClass::Float,
1129 |masm, dst, src, tmp_fpr, dst_size| {
1130 masm.asm.cvt_float_to_uint_seq(
1131 src,
1132 writable!(dst),
1133 regs::scratch(),
1134 regs::scratch_xmm(),
1135 tmp_fpr,
1136 src_size,
1137 dst_size,
1138 kind.is_checked(),
1139 );
1140
1141 Ok(())
1142 },
1143 )
1144 }
1145
1146 fn signed_convert(
1147 &mut self,
1148 dst: WritableReg,
1149 src: Reg,
1150 src_size: OperandSize,
1151 dst_size: OperandSize,
1152 ) -> Result<()> {
1153 self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
1154 Ok(())
1155 }
1156
1157 fn unsigned_convert(
1158 &mut self,
1159 dst: WritableReg,
1160 src: Reg,
1161 tmp_gpr: Reg,
1162 src_size: OperandSize,
1163 dst_size: OperandSize,
1164 ) -> Result<()> {
1165 if let OperandSize::S32 = src_size {
1167 self.extend(
1168 writable!(src),
1169 src,
1170 ExtendKind::Unsigned(Extend::I64Extend32),
1171 )?;
1172 }
1173
1174 self.asm
1175 .cvt_uint64_to_float_seq(src, dst, regs::scratch(), tmp_gpr, dst_size);
1176 Ok(())
1177 }
1178
1179 fn reinterpret_float_as_int(
1180 &mut self,
1181 dst: WritableReg,
1182 src: Reg,
1183 size: OperandSize,
1184 ) -> Result<()> {
1185 self.asm.xmm_to_gpr(src, dst, size);
1186 Ok(())
1187 }
1188
1189 fn reinterpret_int_as_float(
1190 &mut self,
1191 dst: WritableReg,
1192 src: Reg,
1193 size: OperandSize,
1194 ) -> Result<()> {
1195 self.asm.gpr_to_xmm(src.into(), dst, size);
1196 Ok(())
1197 }
1198
1199 fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1200 self.asm
1201 .cvt_float_to_float(src.into(), dst.into(), OperandSize::S64, OperandSize::S32);
1202 Ok(())
1203 }
1204
1205 fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1206 self.asm
1207 .cvt_float_to_float(src.into(), dst, OperandSize::S32, OperandSize::S64);
1208 Ok(())
1209 }
1210
1211 fn unreachable(&mut self) -> Result<()> {
1212 self.asm.trap(TRAP_UNREACHABLE);
1213 Ok(())
1214 }
1215
1216 fn trap(&mut self, code: TrapCode) -> Result<()> {
1217 self.asm.trap(code);
1218 Ok(())
1219 }
1220
1221 fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
1222 self.asm.trapif(cc, code);
1223 Ok(())
1224 }
1225
1226 fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
1227 self.asm.test_rr(src, src, self.ptr_size);
1228 self.asm.trapif(IntCmpKind::Eq, code);
1229 Ok(())
1230 }
1231
1232 fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
1233 debug_assert!(targets.len() >= 1);
1235 let default_index = targets.len() - 1;
1236 let max = default_index;
1240 let size = OperandSize::S32;
1241 self.asm.mov_ir(max as u64, writable!(tmp), size);
1242 self.asm.cmp_rr(tmp, index, size);
1243 self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);
1244
1245 let default = targets[default_index];
1246 let rest = &targets[0..default_index];
1247 let tmp1 = regs::scratch();
1248 self.asm.jmp_table(rest.into(), default, index, tmp1, tmp);
1249 Ok(())
1250 }
1251
1252 fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
1253 Ok(self.asm.buffer_mut().start_srcloc(loc))
1254 }
1255
1256 fn end_source_loc(&mut self) -> Result<()> {
1257 self.asm.buffer_mut().end_srcloc();
1258 Ok(())
1259 }
1260
1261 fn current_code_offset(&self) -> Result<CodeOffset> {
1262 Ok(self.asm.buffer().cur_offset())
1263 }
1264
1265 fn add128(
1266 &mut self,
1267 dst_lo: WritableReg,
1268 dst_hi: WritableReg,
1269 lhs_lo: Reg,
1270 lhs_hi: Reg,
1271 rhs_lo: Reg,
1272 rhs_hi: Reg,
1273 ) -> Result<()> {
1274 Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1275 Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1276 self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
1277 self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
1278 Ok(())
1279 }
1280
1281 fn sub128(
1282 &mut self,
1283 dst_lo: WritableReg,
1284 dst_hi: WritableReg,
1285 lhs_lo: Reg,
1286 lhs_hi: Reg,
1287 rhs_lo: Reg,
1288 rhs_hi: Reg,
1289 ) -> Result<()> {
1290 Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1291 Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1292 self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
1293 self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
1294 Ok(())
1295 }
1296
1297 fn mul_wide(
1298 &mut self,
1299 context: &mut CodeGenContext<Emission>,
1300 kind: MulWideKind,
1301 ) -> Result<()> {
1302 let rax = context.reg(regs::rax(), self)?;
1305 let rdx = context.reg(regs::rdx(), self)?;
1306
1307 let rhs = context.pop_to_reg(self, None)?;
1309 context.free_reg(rax);
1312 let lhs = context.pop_to_reg(self, Some(rax))?;
1313
1314 self.asm.mul_wide(
1315 writable!(rax),
1316 writable!(rdx),
1317 lhs.reg,
1318 rhs.reg,
1319 kind,
1320 OperandSize::S64,
1321 );
1322
1323 context.free_reg(rhs);
1326
1327 context.stack.push(lhs.into());
1329 context.stack.push(Val::Reg(TypedReg::i64(rdx)));
1331
1332 Ok(())
1333 }
1334
1335 fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
1336 let (src, dst) = match size {
1338 SplatKind::F32x4 | SplatKind::F64x2 => {
1340 let reg = context.pop_to_reg(self, None)?.reg;
1341 (RegImm::reg(reg), writable!(reg))
1342 }
1343 SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
1346 let dst = writable!(context.any_fpr(self)?);
1347 let src = if size == SplatKind::I64x2 {
1348 context.pop_i64_const().map(RegImm::i64)
1349 } else {
1350 context.pop_i32_const().map(RegImm::i32)
1351 }
1352 .map_or_else(
1353 || -> Result<RegImm> {
1354 let reg = context.pop_to_reg(self, None)?.reg;
1355 self.reinterpret_int_as_float(
1356 dst,
1357 reg,
1358 match size {
1359 SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
1360 OperandSize::S32
1361 }
1362 SplatKind::I64x2 => OperandSize::S64,
1363 SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
1364 },
1365 )?;
1366 context.free_reg(reg);
1367 Ok(RegImm::Reg(dst.to_reg()))
1368 },
1369 Ok,
1370 )?;
1371 (src, dst)
1372 }
1373 };
1374
1375 if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
1377 self.ensure_has_avx()?;
1378 let mask = Self::vpshuf_mask_for_64_bit_splats();
1379 match src {
1380 RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
1381 RegImm::Imm(imm) => {
1382 let src = self.asm.add_constant(&imm.to_bytes());
1383 self.asm
1384 .xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
1385 }
1386 }
1387 } else {
1388 self.ensure_has_avx2()?;
1389
1390 match src {
1391 RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
1392 RegImm::Imm(imm) => {
1393 let src = self.asm.add_constant(&imm.to_bytes());
1394 self.asm
1395 .xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
1396 }
1397 }
1398 }
1399
1400 context
1401 .stack
1402 .push(Val::reg(dst.to_reg(), WasmValType::V128));
1403 Ok(())
1404 }
1405
1406 fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1407 self.ensure_has_avx()?;
1408
1409 let mut mask_lhs: [u8; 16] = [0x80; 16];
1416 let mut mask_rhs: [u8; 16] = [0x80; 16];
1417 for i in 0..lanes.len() {
1418 if lanes[i] < 16 {
1419 mask_lhs[i] = lanes[i];
1420 } else {
1421 mask_rhs[i] = lanes[i] - 16;
1422 }
1423 }
1424 let mask_lhs = self.asm.add_constant(&mask_lhs);
1425 let mask_rhs = self.asm.add_constant(&mask_rhs);
1426
1427 self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1428 let scratch = writable!(regs::scratch_xmm());
1429 self.asm.xmm_vpshufb_rrm(scratch, rhs, &mask_rhs);
1430 self.asm.vpor(dst, dst.to_reg(), scratch.to_reg());
1431 Ok(())
1432 }
1433
1434 fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
1435 self.ensure_has_avx()?;
1436
1437 let clamp = self.asm.add_constant(&[0x70; 16]);
1443 self.asm.xmm_vpaddusb_rrm(writable!(rhs), rhs, &clamp);
1444
1445 self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
1448 Ok(())
1449 }
1450
1451 fn atomic_rmw(
1452 &mut self,
1453 context: &mut CodeGenContext<Emission>,
1454 addr: Self::Address,
1455 size: OperandSize,
1456 op: RmwOp,
1457 flags: MemFlags,
1458 extend: Option<Extend<Zero>>,
1459 ) -> Result<()> {
1460 let res = match op {
1461 RmwOp::Add => {
1462 let operand = context.pop_to_reg(self, None)?;
1463 self.asm
1464 .lock_xadd(addr, operand.reg, writable!(operand.reg), size, flags);
1465 operand.reg
1466 }
1467 RmwOp::Sub => {
1468 let operand = context.pop_to_reg(self, None)?;
1469 self.asm.neg(operand.reg, writable!(operand.reg), size);
1470 self.asm
1471 .lock_xadd(addr, operand.reg, writable!(operand.reg), size, flags);
1472 operand.reg
1473 }
1474 RmwOp::Xchg => {
1475 let operand = context.pop_to_reg(self, None)?;
1476 self.asm
1477 .xchg(addr, operand.reg, writable!(operand.reg), size, flags);
1478 operand.reg
1479 }
1480 RmwOp::And | RmwOp::Or | RmwOp::Xor => {
1481 let op = match op {
1482 RmwOp::And => AtomicRmwSeqOp::And,
1483 RmwOp::Or => AtomicRmwSeqOp::Or,
1484 RmwOp::Xor => AtomicRmwSeqOp::Xor,
1485 _ => unreachable!(
1486 "invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
1487 ),
1488 };
1489 let dst = context.reg(regs::rax(), self)?;
1490 let operand = context.pop_to_reg(self, None)?;
1491
1492 self.asm
1493 .atomic_rmw_seq(addr, operand.reg, writable!(dst), size, flags, op);
1494
1495 context.free_reg(operand.reg);
1496 dst
1497 }
1498 };
1499
1500 let dst_ty = match extend {
1501 Some(ext) => {
1502 if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
1504 self.asm.movzx_rr(res, writable!(res), ext.into());
1505 }
1506
1507 WasmValType::int_from_bits(ext.to_bits())
1508 }
1509 None => WasmValType::int_from_bits(size.num_bits()),
1510 };
1511
1512 context.stack.push(TypedReg::new(dst_ty, res).into());
1513
1514 Ok(())
1515 }
1516
1517 fn extract_lane(
1518 &mut self,
1519 src: Reg,
1520 dst: WritableReg,
1521 lane: u8,
1522 kind: ExtractLaneKind,
1523 ) -> Result<()> {
1524 self.ensure_has_avx()?;
1525
1526 match kind {
1527 ExtractLaneKind::I8x16S
1528 | ExtractLaneKind::I8x16U
1529 | ExtractLaneKind::I16x8S
1530 | ExtractLaneKind::I16x8U
1531 | ExtractLaneKind::I32x4
1532 | ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
1533 ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
1534 assert!(src == dst.to_reg());
1538 }
1539 ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
1540 ExtractLaneKind::F64x2 => {
1541 assert!(lane == 1);
1547 self.asm
1548 .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
1549 }
1550 }
1551
1552 match kind {
1554 ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
1555 self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
1556 }
1557 _ => (),
1558 }
1559
1560 Ok(())
1561 }
1562
1563 fn replace_lane(
1564 &mut self,
1565 src: RegImm,
1566 dst: WritableReg,
1567 lane: u8,
1568 kind: ReplaceLaneKind,
1569 ) -> Result<()> {
1570 self.ensure_has_avx()?;
1571
1572 match kind {
1573 ReplaceLaneKind::I8x16
1574 | ReplaceLaneKind::I16x8
1575 | ReplaceLaneKind::I32x4
1576 | ReplaceLaneKind::I64x2 => match src {
1577 RegImm::Reg(reg) => {
1578 self.asm
1579 .xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
1580 }
1581 RegImm::Imm(imm) => {
1582 let address = self.asm.add_constant(&imm.to_bytes());
1583 self.asm
1584 .xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
1585 }
1586 },
1587 ReplaceLaneKind::F32x4 => {
1588 let imm = lane << 4;
1593 match src {
1594 RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
1595 RegImm::Imm(val) => {
1596 let address = self.asm.add_constant(&val.to_bytes());
1597 self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
1598 }
1599 }
1600 }
1601 ReplaceLaneKind::F64x2 => match src {
1602 RegImm::Reg(reg) => match lane {
1603 0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
1604 1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
1605 _ => unreachable!(),
1606 },
1607 RegImm::Imm(imm) => {
1608 let address = self.asm.add_constant(&imm.to_bytes());
1609 match lane {
1610 0 => {
1611 let scratch = writable!(regs::scratch_xmm());
1616 self.asm.xmm_vmovsd_rm(scratch, &address);
1617 self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.to_reg());
1618 }
1619 1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
1620 _ => unreachable!(),
1621 }
1622 }
1623 },
1624 }
1625 Ok(())
1626 }
1627
1628 fn atomic_cas(
1629 &mut self,
1630 context: &mut CodeGenContext<Emission>,
1631 addr: Self::Address,
1632 size: OperandSize,
1633 flags: MemFlags,
1634 extend: Option<Extend<Zero>>,
1635 ) -> Result<()> {
1636 let rax = context.reg(regs::rax(), self)?;
1639
1640 let replacement = context.pop_to_reg(self, None)?;
1641
1642 context.free_reg(rax);
1644 let expected = context.pop_to_reg(self, Some(regs::rax()))?;
1645
1646 self.asm.cmpxchg(
1647 addr,
1648 expected.reg,
1649 replacement.reg,
1650 writable!(expected.reg),
1651 size,
1652 flags,
1653 );
1654
1655 if let Some(extend) = extend {
1656 if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
1658 self.asm
1659 .movzx_rr(expected.reg.into(), writable!(expected.reg.into()), extend);
1660 }
1661 }
1662
1663 context.stack.push(expected.into());
1664 context.free_reg(replacement);
1665
1666 Ok(())
1667 }
1668
1669 fn v128_eq(
1670 &mut self,
1671 dst: WritableReg,
1672 lhs: Reg,
1673 rhs: Reg,
1674 kind: VectorEqualityKind,
1675 ) -> Result<()> {
1676 self.ensure_has_avx()?;
1677
1678 match kind {
1679 VectorEqualityKind::I8x16
1680 | VectorEqualityKind::I16x8
1681 | VectorEqualityKind::I32x4
1682 | VectorEqualityKind::I64x2 => {
1683 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
1684 }
1685 VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1686 self.asm
1687 .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
1688 }
1689 }
1690 Ok(())
1691 }
1692
1693 fn v128_ne(
1694 &mut self,
1695 dst: WritableReg,
1696 lhs: Reg,
1697 rhs: Reg,
1698 kind: VectorEqualityKind,
1699 ) -> Result<()> {
1700 self.ensure_has_avx()?;
1701
1702 match kind {
1703 VectorEqualityKind::I8x16
1704 | VectorEqualityKind::I16x8
1705 | VectorEqualityKind::I32x4
1706 | VectorEqualityKind::I64x2 => {
1707 self.asm
1709 .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1710 self.asm
1711 .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1712 self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1713 }
1714 VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1715 self.asm
1716 .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
1717 }
1718 }
1719 Ok(())
1720 }
1721
1722 fn v128_lt(
1723 &mut self,
1724 dst: WritableReg,
1725 lhs: Reg,
1726 rhs: Reg,
1727 kind: VectorCompareKind,
1728 ) -> Result<()> {
1729 self.ensure_has_avx()?;
1730
1731 match kind {
1732 VectorCompareKind::I8x16S
1733 | VectorCompareKind::I16x8S
1734 | VectorCompareKind::I32x4S
1735 | VectorCompareKind::I64x2S => {
1736 self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
1738 }
1739 VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1740 self.asm
1746 .xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1747 self.asm
1748 .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1749 self.asm
1750 .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1751 self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1752 }
1753 VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1754 self.asm
1755 .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
1756 }
1757 }
1758 Ok(())
1759 }
1760
1761 fn v128_le(
1762 &mut self,
1763 dst: WritableReg,
1764 lhs: Reg,
1765 rhs: Reg,
1766 kind: VectorCompareKind,
1767 ) -> Result<()> {
1768 self.ensure_has_avx()?;
1769
1770 match kind {
1771 VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1772 self.asm
1775 .xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1776 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1777 }
1778 VectorCompareKind::I64x2S => {
1779 self.asm
1781 .xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1782 self.asm
1783 .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1784 self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1785 }
1786 VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1787 self.asm
1790 .xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1791 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1792 }
1793 VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1794 self.asm
1795 .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
1796 }
1797 }
1798 Ok(())
1799 }
1800
1801 fn v128_gt(
1802 &mut self,
1803 dst: WritableReg,
1804 lhs: Reg,
1805 rhs: Reg,
1806 kind: VectorCompareKind,
1807 ) -> Result<()> {
1808 self.ensure_has_avx()?;
1809
1810 match kind {
1811 VectorCompareKind::I8x16S
1812 | VectorCompareKind::I16x8S
1813 | VectorCompareKind::I32x4S
1814 | VectorCompareKind::I64x2S => {
1815 self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
1816 }
1817 VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1818 self.asm
1824 .xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1825 self.asm
1826 .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1827 self.asm
1828 .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1829 self.asm.xmm_vex_rr(AvxOpcode::Vpxor, lhs, rhs, dst);
1830 }
1831 VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1832 self.asm
1834 .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
1835 }
1836 }
1837 Ok(())
1838 }
1839
1840 fn v128_ge(
1841 &mut self,
1842 dst: WritableReg,
1843 lhs: Reg,
1844 rhs: Reg,
1845 kind: VectorCompareKind,
1846 ) -> Result<()> {
1847 self.ensure_has_avx()?;
1848
1849 match kind {
1850 VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1851 self.asm
1853 .xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1854 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1855 }
1856 VectorCompareKind::I64x2S => {
1857 self.asm
1860 .xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
1861 self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
1862 self.asm
1863 .xmm_vex_rr(AvxOpcode::Vpxor, dst.to_reg(), rhs, dst);
1864 }
1865 VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1866 self.asm
1868 .xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1869 self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1870 }
1871 VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1872 self.asm
1874 .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
1875 }
1876 }
1877
1878 Ok(())
1879 }
1880
1881 fn fence(&mut self) -> Result<()> {
1882 self.asm.fence(FenceKind::MFence);
1883 Ok(())
1884 }
1885
1886 fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
1887 self.ensure_has_avx()?;
1888
1889 let tmp = regs::scratch_xmm();
1890 self.asm
1892 .xmm_vex_rr(AvxOpcode::Vpcmpeqd, tmp, tmp, writable!(tmp));
1893 self.asm
1895 .xmm_vex_rr(AvxOpcode::Vpxor, tmp, dst.to_reg(), dst);
1896 Ok(())
1897 }
1898
1899 fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1900 self.ensure_has_avx()?;
1901 self.asm.xmm_vex_rr(AvxOpcode::Vpand, src1, src2, dst);
1902 Ok(())
1903 }
1904
1905 fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1906 self.ensure_has_avx()?;
1907 self.asm.xmm_vex_rr(AvxOpcode::Vpandn, src1, src2, dst);
1908 Ok(())
1909 }
1910
1911 fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1912 self.ensure_has_avx()?;
1913 self.asm.xmm_vex_rr(AvxOpcode::Vpor, src1, src2, dst);
1914 Ok(())
1915 }
1916
1917 fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1918 self.ensure_has_avx()?;
1919 self.asm.xmm_vex_rr(AvxOpcode::Vpxor, src1, src2, dst);
1920 Ok(())
1921 }
1922
1923 fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
1924 self.ensure_has_avx()?;
1925 let tmp = regs::scratch_xmm();
1926 self.v128_and(src1, mask, writable!(tmp))?;
1927 self.v128_and_not(mask, src2, dst)?;
1928 self.v128_or(dst.to_reg(), tmp, dst)?;
1929
1930 Ok(())
1931 }
1932
1933 fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
1934 self.ensure_has_avx()?;
1935 self.asm.xmm_vptest(src, src);
1936 self.asm.setcc(IntCmpKind::Ne, dst);
1937 Ok(())
1938 }
1939
1940 fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
1941 self.ensure_has_avx()?;
1942 match kind {
1943 V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
1944 V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
1945 V128ConvertKind::I32x4U => {
1946 let scratch = writable!(regs::scratch_xmm());
1947
1948 self.asm
1952 .xmm_vpsll_rr(src, scratch, 0x10, kind.src_lane_size());
1953 self.asm
1954 .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x10, kind.src_lane_size());
1955 self.asm
1956 .xmm_vpsub_rrr(src, scratch.to_reg(), dst, kind.src_lane_size());
1957
1958 self.asm
1960 .xmm_vcvt_rr(scratch.to_reg(), scratch, VcvtKind::I32ToF32);
1961
1962 self.asm
1964 .xmm_vpsrl_rr(dst.to_reg(), dst, 1, kind.src_lane_size());
1965 self.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
1967 self.asm
1969 .xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
1970 self.asm
1972 .xmm_vaddp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.src_lane_size());
1973 }
1974 V128ConvertKind::I32x4LowU => {
1975 let conversion_constant = self
1983 .asm
1984 .add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
1985 self.asm
1986 .xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
1987 let conversion_constant = self.asm.add_constant(&[
1989 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
1990 0x00, 0x30, 0x43,
1991 ]);
1992 self.asm.xmm_vsub_rrm(
1993 dst.to_reg(),
1994 &conversion_constant,
1995 dst,
1996 kind.dst_lane_size(),
1997 );
1998 }
1999 }
2000 Ok(())
2001 }
2002
2003 fn v128_narrow(
2004 &mut self,
2005 src1: Reg,
2006 src2: Reg,
2007 dst: WritableReg,
2008 kind: V128NarrowKind,
2009 ) -> Result<()> {
2010 self.ensure_has_avx()?;
2011 match kind {
2012 V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
2013 self.asm
2014 .xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
2015 }
2016 V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
2017 self.asm
2018 .xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
2019 }
2020 }
2021 Ok(())
2022 }
2023
2024 fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2025 self.ensure_has_avx()?;
2026 self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
2027 Ok(())
2028 }
2029
2030 fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2031 self.ensure_has_avx()?;
2032 self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
2033 Ok(())
2034 }
2035
2036 fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
2037 self.ensure_has_avx()?;
2038 match kind {
2039 V128ExtendKind::LowI8x16S
2040 | V128ExtendKind::LowI8x16U
2041 | V128ExtendKind::LowI16x8S
2042 | V128ExtendKind::LowI16x8U
2043 | V128ExtendKind::LowI32x4S
2044 | V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
            V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
                self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
            }
            V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
                let scratch = regs::scratch_xmm();
                self.asm
                    .xmm_vex_rr(AvxOpcode::Vpxor, scratch, scratch, writable!(scratch));
                self.asm
                    .xmm_vpunpckh_rrr(src, scratch, dst, kind.src_lane_size());
            }
            V128ExtendKind::HighI32x4S => {
                self.asm
                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
            }
            V128ExtendKind::HighI32x4U => {
                let scratch = regs::scratch_xmm();
                self.asm
                    .xmm_vxorp_rrr(scratch, scratch, writable!(scratch), kind.src_lane_size());
                self.asm
                    .xmm_vunpckhp_rrr(src, scratch, dst, kind.src_lane_size());
            }
        }
        Ok(())
    }

    fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
        self.ensure_has_avx()?;

        let op = match kind {
            V128AddKind::F32x4 => AvxOpcode::Vaddps,
            V128AddKind::F64x2 => AvxOpcode::Vaddpd,
            V128AddKind::I8x16 => AvxOpcode::Vpaddb,
            V128AddKind::I8x16SatS => AvxOpcode::Vpaddsb,
            V128AddKind::I8x16SatU => AvxOpcode::Vpaddusb,
            V128AddKind::I16x8 => AvxOpcode::Vpaddw,
            V128AddKind::I16x8SatS => AvxOpcode::Vpaddsw,
            V128AddKind::I16x8SatU => AvxOpcode::Vpaddusw,
            V128AddKind::I32x4 => AvxOpcode::Vpaddd,
            V128AddKind::I64x2 => AvxOpcode::Vpaddq,
        };
        self.asm.xmm_vex_rr(op, lhs, rhs, dst);
        Ok(())
    }

    fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
        self.ensure_has_avx()?;

        let op = match kind {
            V128SubKind::F32x4 => AvxOpcode::Vsubps,
            V128SubKind::F64x2 => AvxOpcode::Vsubpd,
            V128SubKind::I8x16 => AvxOpcode::Vpsubb,
            V128SubKind::I8x16SatS => AvxOpcode::Vpsubsb,
            V128SubKind::I8x16SatU => AvxOpcode::Vpsubusb,
            V128SubKind::I16x8 => AvxOpcode::Vpsubw,
            V128SubKind::I16x8SatS => AvxOpcode::Vpsubsw,
            V128SubKind::I16x8SatU => AvxOpcode::Vpsubusw,
            V128SubKind::I32x4 => AvxOpcode::Vpsubd,
            V128SubKind::I64x2 => AvxOpcode::Vpsubq,
        };
        self.asm.xmm_vex_rr(op, lhs, rhs, dst);
        Ok(())
    }

    fn v128_mul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128MulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let rhs = context.pop_to_reg(self, None)?;
        let lhs = context.pop_to_reg(self, None)?;

        let mul_avx = |this: &mut Self, op| {
            this.asm
                .xmm_vex_rr(op, lhs.reg, rhs.reg, writable!(lhs.reg));
        };

        let mul_i64x2_avx512 = |this: &mut Self| {
            this.asm
                .xmm_rm_rvex3(Avx512Opcode::Vpmullq, lhs.reg, rhs.reg, writable!(lhs.reg));
        };

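        // Without AVX-512 there is no packed 64x64->64 multiply, so the
        // fallback computes each 64-bit lane as
        //   lo(lhs) * lo(rhs) + ((hi(lhs) * lo(rhs) + lo(lhs) * hi(rhs)) << 32)
        // out of 32x32->64 partial products; only the low 32 bits of the two
        // cross products survive the shift, which is why a signed multiply is
        // also acceptable for one of them.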
        let mul_i64x2_fallback =
            |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
                let tmp1 = regs::scratch_xmm();
                let tmp2 = context.any_fpr(this)?;

                this.asm
                    .xmm_vex_ri(AvxOpcode::Vpsrlq, lhs.reg, 32, writable!(tmp1));
                this.asm
                    .xmm_vex_rr(AvxOpcode::Vpmuldq, tmp1, rhs.reg, writable!(tmp2));

                this.asm
                    .xmm_vex_ri(AvxOpcode::Vpsrlq, rhs.reg, 32, writable!(tmp1));

                this.asm
                    .xmm_vex_rr(AvxOpcode::Vpmuludq, tmp1, lhs.reg, writable!(tmp1));

                this.asm
                    .xmm_vex_rr(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(tmp1));

                this.asm
                    .xmm_vex_ri(AvxOpcode::Vpsllq, tmp1, 32, writable!(tmp1));

                this.asm
                    .xmm_vex_rr(AvxOpcode::Vpmuludq, lhs.reg, rhs.reg, writable!(tmp2));

                this.asm
                    .xmm_vex_rr(AvxOpcode::Vpaddq, tmp1, tmp2, writable!(lhs.reg));

                context.free_reg(tmp2);

                Ok(())
            };

        match kind {
            V128MulKind::F32x4 => mul_avx(self, AvxOpcode::Vmulps),
            V128MulKind::F64x2 => mul_avx(self, AvxOpcode::Vmulpd),
            V128MulKind::I16x8 => mul_avx(self, AvxOpcode::Vpmullw),
            V128MulKind::I32x4 => mul_avx(self, AvxOpcode::Vpmulld),
            V128MulKind::I64x2
                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
            {
                mul_i64x2_avx512(self)
            }
            V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
        }

        context.stack.push(lhs.into());
        context.free_reg(rhs);

        Ok(())
    }

    fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {
                self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())
            }
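            // There is no vpabsq outside AVX-512, so build a per-lane sign
            // mask (arithmetic shift of the upper 32 bits, replicated across
            // the 64-bit lane) and compute abs(x) as (x ^ mask) - mask.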
            V128AbsKind::I64x2 => {
                let scratch = writable!(regs::scratch_xmm());
                self.asm.xmm_vpsra_rri(src, scratch, 0x1f, OperandSize::S32);
                self.asm
                    .xmm_vpshuf_rr(scratch.to_reg(), scratch, 0b11_11_01_01, OperandSize::S32);
                self.asm
                    .xmm_vex_rr(AvxOpcode::Vpxor, src, scratch.to_reg(), dst);
                self.asm
                    .xmm_vpsub_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
            }
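            // Floating-point abs clears the sign bit: materialize an all-ones
            // vector, shift each lane right by one to get 0x7fff..., and AND
            // it with the source.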
            V128AbsKind::F32x4 | V128AbsKind::F64x2 => {
                let scratch = writable!(regs::scratch_xmm());
                self.asm.xmm_vpcmpeq_rrr(
                    scratch,
                    scratch.to_reg(),
                    scratch.to_reg(),
                    kind.lane_size(),
                );
                self.asm
                    .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x1, kind.lane_size());
                self.asm
                    .xmm_vandp_rrr(src, scratch.to_reg(), dst, kind.lane_size());
            }
        }
        Ok(())
    }

    fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {
        self.ensure_has_avx()?;

        let tmp = regs::scratch_xmm();
        match kind {
            V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {
                self.v128_xor(tmp, tmp, writable!(tmp))?;
                self.v128_sub(tmp, op.to_reg(), op, kind.into())?;
            }
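            // Floating-point negation flips the sign bit: build an all-ones
            // vector, shift each lane left so only the sign bit survives, and
            // XOR it with the operand.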
            V128NegKind::F32x4 | V128NegKind::F64x2 => {
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(tmp), tmp, tmp, kind.lane_size());
                self.asm.xmm_vpsll_rr(
                    tmp,
                    writable!(tmp),
                    (kind.lane_size().num_bits() - 1) as u32,
                    kind.lane_size(),
                );
                self.asm
                    .xmm_vxorp_rrr(op.to_reg(), tmp, op, kind.lane_size());
            }
        }
        Ok(())
    }

    fn v128_shift(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        lane_width: OperandSize,
        kind: ShiftKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;
        let shift_amount = context.pop_to_reg(self, None)?.reg;
        let operand = context.pop_to_reg(self, None)?.reg;

        let tmp_xmm = regs::scratch_xmm();
        let tmp = regs::scratch();
        let amount_mask = lane_width.num_bits() - 1;
        self.and(
            writable!(shift_amount),
            shift_amount,
            RegImm::i32(amount_mask as i32),
            OperandSize::S32,
        )?;

        let shl_normal = |this: &mut Self, op: AvxOpcode| {
            this.asm
                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);
            this.asm
                .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand));
        };

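        // There is no 8-bit vector shift on x64, so i8x16 shifts use a 16-bit
        // shift and then AND with a mask, looked up from the I8X16_*_MASKS
        // tables by shift amount (16 bytes per entry), to clear the bits that
        // leaked in from the neighboring byte.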
        let shift_i8x16 = |this: &mut Self, masks: &'static [u8], op: AvxOpcode| {
            this.asm
                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);

            this.asm
                .xmm_vex_rr(op, operand, tmp_xmm, writable!(operand));

            let masks_addr = this.asm.add_constant(masks);

            this.asm.lea(&masks_addr, writable!(tmp), OperandSize::S64);

            this.asm
                .shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);

            this.asm.xmm_vmovdqu_mr(
                &Address::ImmRegRegShift {
                    simm32: 0,
                    base: tmp,
                    index: shift_amount,
                    shift: 0,
                },
                writable!(tmp_xmm),
                MemFlags::trusted(),
            );

            this.asm
                .xmm_vex_rr(AvxOpcode::Vpand, tmp_xmm, operand, writable!(operand));
        };

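        // i64x2 arithmetic right shifts have no dedicated instruction, so they
        // are emulated with logical shifts and the sign mask m = 1 << 63:
        //   x >>s n  ==  ((x >>u n) ^ (m >>u n)) - (m >>u n)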
        let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
            const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;

            let tmp_xmm2 = context.any_fpr(this)?;

            this.asm
                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);

            let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());

            this.asm
                .xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpsrlq, tmp_xmm2, tmp_xmm, writable!(tmp_xmm2));
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpsrlq, operand, tmp_xmm, writable!(operand));
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpxor, operand, tmp_xmm2, writable!(operand));
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpsubq, operand, tmp_xmm2, writable!(operand));

            context.free_reg(tmp_xmm2);

            Ok(())
        };

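        // i8x16 arithmetic right shifts widen each byte to 16 bits by
        // interleaving the operand with itself (so the byte ends up in the
        // high half of each word, hence the extra `+ 8` on the shift amount),
        // shift the words arithmetically, and re-pack with signed saturation.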
        let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
            this.asm
                .add_ir(8, writable!(shift_amount), OperandSize::S32);
            this.asm
                .avx_gpr_to_xmm(shift_amount, writable!(tmp_xmm), OperandSize::S32);

            let tmp_lo = context.any_fpr(this)?;
            let tmp_hi = context.any_fpr(this)?;

            this.asm
                .xmm_vex_rr(AvxOpcode::Vpunpcklbw, operand, operand, writable!(tmp_lo));
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpunpckhbw, operand, operand, writable!(tmp_hi));

            this.asm
                .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_lo, tmp_xmm, writable!(tmp_lo));
            this.asm
                .xmm_vex_rr(AvxOpcode::Vpsraw, tmp_hi, tmp_xmm, writable!(tmp_hi));

            this.asm
                .xmm_vex_rr(AvxOpcode::Vpacksswb, tmp_lo, tmp_hi, writable!(operand));

            context.free_reg(tmp_lo);
            context.free_reg(tmp_hi);

            Ok(())
        };

        match (lane_width, kind) {
            (OperandSize::S8, ShiftKind::Shl) => {
                shift_i8x16(self, &I8X16_ISHL_MASKS, AvxOpcode::Vpsllw)
            }
            (OperandSize::S16, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllw),
            (OperandSize::S32, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpslld),
            (OperandSize::S64, ShiftKind::Shl) => shl_normal(self, AvxOpcode::Vpsllq),
            (OperandSize::S8, ShiftKind::ShrU) => {
                shift_i8x16(self, &I8X16_USHR_MASKS, AvxOpcode::Vpsrlw)
            }
            (OperandSize::S16, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlw),
            (OperandSize::S32, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrld),
            (OperandSize::S64, ShiftKind::ShrU) => shl_normal(self, AvxOpcode::Vpsrlq),
            (OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,
            (OperandSize::S16, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsraw),
            (OperandSize::S32, ShiftKind::ShrS) => shl_normal(self, AvxOpcode::Vpsrad),
            (OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,

            _ => bail!(CodeGenError::invalid_operand_combination()),
        }

        context.free_reg(shift_amount);
        context
            .stack
            .push(TypedReg::new(WasmValType::V128, operand).into());
        Ok(())
    }

    fn v128_q15mulr_sat_s(
        &mut self,
        lhs: Reg,
        rhs: Reg,
        dst: WritableReg,
        size: OperandSize,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);

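        // vpmulhrsw only produces 0x8000 when both inputs are 0x8000 (that is,
        // -1.0 * -1.0 in Q15, which must saturate to 0x7fff). Compare against
        // a vector of 0x8000 lanes and XOR the matching lanes to flip them to
        // 0x7fff.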
        let address = self.asm.add_constant(&[
            0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
            0x00, 0x80,
        ]);
        self.asm
            .xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, dst.to_reg(), rhs, dst);
        Ok(())
    }

    fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;

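        // Compare every lane against zero; the comparison result is all
        // zeroes only if no lane was zero, which vptest + sete then turns
        // into a scalar 0/1 in `dst`.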
        let scratch = regs::scratch_xmm();
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, scratch, scratch, writable!(scratch));
        self.asm.xmm_vpcmpeq_rrr(writable!(src), src, scratch, size);
        self.asm.xmm_vptest(src, src);
        self.asm.setcc(IntCmpKind::Eq, dst);
        Ok(())
    }

    fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;

        match size {
            OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),
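            // There is no 16-bit movemask, so narrow the words to bytes with
            // signed saturation (which preserves the sign bits), take the
            // byte movemask, and shift away the duplicated upper half.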
            OperandSize::S16 => {
                self.asm
                    .xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);
                self.asm
                    .xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);
                self.asm
                    .shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);
            }
            OperandSize::S32 | OperandSize::S64 => self.asm.xmm_vmovskp_rr(src, dst, size, size),
            _ => unimplemented!(),
        }

        Ok(())
    }

    fn v128_trunc(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128TruncKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let reg = writable!(context.pop_to_reg(self, None)?.reg);
        match kind {
            V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(
                reg.to_reg(),
                reg,
                VroundMode::TowardZero,
                kind.dst_lane_size(),
            ),
            V128TruncKind::I32x4FromF32x4S => {
                self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size());
            }
            V128TruncKind::I32x4FromF32x4U => {
                let temp_reg = writable!(context.any_fpr(self)?);
                self.v128_trunc_sat_f32x4_u(
                    reg,
                    temp_reg,
                    kind.src_lane_size(),
                    kind.dst_lane_size(),
                );
                context.free_reg(temp_reg.to_reg());
            }
            V128TruncKind::I32x4FromF64x2SZero => {
                self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size());
            }
            V128TruncKind::I32x4FromF64x2UZero => {
                self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size());
            }
        }

        context.stack.push(TypedReg::v128(reg.to_reg()).into());
        Ok(())
    }

    fn v128_min(
        &mut self,
        src1: Reg,
        src2: Reg,
        dst: WritableReg,
        kind: V128MinKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128MinKind::I8x16S
            | V128MinKind::I8x16U
            | V128MinKind::I16x8S
            | V128MinKind::I16x8U
            | V128MinKind::I32x4S
            | V128MinKind::I32x4U => {
                let op = match kind {
                    V128MinKind::I8x16S => AvxOpcode::Vpminsb,
                    V128MinKind::I8x16U => AvxOpcode::Vpminub,
                    V128MinKind::I16x8S => AvxOpcode::Vpminsw,
                    V128MinKind::I16x8U => AvxOpcode::Vpminuw,
                    V128MinKind::I32x4S => AvxOpcode::Vpminsd,
                    V128MinKind::I32x4U => AvxOpcode::Vpminud,
                    _ => unreachable!(),
                };
                self.asm.xmm_vex_rr(op, src1, src2, dst);
            }
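            // For floats, take the minimum in both operand orders and OR the
            // results so that -0.0 wins over +0.0, then use an unordered
            // compare to force lanes with a NaN input to NaN and quiet them.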
            V128MinKind::F32x4 | V128MinKind::F64x2 => {
                let scratch = writable!(regs::scratch_xmm());
                self.asm
                    .xmm_vminp_rrr(src1, src2, scratch, kind.lane_size());
                self.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());
                self.asm
                    .xmm_vorp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
                self.asm.xmm_vcmpp_rrr(
                    writable!(src2),
                    src2,
                    dst.to_reg(),
                    kind.lane_size(),
                    VcmpKind::Unord,
                );
                self.asm
                    .xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());
                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
            }
        }

        Ok(())
    }

    fn v128_max(
        &mut self,
        src1: Reg,
        src2: Reg,
        dst: WritableReg,
        kind: V128MaxKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128MaxKind::I8x16S
            | V128MaxKind::I8x16U
            | V128MaxKind::I16x8S
            | V128MaxKind::I16x8U
            | V128MaxKind::I32x4S
            | V128MaxKind::I32x4U => {
                let op = match kind {
                    V128MaxKind::I8x16S => AvxOpcode::Vpmaxsb,
                    V128MaxKind::I8x16U => AvxOpcode::Vpmaxub,
                    V128MaxKind::I16x8S => AvxOpcode::Vpmaxsw,
                    V128MaxKind::I16x8U => AvxOpcode::Vpmaxuw,
                    V128MaxKind::I32x4S => AvxOpcode::Vpmaxsd,
                    V128MaxKind::I32x4U => AvxOpcode::Vpmaxud,
                    _ => unreachable!(),
                };
                self.asm.xmm_vex_rr(op, src1, src2, dst);
            }
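            // For floats, take the maximum in both operand orders and combine
            // the two results (the XOR/OR/SUB sequence below) so that +0.0
            // wins over -0.0; an unordered compare then marks lanes with a
            // NaN input so they can be quieted afterwards.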
            V128MaxKind::F32x4 | V128MaxKind::F64x2 => {
                let scratch = writable!(regs::scratch_xmm());
                self.asm
                    .xmm_vmaxp_rrr(src1, src2, scratch, kind.lane_size());
                self.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());
                self.asm
                    .xmm_vxorp_rrr(dst.to_reg(), scratch.to_reg(), dst, kind.lane_size());
                self.asm.xmm_vorp_rrr(
                    dst.to_reg(),
                    scratch.to_reg(),
                    writable!(src2),
                    kind.lane_size(),
                );
                self.asm
                    .xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());
                self.asm.xmm_vcmpp_rrr(
                    writable!(src2),
                    src2,
                    src2,
                    kind.lane_size(),
                    VcmpKind::Unord,
                );
                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
            }
        }
        Ok(())
    }

    fn v128_extmul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128ExtMulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let src1 = context.pop_to_reg(self, None)?;
        let src2 = context.pop_to_reg(self, None)?;

        let ext_kind = kind.into();
        self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
        self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;

        context.stack.push(src2.into());
        context.stack.push(src1.into());

        self.v128_mul(context, kind.into())
    }

    fn v128_extadd_pairwise(
        &mut self,
        src: Reg,
        dst: WritableReg,
        kind: V128ExtAddKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128ExtAddKind::I8x16S => {
                let scratch = regs::scratch_xmm();
                let mask = self.asm.add_constant(&[1; 16]);
                self.asm.xmm_mov_mr(
                    &mask,
                    writable!(scratch),
                    OperandSize::S128,
                    MemFlags::trusted(),
                );
                self.asm
                    .xmm_vex_rr(AvxOpcode::Vpmaddubsw, scratch, src, dst);
            }
            V128ExtAddKind::I8x16U => {
                let mask = self.asm.add_constant(&[1; 16]);
                self.asm
                    .xmm_vpmaddubs_rmr(src, &mask, dst, OperandSize::S16);
            }
            V128ExtAddKind::I16x8S => {
                let mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
            }
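            // There is no unsigned pmaddwd: bias each u16 lane by XORing with
            // 0x8000 (reinterpreting it as a signed value), do the pairwise
            // multiply-add against 1s, then add 0x10000 to every 32-bit
            // result to undo the two biases.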
            V128ExtAddKind::I16x8U => {
                let xor_mask = self.asm.add_constant(&[
                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
                    0x80, 0x00, 0x80,
                ]);
                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);

                let madd_mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);

                let add_mask = self
                    .asm
                    .add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
                self.asm
                    .xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
            }
        }
        Ok(())
    }

    fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vex_rr(AvxOpcode::Vpmaddwd, lhs, rhs, dst);
        Ok(())
    }

    fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
        self.ensure_has_avx()?;

        let reg = writable!(context.pop_to_reg(self, None)?.reg);
        let scratch = writable!(regs::scratch_xmm());

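        // Per-byte popcount via the classic pshufb nibble trick: split each
        // byte into its low and high nibble, look both up in a 16-entry table
        // of nibble popcounts, and add the two lookups.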
        let address = self.asm.add_constant(&[
            0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
            0x0F, 0x0F,
        ]);
        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
        self.asm
            .xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);

        let address = self.asm.add_constant(&[
            0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
        ]);
        let reg2 = writable!(context.any_fpr(self)?);
        self.asm
            .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
        self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
        self.asm
            .xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg());
        context.free_reg(reg2.to_reg());

        self.asm
            .xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8);

        context.stack.push(TypedReg::v128(reg.to_reg()).into());
        Ok(())
    }

    fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
        Ok(())
    }

    fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);
        Ok(())
    }

    fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vsqrtp_rr(src, dst, size);
        Ok(())
    }

    fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm
            .xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);
        Ok(())
    }

    fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm
            .xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);
        Ok(())
    }

    fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm
            .xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);
        Ok(())
    }

    fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);
        Ok(())
    }

    fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);
        Ok(())
    }
}

impl MacroAssembler {
    pub fn new(
        ptr_size: impl PtrSize,
        shared_flags: settings::Flags,
        isa_flags: x64_settings::Flags,
    ) -> Result<Self> {
        let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size()).into();

        Ok(Self {
            sp_offset: 0,
            sp_max: 0,
            stack_max_use_add: None,
            asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),
            flags: isa_flags,
            shared_flags,
            ptr_size: ptr_type.try_into()?,
        })
    }

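    // Emits a patchable `add` of the function's maximum stack usage into
    // `reg`. The final value is not known until emission is complete, so the
    // add is recorded here and, presumably, patched with the real maximum
    // when the assembler buffer is finalized.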
    fn add_stack_max(&mut self, reg: Reg) {
        assert!(self.stack_max_use_add.is_none());
        let patch = PatchableAddToReg::new(reg, OperandSize::S64, self.asm.buffer_mut());
        self.stack_max_use_add.replace(patch);
    }

    fn ensure_has_avx(&self) -> Result<()> {
        anyhow::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);
        Ok(())
    }

    fn ensure_has_avx2(&self) -> Result<()> {
        anyhow::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);
        Ok(())
    }

    fn ensure_has_avx512vl(&self) -> Result<()> {
        anyhow::ensure!(
            self.flags.has_avx512vl(),
            CodeGenError::UnimplementedForNoAvx512VL
        );
        Ok(())
    }

    fn ensure_has_avx512dq(&self) -> Result<()> {
        anyhow::ensure!(
            self.flags.has_avx512dq(),
            CodeGenError::UnimplementedForNoAvx512DQ
        );
        Ok(())
    }

    fn increment_sp(&mut self, bytes: u32) {
        self.sp_offset += bytes;

        self.sp_max = self.sp_max.max(self.sp_offset);
    }

    fn decrement_sp(&mut self, bytes: u32) {
        assert!(
            self.sp_offset >= bytes,
            "sp offset = {}; bytes = {}",
            self.sp_offset,
            bytes
        );
        self.sp_offset -= bytes;
    }

    fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {
        match constant {
            I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),
            I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),
            _ => Err(anyhow!(CodeGenError::unsupported_imm())),
        }
    }

    fn load_impl(
        &mut self,
        src: Address,
        dst: WritableReg,
        size: OperandSize,
        flags: MemFlags,
    ) -> Result<()> {
        if dst.to_reg().is_int() {
            let ext = size.extend_to::<Zero>(OperandSize::S64);
            self.asm.movzx_mr(&src, dst, ext, flags);
        } else {
            self.asm.xmm_mov_mr(&src, dst, size, flags);
        }

        Ok(())
    }

    fn store_impl(
        &mut self,
        src: RegImm,
        dst: Address,
        size: OperandSize,
        flags: MemFlags,
    ) -> Result<()> {
        let _ = match src {
            RegImm::Imm(imm) => match imm {
                I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),
                I::I64(v) => match v.try_into() {
                    Ok(v) => self.asm.mov_im(v, &dst, size, flags),
                    Err(_) => {
                        let scratch = regs::scratch();
                        self.asm.mov_ir(v, writable!(scratch), size);
                        self.asm.mov_rm(scratch, &dst, size, flags);
                    }
                },
                I::F32(v) => {
                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
                    let float_scratch = regs::scratch_xmm();
                    self.asm
                        .xmm_mov_mr(&addr, writable!(float_scratch), size, MemFlags::trusted());
                    self.asm.xmm_mov_rm(float_scratch, &dst, size, flags);
                }
                I::F64(v) => {
                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
                    let float_scratch = regs::scratch_xmm();
                    self.asm
                        .xmm_mov_mr(&addr, writable!(float_scratch), size, MemFlags::trusted());
                    self.asm.xmm_mov_rm(float_scratch, &dst, size, flags);
                }
                I::V128(v) => {
                    let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
                    let vector_scratch = regs::scratch_xmm();
                    self.asm.xmm_mov_mr(
                        &addr,
                        writable!(vector_scratch),
                        size,
                        MemFlags::trusted(),
                    );
                    self.asm.xmm_mov_rm(vector_scratch, &dst, size, flags);
                }
            },
            RegImm::Reg(reg) => {
                if reg.is_int() {
                    self.asm.mov_rm(reg, &dst, size, flags);
                } else {
                    self.asm.xmm_mov_rm(reg, &dst, size, flags);
                }
            }
        };
        Ok(())
    }

    fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {
        if dst != lhs {
            Err(anyhow!(CodeGenError::invalid_two_arg_form()))
        } else {
            Ok(())
        }
    }

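    // Returns the vpshufd immediate that copies the low 64 bits of a vector
    // into both 64-bit halves (lane order 0, 1, 0, 1), which is what a
    // 64-bit splat needs.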
    fn vpshuf_mask_for_64_bit_splats() -> u8 {
        0b01_00_01_00
    }

    fn v128_trunc_sat_f32x4_s(
        &mut self,
        reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) {
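        // Zero out NaN lanes (x == x is false only for NaN), convert with
        // truncation (out-of-range lanes become 0x80000000), then use the
        // sign of the original value to rewrite positive overflows to
        // 0x7fffffff while leaving negative overflows at 0x80000000.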
        let scratch = writable!(regs::scratch_xmm());
        self.asm.xmm_vcmpp_rrr(
            scratch,
            reg.to_reg(),
            reg.to_reg(),
            src_lane_size,
            VcmpKind::Eq,
        );
        self.asm
            .xmm_vandp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, scratch.to_reg(), reg.to_reg(), scratch);
        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpand, reg.to_reg(), scratch.to_reg(), scratch);
        self.asm
            .xmm_vpsra_rri(scratch.to_reg(), scratch, 0x1F, dst_lane_size);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), scratch.to_reg(), reg);
    }

    fn v128_trunc_sat_f32x4_u(
        &mut self,
        reg: WritableReg,
        temp_reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) {
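        // There is no unsigned f32->i32 vector conversion; this roughly
        // follows the usual SSE lowering: clamp negatives and NaN to zero,
        // convert the portion of each value that fits in the signed range,
        // handle the part above INT32_MAX separately, and add the two pieces
        // so values up to UINT32_MAX saturate correctly.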
        let scratch = writable!(regs::scratch_xmm());
        self.asm
            .xmm_vxorp_rrr(reg.to_reg(), reg.to_reg(), scratch, src_lane_size);
        self.asm
            .xmm_vmaxp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
        self.asm
            .xmm_vpcmpeq_rrr(scratch, scratch.to_reg(), scratch.to_reg(), src_lane_size);
        self.asm
            .xmm_vpsrl_rr(scratch.to_reg(), scratch, 0x1, src_lane_size);
        self.asm
            .xmm_vcvt_rr(scratch.to_reg(), scratch, VcvtKind::I32ToF32);
        self.asm
            .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
        self.asm
            .xmm_vsub_rrr(reg.to_reg(), scratch.to_reg(), reg, dst_lane_size);
        self.asm.xmm_vcmpp_rrr(
            scratch,
            scratch.to_reg(),
            reg.to_reg(),
            dst_lane_size,
            VcmpKind::Le,
        );
        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), scratch.to_reg(), scratch);
        self.asm
            .xmm_vex_rr(AvxOpcode::Vpxor, reg.to_reg(), reg.to_reg(), reg);
        self.asm
            .xmm_vpmaxs_rrr(reg, scratch.to_reg(), reg.to_reg(), dst_lane_size);
        self.asm
            .xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);
    }

    fn v128_trunc_sat_f64x2_s_zero(&mut self, reg: WritableReg, src_lane_size: OperandSize) {
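        // Force NaN lanes to zero, clamp the remaining values to INT32_MAX as
        // an f64 (the constant below is 2147483647.0), then convert with
        // truncation; the two upper i32 lanes of the result are left zeroed.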
        let scratch = writable!(regs::scratch_xmm());
        self.asm.xmm_vcmpp_rrr(
            scratch,
            reg.to_reg(),
            reg.to_reg(),
            src_lane_size,
            VcmpKind::Eq,
        );
        let address = self.asm.add_constant(&[
            0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,
            0xDF, 0x41,
        ]);
        self.asm
            .xmm_vandp_rrm(scratch.to_reg(), &address, scratch, src_lane_size);
        self.asm
            .xmm_vminp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
        self.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);
    }

    fn v128_trunc_sat_f64x2_u_zero(
        &mut self,
        reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) {
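        // Clamp to [0, UINT32_MAX] (NaN also becomes 0 via the max against
        // zero), truncate toward zero, add 2^52 so the integer value lands in
        // the low bits of each f64, and finally shuffle those low 32-bit
        // halves into the two lower lanes while zeroing the upper ones.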
        let scratch = writable!(regs::scratch_xmm());
        self.asm
            .xmm_vxorp_rrr(scratch.to_reg(), scratch.to_reg(), scratch, src_lane_size);
        self.asm
            .xmm_vmaxp_rrr(reg.to_reg(), scratch.to_reg(), reg, src_lane_size);
        let address = self.asm.add_constant(&[
            0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
            0xEF, 0x41,
        ]);
        self.asm
            .xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);
        self.asm
            .xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);
        let address = self.asm.add_constant(&[
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x30, 0x43,
        ]);
        self.asm
            .xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);
        self.asm.xmm_vshufp_rrri(
            reg.to_reg(),
            scratch.to_reg(),
            reg,
            0b10_00_10_00,
            dst_lane_size,
        );
    }

    fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
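        // `mask` is all 1s in the lanes that contained NaNs. Shift it right
        // and AND-NOT it into `dst`: non-NaN lanes (mask == 0) are left
        // untouched, while the low mantissa bits of NaN lanes are cleared so
        // the result is a quiet NaN rather than an arbitrary payload.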
        let amount_to_shift = 1 + size.mantissa_bits() + 1;
        self.asm
            .xmm_vpsrl_rr(mask.to_reg(), mask, amount_to_shift as u32, size);
        self.asm
            .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
    }
}