cranelift_codegen/isa/x64/inst/
emit.rs

1use crate::ir::immediates::{Ieee32, Ieee64};
2use crate::ir::KnownSymbol;
3use crate::isa::x64::encoding::evex::{EvexInstruction, EvexVectorLength, RegisterOrAmode};
4use crate::isa::x64::encoding::rex::{
5    emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
6    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
7    RexFlags,
8};
9use crate::isa::x64::encoding::vex::{VexInstruction, VexVectorLength};
10use crate::isa::x64::inst::args::*;
11use crate::isa::x64::inst::*;
12use crate::isa::x64::lower::isle::generated_code::{Atomic128RmwSeqOp, AtomicRmwSeqOp};
13
14/// A small helper to generate a signed conversion instruction.
15fn emit_signed_cvt(
16    sink: &mut MachBuffer<Inst>,
17    info: &EmitInfo,
18    state: &mut EmitState,
19    // Required to be RealRegs.
20    src: Reg,
21    dst: Writable<Reg>,
22    to_f64: bool,
23) {
24    // Handle an unsigned int, which is the "easy" case: a signed conversion will do the
25    // right thing.
26    let op = if to_f64 {
27        SseOpcode::Cvtsi2sd
28    } else {
29        SseOpcode::Cvtsi2ss
30    };
31    let dst = WritableXmm::from_writable_reg(dst).unwrap();
32    Inst::CvtIntToFloat {
33        op,
34        dst,
35        src1: dst.to_reg(),
36        src2: GprMem::unwrap_new(RegMem::reg(src)),
37        src2_size: OperandSize::Size64,
38    }
39    .emit(sink, info, state);
40}
41
42/// Emits a one way conditional jump if CC is set (true).
43fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
44    let cond_start = sink.cur_offset();
45    let cond_disp_off = cond_start + 2;
46    sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel32);
47    sink.put1(0x0F);
48    sink.put1(0x80 + cc.get_enc());
49    sink.put4(0x0);
50}
51
52/// Emits a relocation, attaching the current source location as well.
53fn emit_reloc(sink: &mut MachBuffer<Inst>, kind: Reloc, name: &ExternalName, addend: Addend) {
54    sink.add_reloc(kind, name, addend);
55}
56
57/// The top-level emit function.
58///
59/// Important!  Do not add improved (shortened) encoding cases to existing
60/// instructions without also adding tests for those improved encodings.  That
61/// is a dangerous game that leads to hard-to-track-down errors in the emitted
62/// code.
63///
64/// For all instructions, make sure to have test coverage for all of the
65/// following situations.  Do this by creating the cross product resulting from
66/// applying the following rules to each operand:
67///
68/// (1) for any insn that mentions a register: one test using a register from
69///     the group [rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi] and a second one
70///     using a register from the group [r8, r9, r10, r11, r12, r13, r14, r15].
71///     This helps detect incorrect REX prefix construction.
72///
73/// (2) for any insn that mentions a byte register: one test for each of the
74///     four encoding groups [al, cl, dl, bl], [spl, bpl, sil, dil],
75///     [r8b .. r11b] and [r12b .. r15b].  This checks that
76///     apparently-redundant REX prefixes are retained when required.
77///
78/// (3) for any insn that contains an immediate field, check the following
79///     cases: field is zero, field is in simm8 range (-128 .. 127), field is
80///     in simm32 range (-0x8000_0000 .. 0x7FFF_FFFF).  This is because some
81///     instructions that require a 32-bit immediate have a short-form encoding
82///     when the imm is in simm8 range.
83///
84/// Rules (1), (2) and (3) don't apply for registers within address expressions
85/// (`Addr`s).  Those are already pretty well tested, and the registers in them
86/// don't have any effect on the containing instruction (apart from possibly
/// requiring REX prefix bits).
88///
89/// When choosing registers for a test, avoid using registers with the same
90/// offset within a given group.  For example, don't use rax and r8, since they
91/// both have the lowest 3 bits as 000, and so the test won't detect errors
92/// where those 3-bit register sub-fields are confused by the emitter.  Instead
93/// use (eg) rax (lo3 = 000) and r9 (lo3 = 001).  Similarly, don't use (eg) cl
94/// and bpl since they have the same offset in their group; use instead (eg) cl
95/// and sil.
96///
97/// For all instructions, also add a test that uses only low-half registers
98/// (rax .. rdi, xmm0 .. xmm7) etc, so as to check that any redundant REX
99/// prefixes are correctly omitted.  This low-half restriction must apply to
100/// _all_ registers in the insn, even those in address expressions.
101///
102/// Following these rules creates large numbers of test cases, but it's the
103/// only way to make the emitter reliable.
104///
105/// Known possible improvements:
106///
107/// * there's a shorter encoding for shl/shr/sar by a 1-bit immediate.  (Do we
108///   care?)
109pub(crate) fn emit(
110    inst: &Inst,
111    sink: &mut MachBuffer<Inst>,
112    info: &EmitInfo,
113    state: &mut EmitState,
114) {
115    let matches_isa_flags = |iset_requirement: &InstructionSet| -> bool {
116        match iset_requirement {
117            // Cranelift assumes SSE2 at least.
118            InstructionSet::SSE | InstructionSet::SSE2 => true,
119            InstructionSet::CMPXCHG16b => info.isa_flags.use_cmpxchg16b(),
120            InstructionSet::SSSE3 => info.isa_flags.use_ssse3(),
121            InstructionSet::SSE41 => info.isa_flags.use_sse41(),
122            InstructionSet::SSE42 => info.isa_flags.use_sse42(),
123            InstructionSet::Popcnt => info.isa_flags.use_popcnt(),
124            InstructionSet::Lzcnt => info.isa_flags.use_lzcnt(),
125            InstructionSet::BMI1 => info.isa_flags.use_bmi1(),
126            InstructionSet::BMI2 => info.isa_flags.has_bmi2(),
127            InstructionSet::FMA => info.isa_flags.has_fma(),
128            InstructionSet::AVX => info.isa_flags.has_avx(),
129            InstructionSet::AVX2 => info.isa_flags.has_avx2(),
130            InstructionSet::AVX512BITALG => info.isa_flags.has_avx512bitalg(),
131            InstructionSet::AVX512DQ => info.isa_flags.has_avx512dq(),
132            InstructionSet::AVX512F => info.isa_flags.has_avx512f(),
133            InstructionSet::AVX512VBMI => info.isa_flags.has_avx512vbmi(),
134            InstructionSet::AVX512VL => info.isa_flags.has_avx512vl(),
135        }
136    };
137
138    // Certain instructions may be present in more than one ISA feature set; we must at least match
139    // one of them in the target CPU.
140    let isa_requirements = inst.available_in_any_isa();
141    if !isa_requirements.is_empty() && !isa_requirements.iter().all(matches_isa_flags) {
142        panic!(
143            "Cannot emit inst '{inst:?}' for target; failed to match ISA requirements: {isa_requirements:?}"
144        )
145    }
146    match inst {
147        Inst::AluRmiR {
148            size,
149            op,
150            src1,
151            src2,
152            dst: reg_g,
153        } => {
154            let src1 = src1.to_reg();
155            let reg_g = reg_g.to_reg().to_reg();
156            debug_assert_eq!(src1, reg_g);
157            let src2 = src2.clone().to_reg_mem_imm().clone();
158
159            let prefix = if *size == OperandSize::Size16 {
160                LegacyPrefixes::_66
161            } else {
162                LegacyPrefixes::None
163            };
164
165            let mut rex = RexFlags::from(*size);
166            let (opcode_r, opcode_m, subopcode_i) = match op {
167                AluRmiROpcode::Add => (0x01, 0x03, 0),
168                AluRmiROpcode::Adc => (0x11, 0x13, 2),
169                AluRmiROpcode::Sub => (0x29, 0x2B, 5),
170                AluRmiROpcode::Sbb => (0x19, 0x1B, 3),
171                AluRmiROpcode::And => (0x21, 0x23, 4),
172                AluRmiROpcode::Or => (0x09, 0x0B, 1),
173                AluRmiROpcode::Xor => (0x31, 0x33, 6),
174            };
175
176            let (opcode_r, opcode_m) = if *size == OperandSize::Size8 {
177                (opcode_r - 1, opcode_m - 1)
178            } else {
179                (opcode_r, opcode_m)
180            };
181
182            if *size == OperandSize::Size8 {
183                debug_assert!(reg_g.is_real());
184                rex.always_emit_if_8bit_needed(reg_g);
185            }
186
187            match src2 {
188                RegMemImm::Reg { reg: reg_e } => {
189                    if *size == OperandSize::Size8 {
190                        debug_assert!(reg_e.is_real());
191                        rex.always_emit_if_8bit_needed(reg_e);
192                    }
193
194                    // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
195                    // duality). Do this too, so as to be able to compare generated machine
196                    // code easily.
197                    emit_std_reg_reg(sink, prefix, opcode_r, 1, reg_e, reg_g, rex);
198                }
199
200                RegMemImm::Mem { addr } => {
201                    let amode = addr.finalize(state.frame_layout(), sink);
202                    // Here we revert to the "normal" G-E ordering.
203                    emit_std_reg_mem(sink, prefix, opcode_m, 1, reg_g, &amode, rex, 0);
204                }
205
206                RegMemImm::Imm { simm32 } => {
207                    let imm_size = if *size == OperandSize::Size8 {
208                        1
209                    } else {
210                        if low8_will_sign_extend_to_32(simm32) {
211                            1
212                        } else {
213                            if *size == OperandSize::Size16 {
214                                2
215                            } else {
216                                4
217                            }
218                        }
219                    };
220
221                    let opcode = if *size == OperandSize::Size8 {
222                        0x80
223                    } else if low8_will_sign_extend_to_32(simm32) {
224                        0x83
225                    } else {
226                        0x81
227                    };
228
229                    // And also here we use the "normal" G-E ordering.
230                    let enc_g = int_reg_enc(reg_g);
231                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode_i, enc_g, rex);
232                    emit_simm(sink, imm_size, simm32);
233                }
234            }
235        }
236
237        &Inst::AluConstOp { op, size, dst } => {
238            let dst = WritableGpr::from_writable_reg(dst.to_writable_reg()).unwrap();
239            emit(
240                &Inst::AluRmiR {
241                    size,
242                    op,
243                    dst,
244                    src1: dst.to_reg(),
245                    src2: dst.to_reg().into(),
246                },
247                sink,
248                info,
249                state,
250            );
251        }
252
253        Inst::AluRM {
254            size,
255            src1_dst,
256            src2,
257            op,
258            lock,
259        } => {
260            let src2 = src2.to_reg();
261            let src1_dst = src1_dst.finalize(state.frame_layout(), sink).clone();
262
263            let opcode = match op {
264                AluRmiROpcode::Add => 0x01,
265                AluRmiROpcode::Sub => 0x29,
266                AluRmiROpcode::And => 0x21,
267                AluRmiROpcode::Or => 0x09,
268                AluRmiROpcode::Xor => 0x31,
269                _ => panic!("Unsupported read-modify-write ALU opcode"),
270            };
271
272            let prefix = match (size, lock) {
273                (OperandSize::Size16, false) => LegacyPrefixes::_66,
274                (OperandSize::Size16, true) => LegacyPrefixes::_66F0,
275                (_, false) => LegacyPrefixes::None,
276                (_, true) => LegacyPrefixes::_F0,
277            };
278            let opcode = if *size == OperandSize::Size8 {
279                opcode - 1
280            } else {
281                opcode
282            };
283
284            let mut rex = RexFlags::from(*size);
285            if *size == OperandSize::Size8 {
286                debug_assert!(src2.is_real());
287                rex.always_emit_if_8bit_needed(src2);
288            }
289
290            let enc_g = int_reg_enc(src2);
291            emit_std_enc_mem(sink, prefix, opcode, 1, enc_g, &src1_dst, rex, 0);
292        }
293
294        Inst::AluRmRVex {
295            size,
296            op,
297            dst,
298            src1,
299            src2,
300        } => {
301            use AluRmROpcode::*;
302            use LegacyPrefixes as LP;
303
304            let dst = dst.to_reg().to_reg();
305            let src1 = src1.to_reg();
306            let src2 = match src2.clone().to_reg_mem().clone() {
307                RegMem::Reg { reg } => {
308                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
309                }
310                RegMem::Mem { addr } => {
311                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
312                }
313            };
314
315            let w = match size {
316                OperandSize::Size32 => false,
317                OperandSize::Size64 => true,
318
319                // the other cases would be rejected by isle constructors
320                _ => unreachable!(),
321            };
322
323            let (prefix, opcode) = match op {
324                Andn => (LP::None, 0xf2),
325                Sarx => (LP::_F3, 0xf7),
326                Shrx => (LP::_F2, 0xf7),
327                Shlx => (LP::_66, 0xf7),
328                Bzhi => (LP::None, 0xf5),
329            };
330
331            VexInstruction::new()
332                .prefix(prefix)
333                .map(OpcodeMap::_0F38)
334                .w(w)
335                .reg(dst.to_real_reg().unwrap().hw_enc())
336                .vvvv(src1.to_real_reg().unwrap().hw_enc())
337                .rm(src2)
338                .opcode(opcode)
339                .encode(sink);
340        }
341
342        Inst::UnaryRmR { size, op, src, dst } => {
343            let dst = dst.to_reg().to_reg();
344            let rex_flags = RexFlags::from(*size);
345            use UnaryRmROpcode::*;
346            let prefix = match size {
347                OperandSize::Size16 => match op {
348                    Bsr | Bsf => LegacyPrefixes::_66,
349                    Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_66F3,
350                },
351                OperandSize::Size32 | OperandSize::Size64 => match op {
352                    Bsr | Bsf => LegacyPrefixes::None,
353                    Lzcnt | Tzcnt | Popcnt => LegacyPrefixes::_F3,
354                },
355                _ => unreachable!(),
356            };
357
358            let (opcode, num_opcodes) = match op {
359                Bsr => (0x0fbd, 2),
360                Bsf => (0x0fbc, 2),
361                Lzcnt => (0x0fbd, 2),
362                Tzcnt => (0x0fbc, 2),
363                Popcnt => (0x0fb8, 2),
364            };
365
366            match src.clone().into() {
367                RegMem::Reg { reg: src } => {
368                    emit_std_reg_reg(sink, prefix, opcode, num_opcodes, dst, src, rex_flags);
369                }
370                RegMem::Mem { addr: src } => {
371                    let amode = src.finalize(state.frame_layout(), sink).clone();
372                    emit_std_reg_mem(sink, prefix, opcode, num_opcodes, dst, &amode, rex_flags, 0);
373                }
374            }
375        }
376
377        Inst::UnaryRmRVex { size, op, src, dst } => {
378            let dst = dst.to_reg().to_reg();
379            let src = match src.clone().to_reg_mem().clone() {
380                RegMem::Reg { reg } => {
381                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
382                }
383                RegMem::Mem { addr } => {
384                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
385                }
386            };
387
388            let (opcode, opcode_ext) = match op {
389                UnaryRmRVexOpcode::Blsr => (0xF3, 1),
390                UnaryRmRVexOpcode::Blsmsk => (0xF3, 2),
391                UnaryRmRVexOpcode::Blsi => (0xF3, 3),
392            };
393
394            VexInstruction::new()
395                .map(OpcodeMap::_0F38)
396                .w(*size == OperandSize::Size64)
397                .opcode(opcode)
398                .reg(opcode_ext)
399                .vvvv(dst.to_real_reg().unwrap().hw_enc())
400                .rm(src)
401                .encode(sink);
402        }
403
404        Inst::UnaryRmRImmVex {
405            size,
406            op,
407            src,
408            dst,
409            imm,
410        } => {
411            let dst = dst.to_reg().to_reg();
412            let src = match src.clone().to_reg_mem().clone() {
413                RegMem::Reg { reg } => {
414                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
415                }
416                RegMem::Mem { addr } => {
417                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
418                }
419            };
420
421            let opcode = match op {
422                UnaryRmRImmVexOpcode::Rorx => 0xF0,
423            };
424
425            VexInstruction::new()
426                .prefix(LegacyPrefixes::_F2)
427                .map(OpcodeMap::_0F3A)
428                .w(*size == OperandSize::Size64)
429                .opcode(opcode)
430                .reg(dst.to_real_reg().unwrap().hw_enc())
431                .rm(src)
432                .imm(*imm)
433                .encode(sink);
434        }
435
436        Inst::Not { size, src, dst } => {
437            let src = src.to_reg();
438            let dst = dst.to_reg().to_reg();
439            debug_assert_eq!(src, dst);
440            let rex_flags = RexFlags::from((*size, dst));
441            let (opcode, prefix) = match size {
442                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
443                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
444                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
445                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
446            };
447
448            let subopcode = 2;
449            let enc_src = int_reg_enc(dst);
450            emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
451        }
452
453        Inst::Neg { size, src, dst } => {
454            let src = src.to_reg();
455            let dst = dst.to_reg().to_reg();
456            debug_assert_eq!(src, dst);
457            let rex_flags = RexFlags::from((*size, dst));
458            let (opcode, prefix) = match size {
459                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
460                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
461                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
462                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
463            };
464
465            let subopcode = 3;
466            let enc_src = int_reg_enc(dst);
467            emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_src, rex_flags)
468        }
469
470        Inst::Div {
471            sign,
472            trap,
473            divisor,
474            ..
475        }
476        | Inst::Div8 {
477            sign,
478            trap,
479            divisor,
480            ..
481        } => {
482            let divisor = divisor.clone().to_reg_mem().clone();
483            let size = match inst {
484                Inst::Div {
485                    size,
486                    dividend_lo,
487                    dividend_hi,
488                    dst_quotient,
489                    dst_remainder,
490                    ..
491                } => {
492                    let dividend_lo = dividend_lo.to_reg();
493                    let dividend_hi = dividend_hi.to_reg();
494                    let dst_quotient = dst_quotient.to_reg().to_reg();
495                    let dst_remainder = dst_remainder.to_reg().to_reg();
496                    debug_assert_eq!(dividend_lo, regs::rax());
497                    debug_assert_eq!(dividend_hi, regs::rdx());
498                    debug_assert_eq!(dst_quotient, regs::rax());
499                    debug_assert_eq!(dst_remainder, regs::rdx());
500                    *size
501                }
502                Inst::Div8 { dividend, dst, .. } => {
503                    let dividend = dividend.to_reg();
504                    let dst = dst.to_reg().to_reg();
505                    debug_assert_eq!(dividend, regs::rax());
506                    debug_assert_eq!(dst, regs::rax());
507                    OperandSize::Size8
508                }
509                _ => unreachable!(),
510            };
511
512            let (opcode, prefix) = match size {
513                OperandSize::Size8 => (0xF6, LegacyPrefixes::None),
514                OperandSize::Size16 => (0xF7, LegacyPrefixes::_66),
515                OperandSize::Size32 => (0xF7, LegacyPrefixes::None),
516                OperandSize::Size64 => (0xF7, LegacyPrefixes::None),
517            };
518
519            sink.add_trap(*trap);
520
521            let subopcode = match sign {
522                DivSignedness::Signed => 7,
523                DivSignedness::Unsigned => 6,
524            };
525            match divisor {
526                RegMem::Reg { reg } => {
527                    let src = int_reg_enc(reg);
528                    emit_std_enc_enc(
529                        sink,
530                        prefix,
531                        opcode,
532                        1,
533                        subopcode,
534                        src,
535                        RexFlags::from((size, reg)),
536                    )
537                }
538                RegMem::Mem { addr: src } => {
539                    let amode = src.finalize(state.frame_layout(), sink);
540                    emit_std_enc_mem(
541                        sink,
542                        prefix,
543                        opcode,
544                        1,
545                        subopcode,
546                        &amode,
547                        RexFlags::from(size),
548                        0,
549                    );
550                }
551            }
552        }
553
554        Inst::Mul {
555            signed,
556            size,
557            src1,
558            src2,
559            dst_lo,
560            dst_hi,
561        } => {
562            let src1 = src1.to_reg();
563            let dst_lo = dst_lo.to_reg().to_reg();
564            let dst_hi = dst_hi.to_reg().to_reg();
565            debug_assert_eq!(src1, regs::rax());
566            debug_assert_eq!(dst_lo, regs::rax());
567            debug_assert_eq!(dst_hi, regs::rdx());
568            let src2 = src2.clone().to_reg_mem().clone();
569
570            let rex_flags = RexFlags::from(*size);
571            let prefix = match size {
572                OperandSize::Size16 => LegacyPrefixes::_66,
573                OperandSize::Size32 => LegacyPrefixes::None,
574                OperandSize::Size64 => LegacyPrefixes::None,
575                _ => unreachable!(),
576            };
577
578            let subopcode = if *signed { 5 } else { 4 };
579            match src2 {
580                RegMem::Reg { reg } => {
581                    let src = int_reg_enc(reg);
582                    emit_std_enc_enc(sink, prefix, 0xF7, 1, subopcode, src, rex_flags)
583                }
584                RegMem::Mem { addr: src } => {
585                    let amode = src.finalize(state.frame_layout(), sink);
586                    emit_std_enc_mem(sink, prefix, 0xF7, 1, subopcode, &amode, rex_flags, 0);
587                }
588            }
589        }
590        Inst::Mul8 {
591            signed,
592            src1,
593            src2,
594            dst,
595        } => {
596            let src1 = src1.to_reg();
597            let dst = dst.to_reg().to_reg();
598            debug_assert_eq!(src1, regs::rax());
599            debug_assert_eq!(dst, regs::rax());
600            let src2 = src2.clone().to_reg_mem().clone();
601
602            let mut rex_flags = RexFlags::from(OperandSize::Size8);
603            let prefix = LegacyPrefixes::None;
604            let subopcode = if *signed { 5 } else { 4 };
605            match src2 {
606                RegMem::Reg { reg } => {
607                    // The intel manual states:
608                    //
609                    // > r/m8 can not be encoded to access the following byte
610                    // > registers if a REX prefix is used: AH, BH, CH, DH
611                    //
612                    // And apparently that also means that a REX prefix must be
613                    // used if it's not one of those registers.
614                    if !(reg == regs::rax()
615                        || reg == regs::rbx()
616                        || reg == regs::rcx()
617                        || reg == regs::rdx())
618                    {
619                        rex_flags.always_emit();
620                    }
621                    let src = int_reg_enc(reg);
622                    emit_std_enc_enc(sink, prefix, 0xF6, 1, subopcode, src, rex_flags)
623                }
624                RegMem::Mem { addr } => {
625                    let amode = addr.finalize(state.frame_layout(), sink);
626                    emit_std_enc_mem(sink, prefix, 0xF6, 1, subopcode, &amode, rex_flags, 0);
627                }
628            }
629        }
630        Inst::IMul {
631            size,
632            src1,
633            src2,
634            dst,
635        } => {
636            let src1 = src1.to_reg();
637            let dst = dst.to_reg().to_reg();
638            debug_assert_eq!(src1, dst);
639            let src2 = src2.clone().to_reg_mem().clone();
640
641            let rex = RexFlags::from(*size);
642            let prefix = LegacyPrefixes::None;
643            match src2 {
644                RegMem::Reg { reg } => {
645                    emit_std_reg_reg(sink, prefix, 0x0FAF, 2, dst, reg, rex);
646                }
647
648                RegMem::Mem { addr } => {
649                    let amode = addr.finalize(state.frame_layout(), sink);
650                    emit_std_reg_mem(sink, prefix, 0x0FAF, 2, dst, &amode, rex, 0);
651                }
652            }
653        }
654
655        Inst::IMulImm {
656            size,
657            src1,
658            src2,
659            dst,
660        } => {
661            let dst = dst.to_reg().to_reg();
662            let src1 = src1.clone().to_reg_mem().clone();
663
664            let rex = RexFlags::from(*size);
665            let prefix = match size {
666                // NB: the intel manual doesn't seem to mention this prefix as
667                // being required
668                OperandSize::Size16 => LegacyPrefixes::_66,
669                _ => LegacyPrefixes::None,
670            };
671            let imm_size = if i8::try_from(*src2).is_ok() {
672                1
673            } else {
674                if *size == OperandSize::Size16 {
675                    2
676                } else {
677                    4
678                }
679            };
680            let opcode = if imm_size == 1 { 0x6B } else { 0x69 };
681            match src1 {
682                RegMem::Reg { reg } => {
683                    emit_std_reg_reg(sink, prefix, opcode, 1, dst, reg, rex);
684                }
685
686                RegMem::Mem { addr } => {
687                    let amode = addr.finalize(state.frame_layout(), sink);
688                    emit_std_reg_mem(sink, prefix, opcode, 1, dst, &amode, rex, imm_size);
689                }
690            }
691            emit_simm(sink, imm_size, *src2 as u32);
692        }
693
694        Inst::MulX {
695            size,
696            src1,
697            src2,
698            dst_lo,
699            dst_hi,
700        } => {
701            let src1 = src1.to_reg();
702            let dst_lo = dst_lo.to_reg().to_reg();
703            let dst_hi = dst_hi.to_reg().to_reg();
704            debug_assert_eq!(src1, regs::rdx());
705            let src2 = match src2.clone().to_reg_mem().clone() {
706                RegMem::Reg { reg } => {
707                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
708                }
709                RegMem::Mem { addr } => {
710                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
711                }
712            };
713
714            let dst_hi = dst_hi.to_real_reg().unwrap().hw_enc();
715            let dst_lo = if dst_lo.is_invalid_sentinel() {
716                dst_hi
717            } else {
718                dst_lo.to_real_reg().unwrap().hw_enc()
719            };
720
721            VexInstruction::new()
722                .prefix(LegacyPrefixes::_F2)
723                .map(OpcodeMap::_0F38)
724                .w(*size == OperandSize::Size64)
725                .opcode(0xf6)
726                .reg(dst_hi)
727                .vvvv(dst_lo)
728                .rm(src2)
729                .encode(sink);
730        }
731
732        Inst::SignExtendData { size, src, dst } => {
733            let src = src.to_reg();
734            let dst = dst.to_reg().to_reg();
735            debug_assert_eq!(src, regs::rax());
736            if *size == OperandSize::Size8 {
737                debug_assert_eq!(dst, regs::rax());
738            } else {
739                debug_assert_eq!(dst, regs::rdx());
740            }
741            match size {
742                OperandSize::Size8 => {
743                    sink.put1(0x66);
744                    sink.put1(0x98);
745                }
746                OperandSize::Size16 => {
747                    sink.put1(0x66);
748                    sink.put1(0x99);
749                }
750                OperandSize::Size32 => sink.put1(0x99),
751                OperandSize::Size64 => {
752                    sink.put1(0x48);
753                    sink.put1(0x99);
754                }
755            }
756        }
757
758        Inst::CheckedSRemSeq { divisor, .. } | Inst::CheckedSRemSeq8 { divisor, .. } => {
759            let divisor = divisor.to_reg();
760
761            // Validate that the register constraints of the dividend and the
762            // destination are all as expected.
763            let (dst, size) = match inst {
764                Inst::CheckedSRemSeq {
765                    dividend_lo,
766                    dividend_hi,
767                    dst_quotient,
768                    dst_remainder,
769                    size,
770                    ..
771                } => {
772                    let dividend_lo = dividend_lo.to_reg();
773                    let dividend_hi = dividend_hi.to_reg();
774                    let dst_quotient = dst_quotient.to_reg().to_reg();
775                    let dst_remainder = dst_remainder.to_reg().to_reg();
776                    debug_assert_eq!(dividend_lo, regs::rax());
777                    debug_assert_eq!(dividend_hi, regs::rdx());
778                    debug_assert_eq!(dst_quotient, regs::rax());
779                    debug_assert_eq!(dst_remainder, regs::rdx());
780                    (regs::rdx(), *size)
781                }
782                Inst::CheckedSRemSeq8 { dividend, dst, .. } => {
783                    let dividend = dividend.to_reg();
784                    let dst = dst.to_reg().to_reg();
785                    debug_assert_eq!(dividend, regs::rax());
786                    debug_assert_eq!(dst, regs::rax());
787                    (regs::rax(), OperandSize::Size8)
788                }
789                _ => unreachable!(),
790            };
791
792            // Generates the following code sequence:
793            //
794            // cmp -1 %divisor
795            // jnz $do_op
796            //
797            // ;; for srem, result is 0
798            // mov #0, %dst
799            // j $done
800            //
801            // $do_op:
802            // idiv %divisor
803            //
804            // $done:
805
806            let do_op = sink.get_label();
807            let done_label = sink.get_label();
808
809            // Check if the divisor is -1, and if it isn't then immediately
810            // go to the `idiv`.
811            let inst = Inst::cmp_rmi_r(size, divisor, RegMemImm::imm(0xffffffff));
812            inst.emit(sink, info, state);
813            one_way_jmp(sink, CC::NZ, do_op);
814
815            // ... otherwise the divisor is -1 and the result is always 0. This
816            // is written to the destination register which will be %rax for
817            // 8-bit srem and %rdx otherwise.
818            //
819            // Note that for 16-to-64-bit srem operations this leaves the
820            // second destination, %rax, unchanged. This isn't semantically
821            // correct if a lowering actually tries to use the `dst_quotient`
822            // output but for srem only the `dst_remainder` output is used for
823            // now.
824            let inst = Inst::imm(OperandSize::Size64, 0, Writable::from_reg(dst));
825            inst.emit(sink, info, state);
826            let inst = Inst::jmp_known(done_label);
827            inst.emit(sink, info, state);
828
829            // Here the `idiv` is executed, which is different depending on the
830            // size
831            sink.bind_label(do_op, state.ctrl_plane_mut());
832            let inst = match size {
833                OperandSize::Size8 => Inst::div8(
834                    DivSignedness::Signed,
835                    TrapCode::INTEGER_DIVISION_BY_ZERO,
836                    RegMem::reg(divisor),
837                    Gpr::unwrap_new(regs::rax()),
838                    Writable::from_reg(Gpr::unwrap_new(regs::rax())),
839                ),
840                _ => Inst::div(
841                    size,
842                    DivSignedness::Signed,
843                    TrapCode::INTEGER_DIVISION_BY_ZERO,
844                    RegMem::reg(divisor),
845                    Gpr::unwrap_new(regs::rax()),
846                    Gpr::unwrap_new(regs::rdx()),
847                    Writable::from_reg(Gpr::unwrap_new(regs::rax())),
848                    Writable::from_reg(Gpr::unwrap_new(regs::rdx())),
849                ),
850            };
851            inst.emit(sink, info, state);
852
853            sink.bind_label(done_label, state.ctrl_plane_mut());
854        }
855
856        Inst::Imm {
857            dst_size,
858            simm64,
859            dst,
860        } => {
861            let dst = dst.to_reg().to_reg();
862            let enc_dst = int_reg_enc(dst);
863            if *dst_size == OperandSize::Size64 {
864                if low32_will_sign_extend_to_64(*simm64) {
865                    // Sign-extended move imm32.
866                    emit_std_enc_enc(
867                        sink,
868                        LegacyPrefixes::None,
869                        0xC7,
870                        1,
871                        /* subopcode */ 0,
872                        enc_dst,
873                        RexFlags::set_w(),
874                    );
875                    sink.put4(*simm64 as u32);
876                } else {
877                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
878                    sink.put1(0xB8 | (enc_dst & 7));
879                    sink.put8(*simm64);
880                }
881            } else {
882                if ((enc_dst >> 3) & 1) == 1 {
883                    sink.put1(0x41);
884                }
885                sink.put1(0xB8 | (enc_dst & 7));
886                sink.put4(*simm64 as u32);
887            }
888        }
889
890        Inst::MovImmM { size, simm32, dst } => {
891            let dst = &dst.finalize(state.frame_layout(), sink).clone();
892            let default_rex = RexFlags::clear_w();
893            let default_opcode = 0xC7;
894            let bytes = size.to_bytes();
895            let prefix = LegacyPrefixes::None;
896
897            let (opcode, rex, size, prefix) = match *size {
898                // In the 8-bit case, we don't need to enforce REX flags via
899                // `always_emit_if_8bit_needed()` since the destination
900                // operand is a memory operand, not a possibly 8-bit register.
901                OperandSize::Size8 => (0xC6, default_rex, bytes, prefix),
902                OperandSize::Size16 => (0xC7, default_rex, bytes, LegacyPrefixes::_66),
903                OperandSize::Size64 => (default_opcode, RexFlags::from(*size), bytes, prefix),
904
905                _ => (default_opcode, default_rex, bytes, prefix),
906            };
907
908            // 8-bit C6 /0 ib
909            // 16-bit 0x66 C7 /0 iw
910            // 32-bit C7 /0 id
911            // 64-bit REX.W C7 /0 id
912            emit_std_enc_mem(sink, prefix, opcode, 1, /*subopcode*/ 0, dst, rex, 0);
913            emit_simm(sink, size, *simm32 as u32);
914        }
915
916        Inst::MovRR { size, src, dst } => {
917            let src = src.to_reg();
918            let dst = dst.to_reg().to_reg();
919            emit_std_reg_reg(
920                sink,
921                LegacyPrefixes::None,
922                0x89,
923                1,
924                src,
925                dst,
926                RexFlags::from(*size),
927            );
928        }
929
930        Inst::MovFromPReg { src, dst } => {
931            let src: Reg = (*src).into();
932            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&src));
933            let src = Gpr::unwrap_new(src);
934            let size = OperandSize::Size64;
935            let dst = WritableGpr::from_writable_reg(dst.to_writable_reg()).unwrap();
936            Inst::MovRR { size, src, dst }.emit(sink, info, state);
937        }
938
939        Inst::MovToPReg { src, dst } => {
940            let src = src.to_reg();
941            let src = Gpr::unwrap_new(src);
942            let dst: Reg = (*dst).into();
943            debug_assert!([regs::rsp(), regs::rbp(), regs::pinned_reg()].contains(&dst));
944            let dst = WritableGpr::from_writable_reg(Writable::from_reg(dst)).unwrap();
945            let size = OperandSize::Size64;
946            Inst::MovRR { size, src, dst }.emit(sink, info, state);
947        }
948
949        Inst::MovzxRmR { ext_mode, src, dst } => {
950            let dst = dst.to_reg().to_reg();
951            let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
952                ExtMode::BL => {
953                    // MOVZBL is (REX.W==0) 0F B6 /r
954                    (0x0FB6, 2, RexFlags::clear_w())
955                }
956                ExtMode::BQ => {
957                    // MOVZBQ is (REX.W==1) 0F B6 /r
958                    // I'm not sure why the Intel manual offers different
959                    // encodings for MOVZBQ than for MOVZBL.  AIUI they should
960                    // achieve the same, since MOVZBL is just going to zero out
961                    // the upper half of the destination anyway.
962                    (0x0FB6, 2, RexFlags::set_w())
963                }
964                ExtMode::WL => {
965                    // MOVZWL is (REX.W==0) 0F B7 /r
966                    (0x0FB7, 2, RexFlags::clear_w())
967                }
968                ExtMode::WQ => {
969                    // MOVZWQ is (REX.W==1) 0F B7 /r
970                    (0x0FB7, 2, RexFlags::set_w())
971                }
972                ExtMode::LQ => {
973                    // This is just a standard 32 bit load, and we rely on the
974                    // default zero-extension rule to perform the extension.
975                    // Note that in reg/reg mode, gcc seems to use the swapped form R/RM, which we
976                    // don't do here, since it's the same encoding size.
977                    // MOV r/m32, r32 is (REX.W==0) 8B /r
978                    (0x8B, 1, RexFlags::clear_w())
979                }
980            };
981
982            match src.clone().to_reg_mem() {
983                RegMem::Reg { reg: src } => {
984                    match ext_mode {
985                        ExtMode::BL | ExtMode::BQ => {
986                            // A redundant REX prefix must be emitted for certain register inputs.
987                            rex_flags.always_emit_if_8bit_needed(src);
988                        }
989                        _ => {}
990                    }
991                    emit_std_reg_reg(
992                        sink,
993                        LegacyPrefixes::None,
994                        opcodes,
995                        num_opcodes,
996                        dst,
997                        src,
998                        rex_flags,
999                    )
1000                }
1001
1002                RegMem::Mem { addr: src } => {
1003                    let src = &src.finalize(state.frame_layout(), sink).clone();
1004
1005                    emit_std_reg_mem(
1006                        sink,
1007                        LegacyPrefixes::None,
1008                        opcodes,
1009                        num_opcodes,
1010                        dst,
1011                        src,
1012                        rex_flags,
1013                        0,
1014                    )
1015                }
1016            }
1017        }
1018
1019        Inst::Mov64MR { src, dst } => {
1020            let dst = dst.to_reg().to_reg();
1021            let src = &src.finalize(state.frame_layout(), sink).clone();
1022
1023            emit_std_reg_mem(
1024                sink,
1025                LegacyPrefixes::None,
1026                0x8B,
1027                1,
1028                dst,
1029                src,
1030                RexFlags::set_w(),
1031                0,
1032            )
1033        }
1034
1035        Inst::LoadEffectiveAddress { addr, dst, size } => {
1036            let dst = dst.to_reg().to_reg();
1037            let amode = addr.finalize(state.frame_layout(), sink).clone();
1038
1039            // If this `lea` can actually get encoded as an `add` then do that
1040            // instead. Currently all candidate `iadd`s become an `lea`
1041            // pseudo-instruction here but maximizing the sue of `lea` is not
1042            // necessarily optimal. The `lea` instruction goes through dedicated
1043            // address units on cores which are finite and disjoint from the
1044            // general ALU, so if everything uses `lea` then those units can get
1045            // saturated while leaving the ALU idle.
1046            //
1047            // To help make use of more parts of a cpu, this attempts to use
1048            // `add` when it's semantically equivalent to `lea`, or otherwise
1049            // when the `dst` register is the same as the `base` or `index`
1050            // register.
1051            //
1052            // FIXME: ideally regalloc is informed of this constraint. Register
1053            // allocation of `lea` should "attempt" to put the `base` in the
1054            // same register as `dst` but not at the expense of generating a
1055            // `mov` instruction. Currently that's not possible but perhaps one
1056            // day it may be worth it.
1057            match amode {
1058                // If `base == dst` then this is `add $imm, %dst`, so encode
1059                // that instead.
1060                Amode::ImmReg {
1061                    simm32,
1062                    base,
1063                    flags: _,
1064                } if base == dst => {
1065                    let inst = Inst::alu_rmi_r(
1066                        *size,
1067                        AluRmiROpcode::Add,
1068                        RegMemImm::imm(simm32 as u32),
1069                        Writable::from_reg(dst),
1070                    );
1071                    inst.emit(sink, info, state);
1072                }
1073                // If the offset is 0 and the shift is 0 (meaning multiplication
1074                // by 1) then:
1075                //
1076                // * If `base == dst`, then this is `add %index, %base`
1077                // * If `index == dst`, then this is `add %base, %index`
1078                //
1079                // Encode the appropriate instruction here in that case.
1080                Amode::ImmRegRegShift {
1081                    simm32: 0,
1082                    base,
1083                    index,
1084                    shift: 0,
1085                    flags: _,
1086                } if base == dst || index == dst => {
1087                    let (dst, operand) = if base == dst {
1088                        (base, index)
1089                    } else {
1090                        (index, base)
1091                    };
1092                    let inst = Inst::alu_rmi_r(
1093                        *size,
1094                        AluRmiROpcode::Add,
1095                        RegMemImm::reg(operand.to_reg()),
1096                        Writable::from_reg(dst.to_reg()),
1097                    );
1098                    inst.emit(sink, info, state);
1099                }
1100
1101                // If `lea`'s 3-operand mode is leveraged by regalloc, or if
1102                // it's fancy like imm-plus-shift-plus-base, then `lea` is
1103                // actually emitted.
1104                _ => {
1105                    let flags = match size {
1106                        OperandSize::Size32 => RexFlags::clear_w(),
1107                        OperandSize::Size64 => RexFlags::set_w(),
1108                        _ => unreachable!(),
1109                    };
1110                    emit_std_reg_mem(sink, LegacyPrefixes::None, 0x8D, 1, dst, &amode, flags, 0);
1111                }
1112            };
1113        }
1114
1115        Inst::MovsxRmR { ext_mode, src, dst } => {
1116            let dst = dst.to_reg().to_reg();
1117            let (opcodes, num_opcodes, mut rex_flags) = match ext_mode {
1118                ExtMode::BL => {
1119                    // MOVSBL is (REX.W==0) 0F BE /r
1120                    (0x0FBE, 2, RexFlags::clear_w())
1121                }
1122                ExtMode::BQ => {
1123                    // MOVSBQ is (REX.W==1) 0F BE /r
1124                    (0x0FBE, 2, RexFlags::set_w())
1125                }
1126                ExtMode::WL => {
1127                    // MOVSWL is (REX.W==0) 0F BF /r
1128                    (0x0FBF, 2, RexFlags::clear_w())
1129                }
1130                ExtMode::WQ => {
1131                    // MOVSWQ is (REX.W==1) 0F BF /r
1132                    (0x0FBF, 2, RexFlags::set_w())
1133                }
1134                ExtMode::LQ => {
1135                    // MOVSLQ is (REX.W==1) 63 /r
1136                    (0x63, 1, RexFlags::set_w())
1137                }
1138            };
1139
1140            match src.clone().to_reg_mem() {
1141                RegMem::Reg { reg: src } => {
1142                    match ext_mode {
1143                        ExtMode::BL | ExtMode::BQ => {
1144                            // A redundant REX prefix must be emitted for certain register inputs.
1145                            rex_flags.always_emit_if_8bit_needed(src);
1146                        }
1147                        _ => {}
1148                    }
1149                    emit_std_reg_reg(
1150                        sink,
1151                        LegacyPrefixes::None,
1152                        opcodes,
1153                        num_opcodes,
1154                        dst,
1155                        src,
1156                        rex_flags,
1157                    )
1158                }
1159
1160                RegMem::Mem { addr: src } => {
1161                    let src = &src.finalize(state.frame_layout(), sink).clone();
1162
1163                    emit_std_reg_mem(
1164                        sink,
1165                        LegacyPrefixes::None,
1166                        opcodes,
1167                        num_opcodes,
1168                        dst,
1169                        src,
1170                        rex_flags,
1171                        0,
1172                    )
1173                }
1174            }
1175        }
1176
1177        Inst::MovRM { size, src, dst } => {
1178            let src = src.to_reg();
1179            let dst = &dst.finalize(state.frame_layout(), sink).clone();
1180
1181            let prefix = match size {
1182                OperandSize::Size16 => LegacyPrefixes::_66,
1183                _ => LegacyPrefixes::None,
1184            };
1185
1186            let opcode = match size {
1187                OperandSize::Size8 => 0x88,
1188                _ => 0x89,
1189            };
1190
1191            // This is one of the few places where the presence of a
1192            // redundant REX prefix changes the meaning of the
1193            // instruction.
1194            let rex = RexFlags::from((*size, src));
1195
1196            //  8-bit: MOV r8, r/m8 is (REX.W==0) 88 /r
1197            // 16-bit: MOV r16, r/m16 is 66 (REX.W==0) 89 /r
1198            // 32-bit: MOV r32, r/m32 is (REX.W==0) 89 /r
1199            // 64-bit: MOV r64, r/m64 is (REX.W==1) 89 /r
1200            emit_std_reg_mem(sink, prefix, opcode, 1, src, dst, rex, 0);
1201        }
1202
1203        Inst::ShiftR {
1204            size,
1205            kind,
1206            src,
1207            num_bits,
1208            dst,
1209        } => {
1210            let src = src.to_reg();
1211            let dst = dst.to_reg().to_reg();
1212            debug_assert_eq!(src, dst);
1213            let subopcode = match kind {
1214                ShiftKind::RotateLeft => 0,
1215                ShiftKind::RotateRight => 1,
1216                ShiftKind::ShiftLeft => 4,
1217                ShiftKind::ShiftRightLogical => 5,
1218                ShiftKind::ShiftRightArithmetic => 7,
1219            };
1220            let enc_dst = int_reg_enc(dst);
1221            let rex_flags = RexFlags::from((*size, dst));
1222            match num_bits.as_imm8_reg() {
1223                &Imm8Reg::Reg { reg } => {
1224                    debug_assert_eq!(reg, regs::rcx());
1225                    let (opcode, prefix) = match size {
1226                        OperandSize::Size8 => (0xD2, LegacyPrefixes::None),
1227                        OperandSize::Size16 => (0xD3, LegacyPrefixes::_66),
1228                        OperandSize::Size32 => (0xD3, LegacyPrefixes::None),
1229                        OperandSize::Size64 => (0xD3, LegacyPrefixes::None),
1230                    };
1231
1232                    // SHL/SHR/SAR %cl, reg8 is (REX.W==0) D2 /subopcode
1233                    // SHL/SHR/SAR %cl, reg16 is 66 (REX.W==0) D3 /subopcode
1234                    // SHL/SHR/SAR %cl, reg32 is (REX.W==0) D3 /subopcode
1235                    // SHL/SHR/SAR %cl, reg64 is (REX.W==1) D3 /subopcode
1236                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
1237                }
1238
1239                &Imm8Reg::Imm8 { imm: num_bits } => {
1240                    let (opcode, prefix) = match size {
1241                        OperandSize::Size8 => (0xC0, LegacyPrefixes::None),
1242                        OperandSize::Size16 => (0xC1, LegacyPrefixes::_66),
1243                        OperandSize::Size32 => (0xC1, LegacyPrefixes::None),
1244                        OperandSize::Size64 => (0xC1, LegacyPrefixes::None),
1245                    };
1246
1247                    // SHL/SHR/SAR $ib, reg8 is (REX.W==0) C0 /subopcode
1248                    // SHL/SHR/SAR $ib, reg16 is 66 (REX.W==0) C1 /subopcode
1249                    // SHL/SHR/SAR $ib, reg32 is (REX.W==0) C1 /subopcode ib
1250                    // SHL/SHR/SAR $ib, reg64 is (REX.W==1) C1 /subopcode ib
1251                    // When the shift amount is 1, there's an even shorter encoding, but we don't
1252                    // bother with that nicety here.
1253                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_dst, rex_flags);
1254                    sink.put1(num_bits);
1255                }
1256            }
1257        }
1258
1259        Inst::XmmRmiReg {
1260            opcode,
1261            src1,
1262            src2,
1263            dst,
1264        } => {
1265            let src1 = src1.to_reg();
1266            let dst = dst.to_reg().to_reg();
1267            debug_assert_eq!(src1, dst);
1268            let rex = RexFlags::clear_w();
1269            let prefix = LegacyPrefixes::_66;
1270            let src2 = src2.clone().to_reg_mem_imm();
1271            if let RegMemImm::Imm { simm32 } = src2 {
1272                let (opcode_bytes, reg_digit) = match opcode {
1273                    SseOpcode::Psllw => (0x0F71, 6),
1274                    SseOpcode::Pslld => (0x0F72, 6),
1275                    SseOpcode::Psllq => (0x0F73, 6),
1276                    SseOpcode::Psraw => (0x0F71, 4),
1277                    SseOpcode::Psrad => (0x0F72, 4),
1278                    SseOpcode::Psrlw => (0x0F71, 2),
1279                    SseOpcode::Psrld => (0x0F72, 2),
1280                    SseOpcode::Psrlq => (0x0F73, 2),
1281                    _ => panic!("invalid opcode: {opcode}"),
1282                };
1283                let dst_enc = reg_enc(dst);
1284                emit_std_enc_enc(sink, prefix, opcode_bytes, 2, reg_digit, dst_enc, rex);
1285                let imm = (simm32)
1286                    .try_into()
1287                    .expect("the immediate must be convertible to a u8");
1288                sink.put1(imm);
1289            } else {
1290                let opcode_bytes = match opcode {
1291                    SseOpcode::Psllw => 0x0FF1,
1292                    SseOpcode::Pslld => 0x0FF2,
1293                    SseOpcode::Psllq => 0x0FF3,
1294                    SseOpcode::Psraw => 0x0FE1,
1295                    SseOpcode::Psrad => 0x0FE2,
1296                    SseOpcode::Psrlw => 0x0FD1,
1297                    SseOpcode::Psrld => 0x0FD2,
1298                    SseOpcode::Psrlq => 0x0FD3,
1299                    _ => panic!("invalid opcode: {opcode}"),
1300                };
1301
1302                match src2 {
1303                    RegMemImm::Reg { reg } => {
1304                        emit_std_reg_reg(sink, prefix, opcode_bytes, 2, dst, reg, rex);
1305                    }
1306                    RegMemImm::Mem { addr } => {
1307                        let addr = &addr.finalize(state.frame_layout(), sink).clone();
1308                        emit_std_reg_mem(sink, prefix, opcode_bytes, 2, dst, addr, rex, 0);
1309                    }
1310                    RegMemImm::Imm { .. } => unreachable!(),
1311                }
1312            };
1313        }
1314
1315        Inst::CmpRmiR {
1316            size,
1317            src1: reg_g,
1318            src2: src_e,
1319            opcode,
1320        } => {
1321            let reg_g = reg_g.to_reg();
1322
1323            let is_cmp = match opcode {
1324                CmpOpcode::Cmp => true,
1325                CmpOpcode::Test => false,
1326            };
1327
1328            let mut prefix = LegacyPrefixes::None;
1329            if *size == OperandSize::Size16 {
1330                prefix = LegacyPrefixes::_66;
1331            }
1332            // A redundant REX prefix can change the meaning of this instruction.
1333            let mut rex = RexFlags::from((*size, reg_g));
1334
1335            match src_e.clone().to_reg_mem_imm() {
1336                RegMemImm::Reg { reg: reg_e } => {
1337                    if *size == OperandSize::Size8 {
1338                        // Check whether the E register forces the use of a redundant REX.
1339                        rex.always_emit_if_8bit_needed(reg_e);
1340                    }
1341
1342                    // Use the swapped operands encoding for CMP, to stay consistent with the output of
1343                    // gcc/llvm.
1344                    let opcode = match (*size, is_cmp) {
1345                        (OperandSize::Size8, true) => 0x38,
1346                        (_, true) => 0x39,
1347                        (OperandSize::Size8, false) => 0x84,
1348                        (_, false) => 0x85,
1349                    };
1350                    emit_std_reg_reg(sink, prefix, opcode, 1, reg_e, reg_g, rex);
1351                }
1352
1353                RegMemImm::Mem { addr } => {
1354                    let addr = &addr.finalize(state.frame_layout(), sink).clone();
1355                    // Whereas here we revert to the "normal" G-E ordering for CMP.
1356                    let opcode = match (*size, is_cmp) {
1357                        (OperandSize::Size8, true) => 0x3A,
1358                        (_, true) => 0x3B,
1359                        (OperandSize::Size8, false) => 0x84,
1360                        (_, false) => 0x85,
1361                    };
1362                    emit_std_reg_mem(sink, prefix, opcode, 1, reg_g, addr, rex, 0);
1363                }
1364
1365                RegMemImm::Imm { simm32 } => {
1366                    // FIXME JRS 2020Feb11: there are shorter encodings for
1367                    // cmp $imm, rax/eax/ax/al.
1368                    let use_imm8 = is_cmp && low8_will_sign_extend_to_32(simm32);
1369
1370                    // And also here we use the "normal" G-E ordering.
1371                    let opcode = if is_cmp {
1372                        if *size == OperandSize::Size8 {
1373                            0x80
1374                        } else if use_imm8 {
1375                            0x83
1376                        } else {
1377                            0x81
1378                        }
1379                    } else {
1380                        if *size == OperandSize::Size8 {
1381                            0xF6
1382                        } else {
1383                            0xF7
1384                        }
1385                    };
1386                    let subopcode = if is_cmp { 7 } else { 0 };
1387
1388                    let enc_g = int_reg_enc(reg_g);
1389                    emit_std_enc_enc(sink, prefix, opcode, 1, subopcode, enc_g, rex);
1390                    emit_simm(sink, if use_imm8 { 1 } else { size.to_bytes() }, simm32);
1391                }
1392            }
1393        }
1394
1395        Inst::Setcc { cc, dst } => {
1396            let dst = dst.to_reg().to_reg();
1397            let opcode = 0x0f90 + cc.get_enc() as u32;
1398            let mut rex_flags = RexFlags::clear_w();
1399            rex_flags.always_emit();
1400            emit_std_enc_enc(
1401                sink,
1402                LegacyPrefixes::None,
1403                opcode,
1404                2,
1405                0,
1406                reg_enc(dst),
1407                rex_flags,
1408            );
1409        }
1410
1411        Inst::Bswap { size, src, dst } => {
1412            let src = src.to_reg();
1413            let dst = dst.to_reg().to_reg();
1414            debug_assert_eq!(src, dst);
1415            let enc_reg = int_reg_enc(dst);
1416
1417            // BSWAP reg32 is (REX.W==0) 0F C8
1418            // BSWAP reg64 is (REX.W==1) 0F C8
1419            let rex_flags = RexFlags::from(*size);
1420            rex_flags.emit_one_op(sink, enc_reg);
1421
1422            sink.put1(0x0F);
1423            sink.put1(0xC8 | (enc_reg & 7));
1424        }
1425
1426        Inst::Cmove {
1427            size,
1428            cc,
1429            consequent,
1430            alternative,
1431            dst,
1432        } => {
1433            let alternative = alternative.to_reg();
1434            let dst = dst.to_reg().to_reg();
1435            debug_assert_eq!(alternative, dst);
1436            let rex_flags = RexFlags::from(*size);
1437            let prefix = match size {
1438                OperandSize::Size16 => LegacyPrefixes::_66,
1439                OperandSize::Size32 => LegacyPrefixes::None,
1440                OperandSize::Size64 => LegacyPrefixes::None,
1441                _ => unreachable!("invalid size spec for cmove"),
1442            };
1443            let opcode = 0x0F40 + cc.get_enc() as u32;
1444            match consequent.clone().to_reg_mem() {
1445                RegMem::Reg { reg } => {
1446                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, reg, rex_flags);
1447                }
1448                RegMem::Mem { addr } => {
1449                    let addr = &addr.finalize(state.frame_layout(), sink).clone();
1450                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex_flags, 0);
1451                }
1452            }
1453        }
1454
1455        Inst::XmmCmove {
1456            ty,
1457            cc,
1458            consequent,
1459            alternative,
1460            dst,
1461        } => {
1462            let alternative = alternative.to_reg();
1463            let dst = dst.to_writable_reg();
1464            debug_assert_eq!(alternative, dst.to_reg());
1465            let consequent = consequent.to_reg();
1466
1467            // Lowering of the Select IR opcode when the input is an fcmp relies on the fact that
1468            // this doesn't clobber flags. Make sure to not do so here.
1469            let next = sink.get_label();
1470
1471            // Jump if cc is *not* set.
1472            one_way_jmp(sink, cc.invert(), next);
1473
1474            let op = match *ty {
1475                types::F64 => SseOpcode::Movsd,
1476                types::F32 => SseOpcode::Movsd,
1477                types::F16 => SseOpcode::Movsd,
1478                types::F32X4 => SseOpcode::Movaps,
1479                types::F64X2 => SseOpcode::Movapd,
1480                ty => {
1481                    debug_assert!((ty.is_float() || ty.is_vector()) && ty.bytes() == 16);
1482                    SseOpcode::Movdqa
1483                }
1484            };
1485            let inst = Inst::xmm_unary_rm_r(op, consequent.into(), dst);
1486            inst.emit(sink, info, state);
1487
1488            sink.bind_label(next, state.ctrl_plane_mut());
1489        }
1490
1491        Inst::Push64 { src } => {
1492            let src = src.clone().to_reg_mem_imm().clone();
1493
1494            match src {
1495                RegMemImm::Reg { reg } => {
1496                    let enc_reg = int_reg_enc(reg);
1497                    let rex = 0x40 | ((enc_reg >> 3) & 1);
1498                    if rex != 0x40 {
1499                        sink.put1(rex);
1500                    }
1501                    sink.put1(0x50 | (enc_reg & 7));
1502                }
1503
1504                RegMemImm::Mem { addr } => {
1505                    let addr = &addr.finalize(state.frame_layout(), sink);
1506                    emit_std_enc_mem(
1507                        sink,
1508                        LegacyPrefixes::None,
1509                        0xFF,
1510                        1,
1511                        6, /*subopcode*/
1512                        addr,
1513                        RexFlags::clear_w(),
1514                        0,
1515                    );
1516                }
1517
1518                RegMemImm::Imm { simm32 } => {
1519                    if low8_will_sign_extend_to_64(simm32) {
1520                        sink.put1(0x6A);
1521                        sink.put1(simm32 as u8);
1522                    } else {
1523                        sink.put1(0x68);
1524                        sink.put4(simm32);
1525                    }
1526                }
1527            }
1528        }
1529
1530        Inst::Pop64 { dst } => {
1531            let dst = dst.to_reg().to_reg();
1532            let enc_dst = int_reg_enc(dst);
1533            if enc_dst >= 8 {
1534                // 0x41 == REX.{W=0, B=1}.  It seems that REX.W is irrelevant here.
1535                sink.put1(0x41);
1536            }
1537            sink.put1(0x58 + (enc_dst & 7));
1538        }
1539
1540        Inst::StackProbeLoop {
1541            tmp,
1542            frame_size,
1543            guard_size,
1544        } => {
1545            assert!(info.flags.enable_probestack());
1546            assert!(guard_size.is_power_of_two());
1547
1548            let tmp = *tmp;
1549
1550            // Number of probes that we need to perform
1551            let probe_count = align_to(*frame_size, *guard_size) / guard_size;
1552
1553            // The inline stack probe loop has 3 phases:
1554            //
1555            // We generate the "guard area" register which is essentially the frame_size aligned to
1556            // guard_size. We copy the stack pointer and subtract the guard area from it. This
1557            // gets us a register that we can use to compare when looping.
1558            //
1559            // After that we emit the loop. Essentially we just adjust the stack pointer one guard_size'd
1560            // distance at a time and then touch the stack by writing anything to it. We use the previously
1561            // created "guard area" register to know when to stop looping.
1562            //
1563            // When we have touched all the pages that we need, we have to restore the stack pointer
1564            // to where it was before.
1565            //
1566            // Generate the following code:
1567            //         mov  tmp_reg, rsp
1568            //         sub  tmp_reg, guard_size * probe_count
1569            // .loop_start:
1570            //         sub  rsp, guard_size
1571            //         mov  [rsp], rsp
1572            //         cmp  rsp, tmp_reg
1573            //         jne  .loop_start
1574            //         add  rsp, guard_size * probe_count
1575
1576            // Create the guard bound register
1577            // mov  tmp_reg, rsp
1578            let inst = Inst::gen_move(tmp, regs::rsp(), types::I64);
1579            inst.emit(sink, info, state);
1580
1581            // sub  tmp_reg, GUARD_SIZE * probe_count
1582            let inst = Inst::alu_rmi_r(
1583                OperandSize::Size64,
1584                AluRmiROpcode::Sub,
1585                RegMemImm::imm(guard_size * probe_count),
1586                tmp,
1587            );
1588            inst.emit(sink, info, state);
1589
1590            // Emit the main loop!
1591            let loop_start = sink.get_label();
1592            sink.bind_label(loop_start, state.ctrl_plane_mut());
1593
1594            // sub  rsp, GUARD_SIZE
1595            let inst = Inst::alu_rmi_r(
1596                OperandSize::Size64,
1597                AluRmiROpcode::Sub,
1598                RegMemImm::imm(*guard_size),
1599                Writable::from_reg(regs::rsp()),
1600            );
1601            inst.emit(sink, info, state);
1602
1603            // TODO: `mov [rsp], 0` would be better, but we don't have that instruction
1604            // Probe the stack! We don't use Inst::gen_store_stack here because we need a predictable
1605            // instruction size.
1606            // mov  [rsp], rsp
1607            let inst = Inst::mov_r_m(
1608                OperandSize::Size32, // Use Size32 since it saves us one byte
1609                regs::rsp(),
1610                SyntheticAmode::Real(Amode::imm_reg(0, regs::rsp())),
1611            );
1612            inst.emit(sink, info, state);
1613
1614            // Compare and jump if we are not done yet
1615            // cmp  rsp, tmp_reg
1616            let inst = Inst::cmp_rmi_r(
1617                OperandSize::Size64,
1618                tmp.to_reg(),
1619                RegMemImm::reg(regs::rsp()),
1620            );
1621            inst.emit(sink, info, state);
1622
1623            // jne  .loop_start
1624            // TODO: Encoding the conditional jump as a short jump
1625            // could save us 4 bytes here.
1626            one_way_jmp(sink, CC::NZ, loop_start);
1627
1628            // The regular prologue code is going to emit a `sub` after this, so we need to
1629            // reset the stack pointer
1630            //
1631            // TODO: It would be better if we could avoid the `add` + `sub` that is generated here
1632            // and in the stack adj portion of the prologue
1633            //
1634            // add rsp, GUARD_SIZE * probe_count
1635            let inst = Inst::alu_rmi_r(
1636                OperandSize::Size64,
1637                AluRmiROpcode::Add,
1638                RegMemImm::imm(guard_size * probe_count),
1639                Writable::from_reg(regs::rsp()),
1640            );
1641            inst.emit(sink, info, state);
1642        }
1643
1644        Inst::CallKnown { info: call_info } => {
1645            if let Some(s) = state.take_stack_map() {
1646                let offset = sink.cur_offset() + 5;
1647                sink.push_user_stack_map(state, offset, s);
1648            }
1649
1650            sink.put1(0xE8);
1651            // The addend adjusts for the difference between the end of the instruction and the
1652            // beginning of the immediate field.
1653            emit_reloc(sink, Reloc::X86CallPCRel4, &call_info.dest, -4);
1654            sink.put4(0);
1655            sink.add_call_site();
1656
1657            // Reclaim the outgoing argument area that was released by the callee, to ensure that
1658            // StackAMode values are always computed from a consistent SP.
1659            if call_info.callee_pop_size > 0 {
1660                Inst::alu_rmi_r(
1661                    OperandSize::Size64,
1662                    AluRmiROpcode::Sub,
1663                    RegMemImm::imm(call_info.callee_pop_size),
1664                    Writable::from_reg(regs::rsp()),
1665                )
1666                .emit(sink, info, state);
1667            }
1668        }
1669
1670        Inst::ReturnCallKnown { info: call_info } => {
1671            emit_return_call_common_sequence(sink, info, state, &call_info);
1672
1673            // Finally, jump to the callee!
1674            //
1675            // Note: this is not `Inst::Jmp { .. }.emit(..)` because we have
1676            // different metadata in this case: we don't have a label for the
1677            // target, but rather a function relocation.
1678            sink.put1(0xE9);
1679            // The addend adjusts for the difference between the end of the instruction and the
1680            // beginning of the immediate field.
1681            emit_reloc(sink, Reloc::X86CallPCRel4, &call_info.dest, -4);
1682            sink.put4(0);
1683            sink.add_call_site();
1684        }
1685
1686        Inst::ReturnCallUnknown { info: call_info } => {
1687            let callee = call_info.dest;
1688
1689            emit_return_call_common_sequence(sink, info, state, &call_info);
1690
1691            Inst::JmpUnknown {
1692                target: RegMem::reg(callee),
1693            }
1694            .emit(sink, info, state);
1695            sink.add_call_site();
1696        }
1697
1698        Inst::CallUnknown {
1699            info: call_info, ..
1700        } => {
1701            let dest = call_info.dest.clone();
1702
1703            match dest {
1704                RegMem::Reg { reg } => {
1705                    let reg_enc = int_reg_enc(reg);
1706                    emit_std_enc_enc(
1707                        sink,
1708                        LegacyPrefixes::None,
1709                        0xFF,
1710                        1,
1711                        2, /*subopcode*/
1712                        reg_enc,
1713                        RexFlags::clear_w(),
1714                    );
1715                }
1716
1717                RegMem::Mem { addr } => {
1718                    let addr = &addr.finalize(state.frame_layout(), sink);
1719                    emit_std_enc_mem(
1720                        sink,
1721                        LegacyPrefixes::None,
1722                        0xFF,
1723                        1,
1724                        2, /*subopcode*/
1725                        addr,
1726                        RexFlags::clear_w(),
1727                        0,
1728                    );
1729                }
1730            }
1731
1732            if let Some(s) = state.take_stack_map() {
1733                let offset = sink.cur_offset();
1734                sink.push_user_stack_map(state, offset, s);
1735            }
1736
1737            sink.add_call_site();
1738
1739            // Reclaim the outgoing argument area that was released by the callee, to ensure that
1740            // StackAMode values are always computed from a consistent SP.
1741            if call_info.callee_pop_size > 0 {
1742                Inst::alu_rmi_r(
1743                    OperandSize::Size64,
1744                    AluRmiROpcode::Sub,
1745                    RegMemImm::imm(call_info.callee_pop_size),
1746                    Writable::from_reg(regs::rsp()),
1747                )
1748                .emit(sink, info, state);
1749            }
1750        }
1751
1752        Inst::Args { .. } => {}
1753        Inst::Rets { .. } => {}
1754
1755        Inst::Ret {
1756            stack_bytes_to_pop: 0,
1757        } => sink.put1(0xC3),
1758
1759        Inst::Ret { stack_bytes_to_pop } => {
1760            sink.put1(0xC2);
1761            sink.put2(u16::try_from(*stack_bytes_to_pop).unwrap());
1762        }
1763
1764        Inst::StackSwitchBasic {
1765            store_context_ptr,
1766            load_context_ptr,
1767            in_payload0,
1768            out_payload0,
1769        } => {
1770            // Note that we do not emit anything for preserving and restoring
1771            // ordinary registers here: That's taken care of by regalloc for us,
1772            // since we marked this instruction as clobbering all registers.
1773            //
1774            // Also note that we do nothing about passing the single payload
1775            // value: We've informed regalloc that it is sent and received via
1776            // the fixed register given by [stack_switch::payload_register]
1777
1778            let (tmp1, tmp2) = {
1779                // Ideally we would just ask regalloc for two temporary registers.
1780                // However, adding any early defs to the constraints on StackSwitch
1781                // causes TooManyLiveRegs. Fortunately, we can manually find tmp
1782                // registers without regalloc: Since our instruction clobbers all
1783                // registers, we can simply pick any register that is not assigned
1784                // to the operands.
1785
1786                let all = crate::isa::x64::abi::ALL_CLOBBERS;
1787
1788                let used_regs = [
1789                    **load_context_ptr,
1790                    **store_context_ptr,
1791                    **in_payload0,
1792                    *out_payload0.to_reg(),
1793                ];
1794
1795                let mut tmps = all.into_iter().filter_map(|preg| {
1796                    let reg: Reg = preg.into();
1797                    if !used_regs.contains(&reg) {
1798                        WritableGpr::from_writable_reg(isle::WritableReg::from_reg(reg))
1799                    } else {
1800                        None
1801                    }
1802                });
1803                (tmps.next().unwrap(), tmps.next().unwrap())
1804            };
1805
1806            let layout = stack_switch::control_context_layout();
1807            let rsp_offset = layout.stack_pointer_offset as i32;
1808            let pc_offset = layout.ip_offset as i32;
1809            let rbp_offset = layout.frame_pointer_offset as i32;
1810
1811            // Location to which someone switching back to this stack will jump:
1812            // right behind the `StackSwitch` instruction.
1813            let resume = sink.get_label();
1814
1815            //
1816            // For RBP and RSP we do the following:
1817            // - Load new value for register from `load_context_ptr` +
1818            // corresponding offset.
1819            // - Store previous (!) value of register at `store_context_ptr` +
1820            // corresponding offset.
1821            //
1822            // Since `load_context_ptr` and `store_context_ptr` are allowed to be
1823            // equal, we need to use a temporary register here.
1824            //
1825
1826            let mut exchange = |offset, reg| {
1827                let inst = Inst::Mov64MR {
1828                    src: Amode::imm_reg(offset, **load_context_ptr).into(),
1829                    dst: tmp1,
1830                };
1831                emit(&inst, sink, info, state);
1832
1833                let inst = Inst::MovRM {
1834                    size: OperandSize::Size64,
1835                    src: Gpr::new(reg).unwrap(),
1836                    dst: Amode::imm_reg(offset, **store_context_ptr).into(),
1837                };
1838                emit(&inst, sink, info, state);
1839
1840                let dst = Writable::from_reg(reg.into());
1841                let inst = Inst::MovRR {
1842                    size: OperandSize::Size64,
1843                    src: tmp1.to_reg(),
1844                    dst: WritableGpr::from_writable_reg(dst.into()).unwrap(),
1845                };
1846                emit(&inst, sink, info, state);
1847            };
1848
1849            exchange(rsp_offset, regs::rsp());
1850            exchange(rbp_offset, regs::rbp());
1851
1852            //
1853            // Load target PC, store resume PC, jump to target PC
1854            //
1855
1856            let inst = Inst::Mov64MR {
1857                src: Amode::imm_reg(pc_offset, **load_context_ptr).into(),
1858                dst: tmp1,
1859            };
1860            emit(&inst, sink, info, state);
1861
1862            let amode = Amode::RipRelative { target: resume };
1863            let inst = Inst::lea(amode, tmp2.map(Reg::from));
1864            inst.emit(sink, info, state);
1865
1866            let inst = Inst::MovRM {
1867                size: OperandSize::Size64,
1868                src: tmp2.to_reg(),
1869                dst: Amode::imm_reg(pc_offset, **store_context_ptr).into(),
1870            };
1871            emit(&inst, sink, info, state);
1872
1873            let inst = Inst::JmpUnknown {
1874                target: RegMem::reg(tmp1.to_reg().into()),
1875            };
1876            emit(&inst, sink, info, state);
1877
1878            sink.bind_label(resume, state.ctrl_plane_mut());
1879        }
1880
1881        Inst::JmpKnown { dst } => {
1882            let br_start = sink.cur_offset();
1883            let br_disp_off = br_start + 1;
1884            let br_end = br_start + 5;
1885
1886            sink.use_label_at_offset(br_disp_off, *dst, LabelUse::JmpRel32);
1887            sink.add_uncond_branch(br_start, br_end, *dst);
1888
1889            sink.put1(0xE9);
1890            // Placeholder for the label value.
1891            sink.put4(0x0);
1892        }
1893
1894        Inst::WinchJmpIf { cc, taken } => {
1895            let cond_start = sink.cur_offset();
1896            let cond_disp_off = cond_start + 2;
1897
1898            sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
1899            // Since this is not a terminator, don't enroll in the branch inversion mechanism.
1900
1901            sink.put1(0x0F);
1902            sink.put1(0x80 + cc.get_enc());
1903            // Placeholder for the label value.
1904            sink.put4(0x0);
1905        }
1906
1907        Inst::JmpCond {
1908            cc,
1909            taken,
1910            not_taken,
1911        } => {
1912            // If taken.
1913            let cond_start = sink.cur_offset();
1914            let cond_disp_off = cond_start + 2;
1915            let cond_end = cond_start + 6;
1916
1917            sink.use_label_at_offset(cond_disp_off, *taken, LabelUse::JmpRel32);
1918            let inverted: [u8; 6] = [0x0F, 0x80 + (cc.invert().get_enc()), 0x00, 0x00, 0x00, 0x00];
1919            sink.add_cond_branch(cond_start, cond_end, *taken, &inverted[..]);
1920
1921            sink.put1(0x0F);
1922            sink.put1(0x80 + cc.get_enc());
1923            // Placeholder for the label value.
1924            sink.put4(0x0);
1925
1926            // If not taken.
1927            let uncond_start = sink.cur_offset();
1928            let uncond_disp_off = uncond_start + 1;
1929            let uncond_end = uncond_start + 5;
1930
1931            sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
1932            sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
1933
1934            sink.put1(0xE9);
1935            // Placeholder for the label value.
1936            sink.put4(0x0);
1937        }
1938
1939        Inst::JmpCondOr {
1940            cc1,
1941            cc2,
1942            taken,
1943            not_taken,
1944        } => {
1945            // Emit:
1946            //   jcc1 taken
1947            //   jcc2 taken
1948            //   jmp not_taken
1949            //
1950            // Note that we enroll both conditionals in the
1951            // branch-chomping mechanism because MachBuffer
1952            // simplification can continue upward as long as it keeps
1953            // chomping branches. In the best case, if taken ==
1954            // not_taken and that one block is the fallthrough block,
1955            // all three branches can disappear.
1956
1957            // jcc1 taken
1958            let cond_1_start = sink.cur_offset();
1959            let cond_1_disp_off = cond_1_start + 2;
1960            let cond_1_end = cond_1_start + 6;
1961
1962            sink.use_label_at_offset(cond_1_disp_off, *taken, LabelUse::JmpRel32);
1963            let inverted: [u8; 6] = [
1964                0x0F,
1965                0x80 + (cc1.invert().get_enc()),
1966                0x00,
1967                0x00,
1968                0x00,
1969                0x00,
1970            ];
1971            sink.add_cond_branch(cond_1_start, cond_1_end, *taken, &inverted[..]);
1972
1973            sink.put1(0x0F);
1974            sink.put1(0x80 + cc1.get_enc());
1975            sink.put4(0x0);
1976
1977            // jcc2 taken
1978            let cond_2_start = sink.cur_offset();
1979            let cond_2_disp_off = cond_2_start + 2;
1980            let cond_2_end = cond_2_start + 6;
1981
1982            sink.use_label_at_offset(cond_2_disp_off, *taken, LabelUse::JmpRel32);
1983            let inverted: [u8; 6] = [
1984                0x0F,
1985                0x80 + (cc2.invert().get_enc()),
1986                0x00,
1987                0x00,
1988                0x00,
1989                0x00,
1990            ];
1991            sink.add_cond_branch(cond_2_start, cond_2_end, *taken, &inverted[..]);
1992
1993            sink.put1(0x0F);
1994            sink.put1(0x80 + cc2.get_enc());
1995            sink.put4(0x0);
1996
1997            // jmp not_taken
1998            let uncond_start = sink.cur_offset();
1999            let uncond_disp_off = uncond_start + 1;
2000            let uncond_end = uncond_start + 5;
2001
2002            sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32);
2003            sink.add_uncond_branch(uncond_start, uncond_end, *not_taken);
2004
2005            sink.put1(0xE9);
2006            sink.put4(0x0);
2007        }
2008
2009        Inst::JmpUnknown { target } => {
2010            let target = target.clone();
2011
2012            match target {
2013                RegMem::Reg { reg } => {
2014                    let reg_enc = int_reg_enc(reg);
2015                    emit_std_enc_enc(
2016                        sink,
2017                        LegacyPrefixes::None,
2018                        0xFF,
2019                        1,
2020                        4, /*subopcode*/
2021                        reg_enc,
2022                        RexFlags::clear_w(),
2023                    );
2024                }
2025
2026                RegMem::Mem { addr } => {
2027                    let addr = &addr.finalize(state.frame_layout(), sink);
2028                    emit_std_enc_mem(
2029                        sink,
2030                        LegacyPrefixes::None,
2031                        0xFF,
2032                        1,
2033                        4, /*subopcode*/
2034                        addr,
2035                        RexFlags::clear_w(),
2036                        0,
2037                    );
2038                }
2039            }
2040        }
2041
2042        &Inst::JmpTableSeq {
2043            idx,
2044            tmp1,
2045            tmp2,
2046            ref targets,
2047            ref default_target,
2048            ..
2049        } => {
2050            // This sequence is *one* instruction in the vcode, and is expanded only here at
2051            // emission time, because we cannot allow the regalloc to insert spills/reloads in
2052            // the middle; we depend on hardcoded PC-rel addressing below.
2053            //
2054            // We don't have to worry about emitting islands, because the only label-use type has a
2055            // maximum range of 2 GB. If we later consider using shorter-range label references,
2056            // this will need to be revisited.
2057
2058            // We generate the following sequence. Note that the only read of %idx is before the
2059            // write to %tmp2, so regalloc may use the same register for both; fix x64/inst/mod.rs
2060            // if you change this.
2061            // lea start_of_jump_table_offset(%rip), %tmp1
2062            // movslq [%tmp1, %idx, 4], %tmp2 ;; shift of 2, viz. multiply index by 4
2063            // addq %tmp2, %tmp1
2064            // j *%tmp1
2065            // $start_of_jump_table:
2066            // -- jump table entries
2067
2068            // Load base address of jump table.
2069            let start_of_jumptable = sink.get_label();
2070            let inst = Inst::lea(Amode::rip_relative(start_of_jumptable), tmp1);
2071            inst.emit(sink, info, state);
2072
2073            // Load value out of the jump table. It's a relative offset to the target block, so it
2074            // might be negative; use a sign-extension.
2075            let inst = Inst::movsx_rm_r(
2076                ExtMode::LQ,
2077                RegMem::mem(Amode::imm_reg_reg_shift(
2078                    0,
2079                    Gpr::unwrap_new(tmp1.to_reg()),
2080                    Gpr::unwrap_new(idx),
2081                    2,
2082                )),
2083                tmp2,
2084            );
2085            inst.emit(sink, info, state);
2086
2087            // Add base of jump table to jump-table-sourced block offset.
2088            let inst = Inst::alu_rmi_r(
2089                OperandSize::Size64,
2090                AluRmiROpcode::Add,
2091                RegMemImm::reg(tmp2.to_reg()),
2092                tmp1,
2093            );
2094            inst.emit(sink, info, state);
2095
2096            // Branch to computed address.
2097            let inst = Inst::jmp_unknown(RegMem::reg(tmp1.to_reg()));
2098            inst.emit(sink, info, state);
2099
2100            // Emit jump table (table of 32-bit offsets).
2101            sink.bind_label(start_of_jumptable, state.ctrl_plane_mut());
2102            let jt_off = sink.cur_offset();
2103            for &target in targets.iter().chain(std::iter::once(default_target)) {
2104                let word_off = sink.cur_offset();
2105                // off_into_table is an addend here embedded in the label to be later patched at
2106                // the end of codegen. The offset is initially relative to this jump table entry;
2107                // with the extra addend, it'll be relative to the jump table's start, after
2108                // patching.
2109                let off_into_table = word_off - jt_off;
2110                sink.use_label_at_offset(word_off, target, LabelUse::PCRel32);
2111                sink.put4(off_into_table);
2112            }
2113        }
2114
2115        Inst::TrapIf { cc, trap_code } => {
2116            let trap_label = sink.defer_trap(*trap_code);
2117            one_way_jmp(sink, *cc, trap_label);
2118        }
2119
2120        Inst::TrapIfAnd {
2121            cc1,
2122            cc2,
2123            trap_code,
2124        } => {
2125            let trap_label = sink.defer_trap(*trap_code);
2126            let else_label = sink.get_label();
2127
2128            // Jump to the end if the first condition isn't true, and then if
2129            // the second condition is true go to the trap.
2130            one_way_jmp(sink, cc1.invert(), else_label);
2131            one_way_jmp(sink, *cc2, trap_label);
2132
2133            sink.bind_label(else_label, state.ctrl_plane_mut());
2134        }
2135
2136        Inst::TrapIfOr {
2137            cc1,
2138            cc2,
2139            trap_code,
2140        } => {
2141            let trap_label = sink.defer_trap(*trap_code);
2142
2143            // Emit two jumps to the same trap if either condition code is true.
2144            one_way_jmp(sink, *cc1, trap_label);
2145            one_way_jmp(sink, *cc2, trap_label);
2146        }
2147
2148        Inst::XmmUnaryRmR { op, src, dst } => {
2149            emit(
2150                &Inst::XmmUnaryRmRUnaligned {
2151                    op: *op,
2152                    src: XmmMem::unwrap_new(src.clone().into()),
2153                    dst: *dst,
2154                },
2155                sink,
2156                info,
2157                state,
2158            );
2159        }
2160
2161        Inst::XmmUnaryRmRUnaligned {
2162            op,
2163            src: src_e,
2164            dst: reg_g,
2165        } => {
2166            let reg_g = reg_g.to_reg().to_reg();
2167            let src_e = src_e.clone().to_reg_mem().clone();
2168
2169            let rex = RexFlags::clear_w();
2170
2171            let (prefix, opcode, num_opcodes) = match op {
2172                SseOpcode::Cvtdq2pd => (LegacyPrefixes::_F3, 0x0FE6, 2),
2173                SseOpcode::Cvtpd2ps => (LegacyPrefixes::_66, 0x0F5A, 2),
2174                SseOpcode::Cvtps2pd => (LegacyPrefixes::None, 0x0F5A, 2),
2175                SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
2176                SseOpcode::Cvttpd2dq => (LegacyPrefixes::_66, 0x0FE6, 2),
2177                SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
2178                SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F28, 2),
2179                SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F28, 2),
2180                SseOpcode::Movdqa => (LegacyPrefixes::_66, 0x0F6F, 2),
2181                SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F6F, 2),
2182                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
2183                SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
2184                SseOpcode::Movups => (LegacyPrefixes::None, 0x0F10, 2),
2185                SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F10, 2),
2186                SseOpcode::Pabsb => (LegacyPrefixes::_66, 0x0F381C, 3),
2187                SseOpcode::Pabsw => (LegacyPrefixes::_66, 0x0F381D, 3),
2188                SseOpcode::Pabsd => (LegacyPrefixes::_66, 0x0F381E, 3),
2189                SseOpcode::Pmovsxbd => (LegacyPrefixes::_66, 0x0F3821, 3),
2190                SseOpcode::Pmovsxbw => (LegacyPrefixes::_66, 0x0F3820, 3),
2191                SseOpcode::Pmovsxbq => (LegacyPrefixes::_66, 0x0F3822, 3),
2192                SseOpcode::Pmovsxwd => (LegacyPrefixes::_66, 0x0F3823, 3),
2193                SseOpcode::Pmovsxwq => (LegacyPrefixes::_66, 0x0F3824, 3),
2194                SseOpcode::Pmovsxdq => (LegacyPrefixes::_66, 0x0F3825, 3),
2195                SseOpcode::Pmovzxbd => (LegacyPrefixes::_66, 0x0F3831, 3),
2196                SseOpcode::Pmovzxbw => (LegacyPrefixes::_66, 0x0F3830, 3),
2197                SseOpcode::Pmovzxbq => (LegacyPrefixes::_66, 0x0F3832, 3),
2198                SseOpcode::Pmovzxwd => (LegacyPrefixes::_66, 0x0F3833, 3),
2199                SseOpcode::Pmovzxwq => (LegacyPrefixes::_66, 0x0F3834, 3),
2200                SseOpcode::Pmovzxdq => (LegacyPrefixes::_66, 0x0F3835, 3),
2201                SseOpcode::Sqrtps => (LegacyPrefixes::None, 0x0F51, 2),
2202                SseOpcode::Sqrtpd => (LegacyPrefixes::_66, 0x0F51, 2),
2203                SseOpcode::Movddup => (LegacyPrefixes::_F2, 0x0F12, 2),
2204                _ => unimplemented!("Opcode {:?} not implemented", op),
2205            };
2206
2207            match src_e {
2208                RegMem::Reg { reg: reg_e } => {
2209                    emit_std_reg_reg(sink, prefix, opcode, num_opcodes, reg_g, reg_e, rex);
2210                }
2211                RegMem::Mem { addr } => {
2212                    let addr = &addr.finalize(state.frame_layout(), sink);
2213                    emit_std_reg_mem(sink, prefix, opcode, num_opcodes, reg_g, addr, rex, 0);
2214                }
2215            };
2216        }
2217
2218        Inst::XmmUnaryRmRImm { op, src, dst, imm } => {
2219            let dst = dst.to_reg().to_reg();
2220            let src = src.clone().to_reg_mem().clone();
2221            let rex = RexFlags::clear_w();
2222
2223            let (prefix, opcode, len) = match op {
2224                SseOpcode::Roundps => (LegacyPrefixes::_66, 0x0F3A08, 3),
2225                SseOpcode::Roundss => (LegacyPrefixes::_66, 0x0F3A0A, 3),
2226                SseOpcode::Roundpd => (LegacyPrefixes::_66, 0x0F3A09, 3),
2227                SseOpcode::Roundsd => (LegacyPrefixes::_66, 0x0F3A0B, 3),
2228                SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2),
2229                SseOpcode::Pshuflw => (LegacyPrefixes::_F2, 0x0F70, 2),
2230                SseOpcode::Pshufhw => (LegacyPrefixes::_F3, 0x0F70, 2),
2231                _ => unimplemented!("Opcode {:?} not implemented", op),
2232            };
2233            match src {
2234                RegMem::Reg { reg } => {
2235                    emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
2236                }
2237                RegMem::Mem { addr } => {
2238                    let addr = &addr.finalize(state.frame_layout(), sink);
2239                    // N.B.: bytes_at_end == 1, because of the `imm` byte below.
2240                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
2241                }
2242            }
2243            sink.put1(*imm);
2244        }
2245
2246        Inst::XmmUnaryRmREvex { op, src, dst } => {
2247            let dst = dst.to_reg().to_reg();
2248            let src = match src.clone().to_reg_mem().clone() {
2249                RegMem::Reg { reg } => {
2250                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2251                }
2252                RegMem::Mem { addr } => {
2253                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2254                }
2255            };
2256
2257            let (prefix, map, w, opcode) = match op {
2258                Avx512Opcode::Vcvtudq2ps => (LegacyPrefixes::_F2, OpcodeMap::_0F, false, 0x7a),
2259                Avx512Opcode::Vpabsq => (LegacyPrefixes::_66, OpcodeMap::_0F38, true, 0x1f),
2260                Avx512Opcode::Vpopcntb => (LegacyPrefixes::_66, OpcodeMap::_0F38, false, 0x54),
2261                _ => unimplemented!("Opcode {:?} not implemented", op),
2262            };
2263            EvexInstruction::new()
2264                .length(EvexVectorLength::V128)
2265                .prefix(prefix)
2266                .map(map)
2267                .w(w)
2268                .opcode(opcode)
2269                .tuple_type(op.tuple_type())
2270                .reg(dst.to_real_reg().unwrap().hw_enc())
2271                .rm(src)
2272                .encode(sink);
2273        }
2274
2275        Inst::XmmUnaryRmRImmEvex { op, src, dst, imm } => {
2276            let dst = dst.to_reg().to_reg();
2277            let src = match src.clone().to_reg_mem().clone() {
2278                RegMem::Reg { reg } => {
2279                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2280                }
2281                RegMem::Mem { addr } => {
2282                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2283                }
2284            };
2285
2286            let (opcode, opcode_ext, w) = match op {
2287                Avx512Opcode::VpsraqImm => (0x72, 4, true),
2288                _ => unimplemented!("Opcode {:?} not implemented", op),
2289            };
2290            EvexInstruction::new()
2291                .length(EvexVectorLength::V128)
2292                .prefix(LegacyPrefixes::_66)
2293                .map(OpcodeMap::_0F)
2294                .w(w)
2295                .opcode(opcode)
2296                .reg(opcode_ext)
2297                .vvvvv(dst.to_real_reg().unwrap().hw_enc())
2298                .tuple_type(op.tuple_type())
2299                .rm(src)
2300                .imm(*imm)
2301                .encode(sink);
2302        }
2303
2304        Inst::XmmRmR {
2305            op,
2306            src1,
2307            src2,
2308            dst,
2309        } => emit(
2310            &Inst::XmmRmRUnaligned {
2311                op: *op,
2312                dst: *dst,
2313                src1: *src1,
2314                src2: XmmMem::unwrap_new(src2.clone().to_reg_mem()),
2315            },
2316            sink,
2317            info,
2318            state,
2319        ),
2320
2321        Inst::XmmRmRUnaligned {
2322            op,
2323            src1,
2324            src2: src_e,
2325            dst: reg_g,
2326        } => {
2327            let src1 = src1.to_reg();
2328            let reg_g = reg_g.to_reg().to_reg();
2329            let src_e = src_e.clone().to_reg_mem().clone();
2330            debug_assert_eq!(src1, reg_g);
2331
2332            let rex = RexFlags::clear_w();
2333            let (prefix, opcode, length) = match op {
2334                SseOpcode::Addps => (LegacyPrefixes::None, 0x0F58, 2),
2335                SseOpcode::Addpd => (LegacyPrefixes::_66, 0x0F58, 2),
2336                SseOpcode::Addss => (LegacyPrefixes::_F3, 0x0F58, 2),
2337                SseOpcode::Addsd => (LegacyPrefixes::_F2, 0x0F58, 2),
2338                SseOpcode::Andps => (LegacyPrefixes::None, 0x0F54, 2),
2339                SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
2340                SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
2341                SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
2342                SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
2343                SseOpcode::Divpd => (LegacyPrefixes::_66, 0x0F5E, 2),
2344                SseOpcode::Divss => (LegacyPrefixes::_F3, 0x0F5E, 2),
2345                SseOpcode::Divsd => (LegacyPrefixes::_F2, 0x0F5E, 2),
2346                SseOpcode::Maxps => (LegacyPrefixes::None, 0x0F5F, 2),
2347                SseOpcode::Maxpd => (LegacyPrefixes::_66, 0x0F5F, 2),
2348                SseOpcode::Maxss => (LegacyPrefixes::_F3, 0x0F5F, 2),
2349                SseOpcode::Maxsd => (LegacyPrefixes::_F2, 0x0F5F, 2),
2350                SseOpcode::Minps => (LegacyPrefixes::None, 0x0F5D, 2),
2351                SseOpcode::Minpd => (LegacyPrefixes::_66, 0x0F5D, 2),
2352                SseOpcode::Minss => (LegacyPrefixes::_F3, 0x0F5D, 2),
2353                SseOpcode::Minsd => (LegacyPrefixes::_F2, 0x0F5D, 2),
2354                SseOpcode::Movlhps => (LegacyPrefixes::None, 0x0F16, 2),
2355                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F10, 2),
2356                SseOpcode::Mulps => (LegacyPrefixes::None, 0x0F59, 2),
2357                SseOpcode::Mulpd => (LegacyPrefixes::_66, 0x0F59, 2),
2358                SseOpcode::Mulss => (LegacyPrefixes::_F3, 0x0F59, 2),
2359                SseOpcode::Mulsd => (LegacyPrefixes::_F2, 0x0F59, 2),
2360                SseOpcode::Orpd => (LegacyPrefixes::_66, 0x0F56, 2),
2361                SseOpcode::Orps => (LegacyPrefixes::None, 0x0F56, 2),
2362                SseOpcode::Packssdw => (LegacyPrefixes::_66, 0x0F6B, 2),
2363                SseOpcode::Packsswb => (LegacyPrefixes::_66, 0x0F63, 2),
2364                SseOpcode::Packusdw => (LegacyPrefixes::_66, 0x0F382B, 3),
2365                SseOpcode::Packuswb => (LegacyPrefixes::_66, 0x0F67, 2),
2366                SseOpcode::Paddb => (LegacyPrefixes::_66, 0x0FFC, 2),
2367                SseOpcode::Paddd => (LegacyPrefixes::_66, 0x0FFE, 2),
2368                SseOpcode::Paddq => (LegacyPrefixes::_66, 0x0FD4, 2),
2369                SseOpcode::Paddw => (LegacyPrefixes::_66, 0x0FFD, 2),
2370                SseOpcode::Paddsb => (LegacyPrefixes::_66, 0x0FEC, 2),
2371                SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2),
2372                SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2),
2373                SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2),
2374                SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3),
2375                SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2),
2376                SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2),
2377                SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2),
2378                SseOpcode::Pavgw => (LegacyPrefixes::_66, 0x0FE3, 2),
2379                SseOpcode::Pcmpeqb => (LegacyPrefixes::_66, 0x0F74, 2),
2380                SseOpcode::Pcmpeqw => (LegacyPrefixes::_66, 0x0F75, 2),
2381                SseOpcode::Pcmpeqd => (LegacyPrefixes::_66, 0x0F76, 2),
2382                SseOpcode::Pcmpeqq => (LegacyPrefixes::_66, 0x0F3829, 3),
2383                SseOpcode::Pcmpgtb => (LegacyPrefixes::_66, 0x0F64, 2),
2384                SseOpcode::Pcmpgtw => (LegacyPrefixes::_66, 0x0F65, 2),
2385                SseOpcode::Pcmpgtd => (LegacyPrefixes::_66, 0x0F66, 2),
2386                SseOpcode::Pcmpgtq => (LegacyPrefixes::_66, 0x0F3837, 3),
2387                SseOpcode::Pmaddwd => (LegacyPrefixes::_66, 0x0FF5, 2),
2388                SseOpcode::Pmaxsb => (LegacyPrefixes::_66, 0x0F383C, 3),
2389                SseOpcode::Pmaxsw => (LegacyPrefixes::_66, 0x0FEE, 2),
2390                SseOpcode::Pmaxsd => (LegacyPrefixes::_66, 0x0F383D, 3),
2391                SseOpcode::Pmaxub => (LegacyPrefixes::_66, 0x0FDE, 2),
2392                SseOpcode::Pmaxuw => (LegacyPrefixes::_66, 0x0F383E, 3),
2393                SseOpcode::Pmaxud => (LegacyPrefixes::_66, 0x0F383F, 3),
2394                SseOpcode::Pminsb => (LegacyPrefixes::_66, 0x0F3838, 3),
2395                SseOpcode::Pminsw => (LegacyPrefixes::_66, 0x0FEA, 2),
2396                SseOpcode::Pminsd => (LegacyPrefixes::_66, 0x0F3839, 3),
2397                SseOpcode::Pminub => (LegacyPrefixes::_66, 0x0FDA, 2),
2398                SseOpcode::Pminuw => (LegacyPrefixes::_66, 0x0F383A, 3),
2399                SseOpcode::Pminud => (LegacyPrefixes::_66, 0x0F383B, 3),
2400                SseOpcode::Pmuldq => (LegacyPrefixes::_66, 0x0F3828, 3),
2401                SseOpcode::Pmulhw => (LegacyPrefixes::_66, 0x0FE5, 2),
2402                SseOpcode::Pmulhrsw => (LegacyPrefixes::_66, 0x0F380B, 3),
2403                SseOpcode::Pmulhuw => (LegacyPrefixes::_66, 0x0FE4, 2),
2404                SseOpcode::Pmulld => (LegacyPrefixes::_66, 0x0F3840, 3),
2405                SseOpcode::Pmullw => (LegacyPrefixes::_66, 0x0FD5, 2),
2406                SseOpcode::Pmuludq => (LegacyPrefixes::_66, 0x0FF4, 2),
2407                SseOpcode::Por => (LegacyPrefixes::_66, 0x0FEB, 2),
2408                SseOpcode::Pshufb => (LegacyPrefixes::_66, 0x0F3800, 3),
2409                SseOpcode::Psubb => (LegacyPrefixes::_66, 0x0FF8, 2),
2410                SseOpcode::Psubd => (LegacyPrefixes::_66, 0x0FFA, 2),
2411                SseOpcode::Psubq => (LegacyPrefixes::_66, 0x0FFB, 2),
2412                SseOpcode::Psubw => (LegacyPrefixes::_66, 0x0FF9, 2),
2413                SseOpcode::Psubsb => (LegacyPrefixes::_66, 0x0FE8, 2),
2414                SseOpcode::Psubsw => (LegacyPrefixes::_66, 0x0FE9, 2),
2415                SseOpcode::Psubusb => (LegacyPrefixes::_66, 0x0FD8, 2),
2416                SseOpcode::Psubusw => (LegacyPrefixes::_66, 0x0FD9, 2),
2417                SseOpcode::Punpckhbw => (LegacyPrefixes::_66, 0x0F68, 2),
2418                SseOpcode::Punpckhwd => (LegacyPrefixes::_66, 0x0F69, 2),
2419                SseOpcode::Punpcklbw => (LegacyPrefixes::_66, 0x0F60, 2),
2420                SseOpcode::Punpcklwd => (LegacyPrefixes::_66, 0x0F61, 2),
2421                SseOpcode::Punpckldq => (LegacyPrefixes::_66, 0x0F62, 2),
2422                SseOpcode::Punpcklqdq => (LegacyPrefixes::_66, 0x0F6C, 2),
2423                SseOpcode::Punpckhdq => (LegacyPrefixes::_66, 0x0F6A, 2),
2424                SseOpcode::Punpckhqdq => (LegacyPrefixes::_66, 0x0F6D, 2),
2425                SseOpcode::Pxor => (LegacyPrefixes::_66, 0x0FEF, 2),
2426                SseOpcode::Subps => (LegacyPrefixes::None, 0x0F5C, 2),
2427                SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
2428                SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
2429                SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
2430                SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
2431                SseOpcode::Unpckhps => (LegacyPrefixes::None, 0x0F15, 2),
2432                SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
2433                SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
2434                SseOpcode::Phaddw => (LegacyPrefixes::_66, 0x0F3801, 3),
2435                SseOpcode::Phaddd => (LegacyPrefixes::_66, 0x0F3802, 3),
2436                SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F10, 2),
2437                SseOpcode::Cvtss2sd => (LegacyPrefixes::_F3, 0x0F5A, 2),
2438                SseOpcode::Cvtsd2ss => (LegacyPrefixes::_F2, 0x0F5A, 2),
2439                SseOpcode::Sqrtss => (LegacyPrefixes::_F3, 0x0F51, 2),
2440                SseOpcode::Sqrtsd => (LegacyPrefixes::_F2, 0x0F51, 2),
2441                SseOpcode::Unpcklpd => (LegacyPrefixes::_66, 0x0F14, 2),
2442                _ => unimplemented!("Opcode {:?} not implemented", op),
2443            };
2444
2445            match src_e {
2446                RegMem::Reg { reg: reg_e } => {
2447                    emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
2448                }
2449                RegMem::Mem { addr } => {
2450                    let addr = &addr.finalize(state.frame_layout(), sink);
2451                    emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
2452                }
2453            }
2454        }
2455
2456        Inst::XmmRmRBlend {
2457            op,
2458            src1,
2459            src2,
2460            dst,
2461            mask,
2462        } => {
2463            let src1 = src1.to_reg();
2464            let mask = mask.to_reg();
2465            debug_assert_eq!(mask, regs::xmm0());
2466            let reg_g = dst.to_reg().to_reg();
2467            debug_assert_eq!(src1, reg_g);
2468            let src_e = src2.clone().to_reg_mem().clone();
2469
2470            let rex = RexFlags::clear_w();
2471            let (prefix, opcode, length) = match op {
2472                SseOpcode::Blendvps => (LegacyPrefixes::_66, 0x0F3814, 3),
2473                SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
2474                SseOpcode::Pblendvb => (LegacyPrefixes::_66, 0x0F3810, 3),
2475                _ => unimplemented!("Opcode {:?} not implemented", op),
2476            };
2477
2478            match src_e {
2479                RegMem::Reg { reg: reg_e } => {
2480                    emit_std_reg_reg(sink, prefix, opcode, length, reg_g, reg_e, rex);
2481                }
2482                RegMem::Mem { addr } => {
2483                    let addr = &addr.finalize(state.frame_layout(), sink);
2484                    emit_std_reg_mem(sink, prefix, opcode, length, reg_g, addr, rex, 0);
2485                }
2486            }
2487        }
2488
2489        Inst::XmmRmiRVex {
2490            op,
2491            src1,
2492            src2,
2493            dst,
2494        } => {
2495            use LegacyPrefixes as LP;
2496            use OpcodeMap as OM;
2497
2498            let dst = dst.to_reg().to_reg();
2499            let src1 = src1.to_reg();
2500            let src2 = src2.clone().to_reg_mem_imm().clone();
2501
2502            // When the opcode is commutative, src1 is xmm{0..7}, and src2 is
2503            // xmm{8..15}, then we can swap the operands to save one byte on the
2504            // instruction's encoding.
2505            let (src1, src2) = match (src1, src2) {
2506                (src1, RegMemImm::Reg { reg: src2 })
2507                    if op.is_commutative()
2508                        && src1.to_real_reg().unwrap().hw_enc() < 8
2509                        && src2.to_real_reg().unwrap().hw_enc() >= 8 =>
2510                {
2511                    (src2, RegMemImm::Reg { reg: src1 })
2512                }
2513                (src1, src2) => (src1, src2),
2514            };
2515
2516            let src2 = match src2 {
2517                // For opcodes where one of the operands is an immediate the
2518                // encoding is a bit different, notably the usage of
2519                // `opcode_ext`, so handle that specially here.
2520                RegMemImm::Imm { simm32 } => {
2521                    let (opcode, opcode_ext, prefix) = match op {
2522                        AvxOpcode::Vpsrlw => (0x71, 2, LegacyPrefixes::_66),
2523                        AvxOpcode::Vpsrld => (0x72, 2, LegacyPrefixes::_66),
2524                        AvxOpcode::Vpsrlq => (0x73, 2, LegacyPrefixes::_66),
2525                        AvxOpcode::Vpsllw => (0x71, 6, LegacyPrefixes::_66),
2526                        AvxOpcode::Vpslld => (0x72, 6, LegacyPrefixes::_66),
2527                        AvxOpcode::Vpsllq => (0x73, 6, LegacyPrefixes::_66),
2528                        AvxOpcode::Vpsraw => (0x71, 4, LegacyPrefixes::_66),
2529                        AvxOpcode::Vpsrad => (0x72, 4, LegacyPrefixes::_66),
2530                        _ => panic!("unexpected rmi_r_vex opcode with immediate {op:?}"),
2531                    };
2532                    VexInstruction::new()
2533                        .length(VexVectorLength::V128)
2534                        .prefix(prefix)
2535                        .map(OpcodeMap::_0F)
2536                        .opcode(opcode)
2537                        .opcode_ext(opcode_ext)
2538                        .vvvv(dst.to_real_reg().unwrap().hw_enc())
2539                        .prefix(LegacyPrefixes::_66)
2540                        .rm(src1.to_real_reg().unwrap().hw_enc())
2541                        .imm(simm32.try_into().unwrap())
2542                        .encode(sink);
2543                    return;
2544                }
2545                RegMemImm::Reg { reg } => {
2546                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2547                }
2548                RegMemImm::Mem { addr } => {
2549                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2550                }
2551            };
2552
2553            let (prefix, map, opcode) = match op {
2554                AvxOpcode::Vminps => (LP::None, OM::_0F, 0x5D),
2555                AvxOpcode::Vminpd => (LP::_66, OM::_0F, 0x5D),
2556                AvxOpcode::Vmaxps => (LP::None, OM::_0F, 0x5F),
2557                AvxOpcode::Vmaxpd => (LP::_66, OM::_0F, 0x5F),
2558                AvxOpcode::Vandnps => (LP::None, OM::_0F, 0x55),
2559                AvxOpcode::Vandnpd => (LP::_66, OM::_0F, 0x55),
2560                AvxOpcode::Vpandn => (LP::_66, OM::_0F, 0xDF),
2561                AvxOpcode::Vpsrlw => (LP::_66, OM::_0F, 0xD1),
2562                AvxOpcode::Vpsrld => (LP::_66, OM::_0F, 0xD2),
2563                AvxOpcode::Vpsrlq => (LP::_66, OM::_0F, 0xD3),
2564                AvxOpcode::Vpaddb => (LP::_66, OM::_0F, 0xFC),
2565                AvxOpcode::Vpaddw => (LP::_66, OM::_0F, 0xFD),
2566                AvxOpcode::Vpaddd => (LP::_66, OM::_0F, 0xFE),
2567                AvxOpcode::Vpaddq => (LP::_66, OM::_0F, 0xD4),
2568                AvxOpcode::Vpaddsb => (LP::_66, OM::_0F, 0xEC),
2569                AvxOpcode::Vpaddsw => (LP::_66, OM::_0F, 0xED),
2570                AvxOpcode::Vpaddusb => (LP::_66, OM::_0F, 0xDC),
2571                AvxOpcode::Vpaddusw => (LP::_66, OM::_0F, 0xDD),
2572                AvxOpcode::Vpsubb => (LP::_66, OM::_0F, 0xF8),
2573                AvxOpcode::Vpsubw => (LP::_66, OM::_0F, 0xF9),
2574                AvxOpcode::Vpsubd => (LP::_66, OM::_0F, 0xFA),
2575                AvxOpcode::Vpsubq => (LP::_66, OM::_0F, 0xFB),
2576                AvxOpcode::Vpsubsb => (LP::_66, OM::_0F, 0xE8),
2577                AvxOpcode::Vpsubsw => (LP::_66, OM::_0F, 0xE9),
2578                AvxOpcode::Vpsubusb => (LP::_66, OM::_0F, 0xD8),
2579                AvxOpcode::Vpsubusw => (LP::_66, OM::_0F, 0xD9),
2580                AvxOpcode::Vpavgb => (LP::_66, OM::_0F, 0xE0),
2581                AvxOpcode::Vpavgw => (LP::_66, OM::_0F, 0xE3),
2582                AvxOpcode::Vpand => (LP::_66, OM::_0F, 0xDB),
2583                AvxOpcode::Vandps => (LP::None, OM::_0F, 0x54),
2584                AvxOpcode::Vandpd => (LP::_66, OM::_0F, 0x54),
2585                AvxOpcode::Vpor => (LP::_66, OM::_0F, 0xEB),
2586                AvxOpcode::Vorps => (LP::None, OM::_0F, 0x56),
2587                AvxOpcode::Vorpd => (LP::_66, OM::_0F, 0x56),
2588                AvxOpcode::Vpxor => (LP::_66, OM::_0F, 0xEF),
2589                AvxOpcode::Vxorps => (LP::None, OM::_0F, 0x57),
2590                AvxOpcode::Vxorpd => (LP::_66, OM::_0F, 0x57),
2591                AvxOpcode::Vpmullw => (LP::_66, OM::_0F, 0xD5),
2592                AvxOpcode::Vpmulld => (LP::_66, OM::_0F38, 0x40),
2593                AvxOpcode::Vpmulhw => (LP::_66, OM::_0F, 0xE5),
2594                AvxOpcode::Vpmulhrsw => (LP::_66, OM::_0F38, 0x0B),
2595                AvxOpcode::Vpmulhuw => (LP::_66, OM::_0F, 0xE4),
2596                AvxOpcode::Vpmuldq => (LP::_66, OM::_0F38, 0x28),
2597                AvxOpcode::Vpmuludq => (LP::_66, OM::_0F, 0xF4),
2598                AvxOpcode::Vpunpckhwd => (LP::_66, OM::_0F, 0x69),
2599                AvxOpcode::Vpunpcklwd => (LP::_66, OM::_0F, 0x61),
2600                AvxOpcode::Vunpcklps => (LP::None, OM::_0F, 0x14),
2601                AvxOpcode::Vunpckhps => (LP::None, OM::_0F, 0x15),
2602                AvxOpcode::Vaddps => (LP::None, OM::_0F, 0x58),
2603                AvxOpcode::Vaddpd => (LP::_66, OM::_0F, 0x58),
2604                AvxOpcode::Vsubps => (LP::None, OM::_0F, 0x5C),
2605                AvxOpcode::Vsubpd => (LP::_66, OM::_0F, 0x5C),
2606                AvxOpcode::Vmulps => (LP::None, OM::_0F, 0x59),
2607                AvxOpcode::Vmulpd => (LP::_66, OM::_0F, 0x59),
2608                AvxOpcode::Vdivps => (LP::None, OM::_0F, 0x5E),
2609                AvxOpcode::Vdivpd => (LP::_66, OM::_0F, 0x5E),
2610                AvxOpcode::Vpcmpeqb => (LP::_66, OM::_0F, 0x74),
2611                AvxOpcode::Vpcmpeqw => (LP::_66, OM::_0F, 0x75),
2612                AvxOpcode::Vpcmpeqd => (LP::_66, OM::_0F, 0x76),
2613                AvxOpcode::Vpcmpeqq => (LP::_66, OM::_0F38, 0x29),
2614                AvxOpcode::Vpcmpgtb => (LP::_66, OM::_0F, 0x64),
2615                AvxOpcode::Vpcmpgtw => (LP::_66, OM::_0F, 0x65),
2616                AvxOpcode::Vpcmpgtd => (LP::_66, OM::_0F, 0x66),
2617                AvxOpcode::Vpcmpgtq => (LP::_66, OM::_0F38, 0x37),
2618                AvxOpcode::Vmovlhps => (LP::None, OM::_0F, 0x16),
2619                AvxOpcode::Vpminsb => (LP::_66, OM::_0F38, 0x38),
2620                AvxOpcode::Vpminsw => (LP::_66, OM::_0F, 0xEA),
2621                AvxOpcode::Vpminsd => (LP::_66, OM::_0F38, 0x39),
2622                AvxOpcode::Vpmaxsb => (LP::_66, OM::_0F38, 0x3C),
2623                AvxOpcode::Vpmaxsw => (LP::_66, OM::_0F, 0xEE),
2624                AvxOpcode::Vpmaxsd => (LP::_66, OM::_0F38, 0x3D),
2625                AvxOpcode::Vpminub => (LP::_66, OM::_0F, 0xDA),
2626                AvxOpcode::Vpminuw => (LP::_66, OM::_0F38, 0x3A),
2627                AvxOpcode::Vpminud => (LP::_66, OM::_0F38, 0x3B),
2628                AvxOpcode::Vpmaxub => (LP::_66, OM::_0F, 0xDE),
2629                AvxOpcode::Vpmaxuw => (LP::_66, OM::_0F38, 0x3E),
2630                AvxOpcode::Vpmaxud => (LP::_66, OM::_0F38, 0x3F),
2631                AvxOpcode::Vpunpcklbw => (LP::_66, OM::_0F, 0x60),
2632                AvxOpcode::Vpunpckhbw => (LP::_66, OM::_0F, 0x68),
2633                AvxOpcode::Vpacksswb => (LP::_66, OM::_0F, 0x63),
2634                AvxOpcode::Vpackssdw => (LP::_66, OM::_0F, 0x6B),
2635                AvxOpcode::Vpackuswb => (LP::_66, OM::_0F, 0x67),
2636                AvxOpcode::Vpackusdw => (LP::_66, OM::_0F38, 0x2B),
2637                AvxOpcode::Vpmaddwd => (LP::_66, OM::_0F, 0xF5),
2638                AvxOpcode::Vpmaddubsw => (LP::_66, OM::_0F38, 0x04),
2639                AvxOpcode::Vpshufb => (LP::_66, OM::_0F38, 0x00),
2640                AvxOpcode::Vpsllw => (LP::_66, OM::_0F, 0xF1),
2641                AvxOpcode::Vpslld => (LP::_66, OM::_0F, 0xF2),
2642                AvxOpcode::Vpsllq => (LP::_66, OM::_0F, 0xF3),
2643                AvxOpcode::Vpsraw => (LP::_66, OM::_0F, 0xE1),
2644                AvxOpcode::Vpsrad => (LP::_66, OM::_0F, 0xE2),
2645                AvxOpcode::Vaddss => (LP::_F3, OM::_0F, 0x58),
2646                AvxOpcode::Vaddsd => (LP::_F2, OM::_0F, 0x58),
2647                AvxOpcode::Vmulss => (LP::_F3, OM::_0F, 0x59),
2648                AvxOpcode::Vmulsd => (LP::_F2, OM::_0F, 0x59),
2649                AvxOpcode::Vsubss => (LP::_F3, OM::_0F, 0x5C),
2650                AvxOpcode::Vsubsd => (LP::_F2, OM::_0F, 0x5C),
2651                AvxOpcode::Vdivss => (LP::_F3, OM::_0F, 0x5E),
2652                AvxOpcode::Vdivsd => (LP::_F2, OM::_0F, 0x5E),
2653                AvxOpcode::Vminss => (LP::_F3, OM::_0F, 0x5D),
2654                AvxOpcode::Vminsd => (LP::_F2, OM::_0F, 0x5D),
2655                AvxOpcode::Vmaxss => (LP::_F3, OM::_0F, 0x5F),
2656                AvxOpcode::Vmaxsd => (LP::_F2, OM::_0F, 0x5F),
2657                AvxOpcode::Vphaddw => (LP::_66, OM::_0F38, 0x01),
2658                AvxOpcode::Vphaddd => (LP::_66, OM::_0F38, 0x02),
2659                AvxOpcode::Vpunpckldq => (LP::_66, OM::_0F, 0x62),
2660                AvxOpcode::Vpunpckhdq => (LP::_66, OM::_0F, 0x6A),
2661                AvxOpcode::Vpunpcklqdq => (LP::_66, OM::_0F, 0x6C),
2662                AvxOpcode::Vpunpckhqdq => (LP::_66, OM::_0F, 0x6D),
2663                AvxOpcode::Vmovsd => (LP::_F2, OM::_0F, 0x10),
2664                AvxOpcode::Vmovss => (LP::_F3, OM::_0F, 0x10),
2665                AvxOpcode::Vcvtss2sd => (LP::_F3, OM::_0F, 0x5A),
2666                AvxOpcode::Vcvtsd2ss => (LP::_F2, OM::_0F, 0x5A),
2667                AvxOpcode::Vsqrtss => (LP::_F3, OM::_0F, 0x51),
2668                AvxOpcode::Vsqrtsd => (LP::_F2, OM::_0F, 0x51),
2669                AvxOpcode::Vunpcklpd => (LP::_66, OM::_0F, 0x14),
2670                _ => panic!("unexpected rmir vex opcode {op:?}"),
2671            };
2672            VexInstruction::new()
2673                .length(VexVectorLength::V128)
2674                .prefix(prefix)
2675                .map(map)
2676                .opcode(opcode)
2677                .reg(dst.to_real_reg().unwrap().hw_enc())
2678                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2679                .rm(src2)
2680                .encode(sink);
2681        }
2682
2683        Inst::XmmRmRImmVex {
2684            op,
2685            src1,
2686            src2,
2687            dst,
2688            imm,
2689        } => {
2690            let dst = dst.to_reg().to_reg();
2691            let src1 = src1.to_reg();
2692            let src2 = match src2.clone().to_reg_mem().clone() {
2693                RegMem::Reg { reg } => {
2694                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2695                }
2696                RegMem::Mem { addr } => {
2697                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2698                }
2699            };
2700
2701            let (w, prefix, map, opcode) = match op {
2702                AvxOpcode::Vcmpps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC2),
2703                AvxOpcode::Vcmppd => (false, LegacyPrefixes::_66, OpcodeMap::_0F, 0xC2),
2704                AvxOpcode::Vpalignr => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0F),
2705                AvxOpcode::Vinsertps => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x21),
2706                AvxOpcode::Vshufps => (false, LegacyPrefixes::None, OpcodeMap::_0F, 0xC6),
2707                AvxOpcode::Vpblendw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0E),
2708                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
2709            };
2710
2711            VexInstruction::new()
2712                .length(VexVectorLength::V128)
2713                .prefix(prefix)
2714                .map(map)
2715                .w(w)
2716                .opcode(opcode)
2717                .reg(dst.to_real_reg().unwrap().hw_enc())
2718                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2719                .rm(src2)
2720                .imm(*imm)
2721                .encode(sink);
2722        }
2723
2724        Inst::XmmVexPinsr {
2725            op,
2726            src1,
2727            src2,
2728            dst,
2729            imm,
2730        } => {
2731            let dst = dst.to_reg().to_reg();
2732            let src1 = src1.to_reg();
2733            let src2 = match src2.clone().to_reg_mem().clone() {
2734                RegMem::Reg { reg } => {
2735                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2736                }
2737                RegMem::Mem { addr } => {
2738                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2739                }
2740            };
2741
2742            let (w, map, opcode) = match op {
2743                AvxOpcode::Vpinsrb => (false, OpcodeMap::_0F3A, 0x20),
2744                AvxOpcode::Vpinsrw => (false, OpcodeMap::_0F, 0xC4),
2745                AvxOpcode::Vpinsrd => (false, OpcodeMap::_0F3A, 0x22),
2746                AvxOpcode::Vpinsrq => (true, OpcodeMap::_0F3A, 0x22),
2747                _ => panic!("unexpected vex_pinsr opcode {op:?}"),
2748            };
2749
2750            VexInstruction::new()
2751                .length(VexVectorLength::V128)
2752                .prefix(LegacyPrefixes::_66)
2753                .map(map)
2754                .w(w)
2755                .opcode(opcode)
2756                .reg(dst.to_real_reg().unwrap().hw_enc())
2757                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2758                .rm(src2)
2759                .imm(*imm)
2760                .encode(sink);
2761        }
2762
2763        Inst::XmmRmRVex3 {
2764            op,
2765            src1,
2766            src2,
2767            src3,
2768            dst,
2769        } => {
2770            let src1 = src1.to_reg();
2771            let dst = dst.to_reg().to_reg();
2772            debug_assert_eq!(src1, dst);
2773            let src2 = src2.to_reg();
2774            let src3 = match src3.clone().to_reg_mem().clone() {
2775                RegMem::Reg { reg } => {
2776                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2777                }
2778                RegMem::Mem { addr } => {
2779                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2780                }
2781            };
2782
2783            let (w, map, opcode) = match op {
2784                AvxOpcode::Vfmadd132ss => (false, OpcodeMap::_0F38, 0x99),
2785                AvxOpcode::Vfmadd213ss => (false, OpcodeMap::_0F38, 0xA9),
2786                AvxOpcode::Vfnmadd132ss => (false, OpcodeMap::_0F38, 0x9D),
2787                AvxOpcode::Vfnmadd213ss => (false, OpcodeMap::_0F38, 0xAD),
2788                AvxOpcode::Vfmadd132sd => (true, OpcodeMap::_0F38, 0x99),
2789                AvxOpcode::Vfmadd213sd => (true, OpcodeMap::_0F38, 0xA9),
2790                AvxOpcode::Vfnmadd132sd => (true, OpcodeMap::_0F38, 0x9D),
2791                AvxOpcode::Vfnmadd213sd => (true, OpcodeMap::_0F38, 0xAD),
2792                AvxOpcode::Vfmadd132ps => (false, OpcodeMap::_0F38, 0x98),
2793                AvxOpcode::Vfmadd213ps => (false, OpcodeMap::_0F38, 0xA8),
2794                AvxOpcode::Vfnmadd132ps => (false, OpcodeMap::_0F38, 0x9C),
2795                AvxOpcode::Vfnmadd213ps => (false, OpcodeMap::_0F38, 0xAC),
2796                AvxOpcode::Vfmadd132pd => (true, OpcodeMap::_0F38, 0x98),
2797                AvxOpcode::Vfmadd213pd => (true, OpcodeMap::_0F38, 0xA8),
2798                AvxOpcode::Vfnmadd132pd => (true, OpcodeMap::_0F38, 0x9C),
2799                AvxOpcode::Vfnmadd213pd => (true, OpcodeMap::_0F38, 0xAC),
2800                AvxOpcode::Vfmsub132ss => (false, OpcodeMap::_0F38, 0x9B),
2801                AvxOpcode::Vfmsub213ss => (false, OpcodeMap::_0F38, 0xAB),
2802                AvxOpcode::Vfnmsub132ss => (false, OpcodeMap::_0F38, 0x9F),
2803                AvxOpcode::Vfnmsub213ss => (false, OpcodeMap::_0F38, 0xAF),
2804                AvxOpcode::Vfmsub132sd => (true, OpcodeMap::_0F38, 0x9B),
2805                AvxOpcode::Vfmsub213sd => (true, OpcodeMap::_0F38, 0xAB),
2806                AvxOpcode::Vfnmsub132sd => (true, OpcodeMap::_0F38, 0x9F),
2807                AvxOpcode::Vfnmsub213sd => (true, OpcodeMap::_0F38, 0xAF),
2808                AvxOpcode::Vfmsub132ps => (false, OpcodeMap::_0F38, 0x9A),
2809                AvxOpcode::Vfmsub213ps => (false, OpcodeMap::_0F38, 0xAA),
2810                AvxOpcode::Vfnmsub132ps => (false, OpcodeMap::_0F38, 0x9E),
2811                AvxOpcode::Vfnmsub213ps => (false, OpcodeMap::_0F38, 0xAE),
2812                AvxOpcode::Vfmsub132pd => (true, OpcodeMap::_0F38, 0x9A),
2813                AvxOpcode::Vfmsub213pd => (true, OpcodeMap::_0F38, 0xAA),
2814                AvxOpcode::Vfnmsub132pd => (true, OpcodeMap::_0F38, 0x9E),
2815                AvxOpcode::Vfnmsub213pd => (true, OpcodeMap::_0F38, 0xAE),
2816                AvxOpcode::Vblendvps => (false, OpcodeMap::_0F3A, 0x4A),
2817                AvxOpcode::Vblendvpd => (false, OpcodeMap::_0F3A, 0x4B),
2818                AvxOpcode::Vpblendvb => (false, OpcodeMap::_0F3A, 0x4C),
2819                _ => unreachable!(),
2820            };
2821
2822            VexInstruction::new()
2823                .length(VexVectorLength::V128)
2824                .prefix(LegacyPrefixes::_66)
2825                .map(map)
2826                .w(w)
2827                .opcode(opcode)
2828                .reg(dst.to_real_reg().unwrap().hw_enc())
2829                .rm(src3)
2830                .vvvv(src2.to_real_reg().unwrap().hw_enc())
2831                .encode(sink);
2832        }
2833
2834        Inst::XmmRmRBlendVex {
2835            op,
2836            src1,
2837            src2,
2838            mask,
2839            dst,
2840        } => {
2841            let dst = dst.to_reg().to_reg();
2842            let src1 = src1.to_reg();
2843            let src2 = match src2.clone().to_reg_mem().clone() {
2844                RegMem::Reg { reg } => {
2845                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2846                }
2847                RegMem::Mem { addr } => {
2848                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2849                }
2850            };
2851            let mask = mask.to_reg();
2852
2853            let opcode = match op {
2854                AvxOpcode::Vblendvps => 0x4A,
2855                AvxOpcode::Vblendvpd => 0x4B,
2856                AvxOpcode::Vpblendvb => 0x4C,
2857                _ => unreachable!(),
2858            };
2859
2860            VexInstruction::new()
2861                .length(VexVectorLength::V128)
2862                .prefix(LegacyPrefixes::_66)
2863                .map(OpcodeMap::_0F3A)
2864                .opcode(opcode)
2865                .reg(dst.to_real_reg().unwrap().hw_enc())
2866                .vvvv(src1.to_real_reg().unwrap().hw_enc())
2867                .rm(src2)
2868                .imm(mask.to_real_reg().unwrap().hw_enc() << 4)
2869                .encode(sink);
2870        }
2871
2872        Inst::XmmUnaryRmRVex { op, src, dst } => {
2873            let dst = dst.to_reg().to_reg();
2874            let src = match src.clone().to_reg_mem().clone() {
2875                RegMem::Reg { reg } => {
2876                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
2877                }
2878                RegMem::Mem { addr } => {
2879                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
2880                }
2881            };
2882
2883            let (prefix, map, opcode) = match op {
2884                AvxOpcode::Vpmovsxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x20),
2885                AvxOpcode::Vpmovzxbw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x30),
2886                AvxOpcode::Vpmovsxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x23),
2887                AvxOpcode::Vpmovzxwd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x33),
2888                AvxOpcode::Vpmovsxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x25),
2889                AvxOpcode::Vpmovzxdq => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x35),
2890                AvxOpcode::Vpabsb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1C),
2891                AvxOpcode::Vpabsw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1D),
2892                AvxOpcode::Vpabsd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x1E),
2893                AvxOpcode::Vsqrtps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x51),
2894                AvxOpcode::Vsqrtpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x51),
2895                AvxOpcode::Vcvtdq2pd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0xE6),
2896                AvxOpcode::Vcvtdq2ps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5B),
2897                AvxOpcode::Vcvtpd2ps => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x5A),
2898                AvxOpcode::Vcvtps2pd => (LegacyPrefixes::None, OpcodeMap::_0F, 0x5A),
2899                AvxOpcode::Vcvttpd2dq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xE6),
2900                AvxOpcode::Vcvttps2dq => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5B),
2901                AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x6F),
2902                AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x10),
2903                AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x10),
2904
2905                // Note that for `vmov{s,d}` the `inst.isle` rules should
2906                // statically ensure that only `Amode` operands are used here.
2907                // Otherwise the other encodings of `vmovss` are more like
2908                // 2-operand instructions which this unary encoding does not
2909                // have.
2910                AvxOpcode::Vmovss => match &src {
2911                    RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x10),
2912                    _ => unreachable!(),
2913                },
2914                AvxOpcode::Vmovsd => match &src {
2915                    RegisterOrAmode::Amode(_) => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x10),
2916                    _ => unreachable!(),
2917                },
2918
2919                AvxOpcode::Vpbroadcastb => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x78),
2920                AvxOpcode::Vpbroadcastw => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x79),
2921                AvxOpcode::Vpbroadcastd => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x58),
2922                AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
2923                AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
2924
2925                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
2926            };
2927
2928            VexInstruction::new()
2929                .length(VexVectorLength::V128)
2930                .prefix(prefix)
2931                .map(map)
2932                .opcode(opcode)
2933                .reg(dst.to_real_reg().unwrap().hw_enc())
2934                .rm(src)
2935                .encode(sink);
2936        }
2937
        // VEX-encoded unary XMM operation with a trailing 8-bit immediate:
        // `dst` goes in ModRM.reg, `src` (register or memory) in ModRM.rm.
        Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
            let dst = dst.to_reg().to_reg();
            let src = match src.clone().to_reg_mem().clone() {
                RegMem::Reg { reg } => {
                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
                }
                RegMem::Mem { addr } => {
                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
                }
            };

            let (prefix, map, opcode) = match op {
                AvxOpcode::Vroundps => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x08),
                AvxOpcode::Vroundpd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x09),
                AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
                AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
                AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
                _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
            };

            let vex = VexInstruction::new()
                .length(VexVectorLength::V128)
                .prefix(prefix)
                .map(map)
                .opcode(opcode)
                .reg(dst.to_real_reg().unwrap().hw_enc())
                .rm(src)
                .imm(*imm);

            // The scalar rounding forms take an extra source operand through
            // `vvvv`; per the Intel SDM the upper bits of the result are
            // copied from that operand. This instruction shape has no field
            // for it, so the destination register itself is passed as `vvvv`.
            // All other opcodes here leave `vvvv` unset.
            let vex = match op {
                AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
                    vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
                }
                _ => vex,
            };
            vex.encode(sink);
        }
2979
2980        Inst::XmmMovRMVex { op, src, dst } => {
2981            let src = src.to_reg();
2982            let dst = dst.clone().finalize(state.frame_layout(), sink);
2983
2984            let (prefix, map, opcode) = match op {
2985                AvxOpcode::Vmovdqu => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x7F),
2986                AvxOpcode::Vmovss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x11),
2987                AvxOpcode::Vmovsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x11),
2988                AvxOpcode::Vmovups => (LegacyPrefixes::None, OpcodeMap::_0F, 0x11),
2989                AvxOpcode::Vmovupd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x11),
2990                _ => unimplemented!("Opcode {:?} not implemented", op),
2991            };
2992            VexInstruction::new()
2993                .length(VexVectorLength::V128)
2994                .prefix(prefix)
2995                .map(map)
2996                .opcode(opcode)
2997                .rm(dst)
2998                .reg(src.to_real_reg().unwrap().hw_enc())
2999                .encode(sink);
3000        }
3001
3002        Inst::XmmMovRMImmVex { op, src, dst, imm } => {
3003            let src = src.to_reg();
3004            let dst = dst.clone().finalize(state.frame_layout(), sink);
3005
3006            let (w, prefix, map, opcode) = match op {
3007                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
3008                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
3009                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
3010                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
3011                _ => unimplemented!("Opcode {:?} not implemented", op),
3012            };
3013            VexInstruction::new()
3014                .length(VexVectorLength::V128)
3015                .w(w)
3016                .prefix(prefix)
3017                .map(map)
3018                .opcode(opcode)
3019                .rm(dst)
3020                .reg(src.to_real_reg().unwrap().hw_enc())
3021                .imm(*imm)
3022                .encode(sink);
3023        }
3024
3025        Inst::XmmToGprImmVex { op, src, dst, imm } => {
3026            let src = src.to_reg();
3027            let dst = dst.to_reg().to_reg();
3028
3029            let (w, prefix, map, opcode) = match op {
3030                AvxOpcode::Vpextrb => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x14),
3031                AvxOpcode::Vpextrw => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x15),
3032                AvxOpcode::Vpextrd => (false, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
3033                AvxOpcode::Vpextrq => (true, LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x16),
3034                _ => unimplemented!("Opcode {:?} not implemented", op),
3035            };
3036            VexInstruction::new()
3037                .length(VexVectorLength::V128)
3038                .w(w)
3039                .prefix(prefix)
3040                .map(map)
3041                .opcode(opcode)
3042                .rm(dst.to_real_reg().unwrap().hw_enc())
3043                .reg(src.to_real_reg().unwrap().hw_enc())
3044                .imm(*imm)
3045                .encode(sink);
3046        }
3047
3048        Inst::XmmToGprVex {
3049            op,
3050            src,
3051            dst,
3052            dst_size,
3053        } => {
3054            let src = src.to_reg();
3055            let dst = dst.to_reg().to_reg();
3056
3057            let (prefix, map, opcode) = match op {
3058                // vmovd/vmovq are differentiated by `w`
3059                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x7E),
3060                AvxOpcode::Vmovmskps => (LegacyPrefixes::None, OpcodeMap::_0F, 0x50),
3061                AvxOpcode::Vmovmskpd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x50),
3062                AvxOpcode::Vpmovmskb => (LegacyPrefixes::_66, OpcodeMap::_0F, 0xD7),
3063                _ => unimplemented!("Opcode {:?} not implemented", op),
3064            };
3065            let w = match dst_size {
3066                OperandSize::Size64 => true,
3067                _ => false,
3068            };
3069            let mut vex = VexInstruction::new()
3070                .length(VexVectorLength::V128)
3071                .w(w)
3072                .prefix(prefix)
3073                .map(map)
3074                .opcode(opcode);
3075            vex = match op {
3076                // The `vmovq/vmovd` reverse the order of the destination/source
3077                // relative to other opcodes using this shape of instruction.
3078                AvxOpcode::Vmovd | AvxOpcode::Vmovq => vex
3079                    .rm(dst.to_real_reg().unwrap().hw_enc())
3080                    .reg(src.to_real_reg().unwrap().hw_enc()),
3081                _ => vex
3082                    .rm(src.to_real_reg().unwrap().hw_enc())
3083                    .reg(dst.to_real_reg().unwrap().hw_enc()),
3084            };
3085            vex.encode(sink);
3086        }
3087
3088        Inst::GprToXmmVex {
3089            op,
3090            src,
3091            dst,
3092            src_size,
3093        } => {
3094            let dst = dst.to_reg().to_reg();
3095            let src = match src.clone().to_reg_mem().clone() {
3096                RegMem::Reg { reg } => {
3097                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
3098                }
3099                RegMem::Mem { addr } => {
3100                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
3101                }
3102            };
3103
3104            let (prefix, map, opcode) = match op {
3105                // vmovd/vmovq are differentiated by `w`
3106                AvxOpcode::Vmovd | AvxOpcode::Vmovq => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x6E),
3107                _ => unimplemented!("Opcode {:?} not implemented", op),
3108            };
3109            let w = match src_size {
3110                OperandSize::Size64 => true,
3111                _ => false,
3112            };
3113            VexInstruction::new()
3114                .length(VexVectorLength::V128)
3115                .w(w)
3116                .prefix(prefix)
3117                .map(map)
3118                .opcode(opcode)
3119                .rm(src)
3120                .reg(dst.to_real_reg().unwrap().hw_enc())
3121                .encode(sink);
3122        }
3123
3124        Inst::XmmCmpRmRVex { op, src1, src2 } => {
3125            let src1 = src1.to_reg();
3126            let src2 = match src2.clone().to_reg_mem().clone() {
3127                RegMem::Reg { reg } => {
3128                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
3129                }
3130                RegMem::Mem { addr } => {
3131                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
3132                }
3133            };
3134
3135            let (prefix, map, opcode) = match op {
3136                AvxOpcode::Vucomiss => (LegacyPrefixes::None, OpcodeMap::_0F, 0x2E),
3137                AvxOpcode::Vucomisd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x2E),
3138                AvxOpcode::Vptest => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x17),
3139                _ => unimplemented!("Opcode {:?} not implemented", op),
3140            };
3141
3142            VexInstruction::new()
3143                .length(VexVectorLength::V128)
3144                .prefix(prefix)
3145                .map(map)
3146                .opcode(opcode)
3147                .rm(src2)
3148                .reg(src1.to_real_reg().unwrap().hw_enc())
3149                .encode(sink);
3150        }
3151
        // EVEX-encoded XMM operations. Both instruction shapes share one body:
        // for `XmmRmREvex3` the fields `src2`/`src3` are bound to the local
        // names `src1`/`src2`, so the code below sees the same two operands
        // in either case.
        Inst::XmmRmREvex {
            op,
            src1,
            src2,
            dst,
        }
        | Inst::XmmRmREvex3 {
            op,
            src1: _, // `dst` reuses `src1`.
            src2: src1,
            src3: src2,
            dst,
        } => {
            // Re-match on `inst` to recover the real `src1` of the
            // three-source form, which the pattern above discarded; it is
            // only used for the debug assertion below.
            let reused_src = match inst {
                Inst::XmmRmREvex3 { src1, .. } => Some(src1.to_reg()),
                _ => None,
            };
            let src1 = src1.to_reg();
            // The last source goes in ModRM.rm as a register or memory operand.
            let src2 = match src2.clone().to_reg_mem().clone() {
                RegMem::Reg { reg } => {
                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
                }
                RegMem::Mem { addr } => {
                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
                }
            };
            let dst = dst.to_reg().to_reg();
            // In the three-source form the first source must be the same
            // register as the destination (checked in debug builds only).
            if let Some(src1) = reused_src {
                debug_assert_eq!(src1, dst);
            }

            // (EVEX.W, opcode byte, opcode map) per supported opcode.
            let (w, opcode, map) = match op {
                Avx512Opcode::Vpermi2b => (false, 0x75, OpcodeMap::_0F38),
                Avx512Opcode::Vpmullq => (true, 0x40, OpcodeMap::_0F38),
                Avx512Opcode::Vpsraq => (true, 0xE2, OpcodeMap::_0F),
                _ => unimplemented!("Opcode {:?} not implemented", op),
            };
            EvexInstruction::new()
                .length(EvexVectorLength::V128)
                .prefix(LegacyPrefixes::_66)
                .map(map)
                .w(w)
                .opcode(opcode)
                .tuple_type(op.tuple_type())
                .reg(dst.to_real_reg().unwrap().hw_enc())
                .vvvvv(src1.to_real_reg().unwrap().hw_enc())
                .rm(src2)
                .encode(sink);
        }
3201
        // Pseudo-instruction expanding to a scalar float min/max sequence
        // that handles the zero/negative-zero and NaN cases explicitly.
        // `rhs` must be the same register as `dst` (debug-asserted below).
        Inst::XmmMinMaxSeq {
            size,
            is_min,
            lhs,
            rhs,
            dst,
        } => {
            let rhs = rhs.to_reg();
            let lhs = lhs.to_reg();
            let dst = dst.to_writable_reg();
            debug_assert_eq!(rhs, dst.to_reg());

            // Generates the following sequence:
            // ucomis{s,d} %lhs, %rhs_dst
            // jnz do_min_max
            // jp propagate_nan
            //
            // ;; ordered and equal: propagate the sign bit (for -0 vs 0):
            // {and,or}{ps,pd} %lhs, %rhs_dst
            // j done
            //
            // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the
            // ;; NaN value is returned), we add both inputs.
            // propagate_nan:
            // add{ss,sd} %lhs, %rhs_dst
            // jp done
            //
            // do_min_max:
            // {min,max}{ss,sd} %lhs, %rhs_dst
            //
            // done:
            let done = sink.get_label();
            let propagate_nan = sink.get_label();
            let do_min_max = sink.get_label();

            // Select the 32-bit or 64-bit flavor of each opcode used in the
            // sequence; other operand sizes are not supported.
            let (add_op, cmp_op, and_op, or_op, min_max_op) = match size {
                OperandSize::Size32 => (
                    SseOpcode::Addss,
                    SseOpcode::Ucomiss,
                    SseOpcode::Andps,
                    SseOpcode::Orps,
                    if *is_min {
                        SseOpcode::Minss
                    } else {
                        SseOpcode::Maxss
                    },
                ),
                OperandSize::Size64 => (
                    SseOpcode::Addsd,
                    SseOpcode::Ucomisd,
                    SseOpcode::Andpd,
                    SseOpcode::Orpd,
                    if *is_min {
                        SseOpcode::Minsd
                    } else {
                        SseOpcode::Maxsd
                    },
                ),
                _ => unreachable!(),
            };

            let inst = Inst::xmm_cmp_rm_r(cmp_op, dst.to_reg(), RegMem::reg(lhs));
            inst.emit(sink, info, state);

            one_way_jmp(sink, CC::NZ, do_min_max);
            one_way_jmp(sink, CC::P, propagate_nan);

            // Ordered and equal. The operands are bit-identical unless they are zero
            // and negative zero. These instructions merge the sign bits in that
            // case, and are no-ops otherwise.
            let op = if *is_min { or_op } else { and_op };
            let inst = Inst::xmm_rm_r(op, RegMem::reg(lhs), dst);
            inst.emit(sink, info, state);

            let inst = Inst::jmp_known(done);
            inst.emit(sink, info, state);

            // x86's min/max are not symmetric; if either operand is a NaN, they return the
            // read-only operand: perform an addition between the two operands, which has the
            // desired NaN propagation effects.
            sink.bind_label(propagate_nan, state.ctrl_plane_mut());
            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(lhs), dst);
            inst.emit(sink, info, state);

            // NOTE(review): this is a conditional jump on the parity flag
            // rather than an unconditional one. This block is only entered
            // via the `jp` above, so the jump is presumably always taken as
            // long as the addition leaves the flags untouched; confirm.
            one_way_jmp(sink, CC::P, done);

            sink.bind_label(do_min_max, state.ctrl_plane_mut());

            let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(lhs), dst);
            inst.emit(sink, info, state);

            sink.bind_label(done, state.ctrl_plane_mut());
        }
3295
3296        Inst::XmmRmRImm {
3297            op,
3298            src1,
3299            src2,
3300            dst,
3301            imm,
3302            size,
3303        } => {
3304            let src1 = *src1;
3305            let dst = dst.to_reg();
3306            let src2 = src2.clone();
3307            debug_assert_eq!(src1, dst);
3308
3309            let (prefix, opcode, len) = match op {
3310                SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2),
3311                SseOpcode::Cmppd => (LegacyPrefixes::_66, 0x0FC2, 2),
3312                SseOpcode::Cmpss => (LegacyPrefixes::_F3, 0x0FC2, 2),
3313                SseOpcode::Cmpsd => (LegacyPrefixes::_F2, 0x0FC2, 2),
3314                SseOpcode::Insertps => (LegacyPrefixes::_66, 0x0F3A21, 3),
3315                SseOpcode::Palignr => (LegacyPrefixes::_66, 0x0F3A0F, 3),
3316                SseOpcode::Pinsrb => (LegacyPrefixes::_66, 0x0F3A20, 3),
3317                SseOpcode::Pinsrw => (LegacyPrefixes::_66, 0x0FC4, 2),
3318                SseOpcode::Pinsrd => (LegacyPrefixes::_66, 0x0F3A22, 3),
3319                SseOpcode::Shufps => (LegacyPrefixes::None, 0x0FC6, 2),
3320                SseOpcode::Pblendw => (LegacyPrefixes::_66, 0x0F3A0E, 3),
3321                _ => unimplemented!("Opcode {:?} not implemented", op),
3322            };
3323            let rex = RexFlags::from(*size);
3324            let regs_swapped = match *op {
3325                // These opcodes (and not the SSE2 version of PEXTRW) flip the operand
3326                // encoding: `dst` in ModRM's r/m, `src` in ModRM's reg field.
3327                SseOpcode::Pextrb | SseOpcode::Pextrd => true,
3328                // The rest of the opcodes have the customary encoding: `dst` in ModRM's reg,
3329                // `src` in ModRM's r/m field.
3330                _ => false,
3331            };
3332            match src2 {
3333                RegMem::Reg { reg } => {
3334                    if regs_swapped {
3335                        emit_std_reg_reg(sink, prefix, opcode, len, reg, dst, rex);
3336                    } else {
3337                        emit_std_reg_reg(sink, prefix, opcode, len, dst, reg, rex);
3338                    }
3339                }
3340                RegMem::Mem { addr } => {
3341                    let addr = &addr.finalize(state.frame_layout(), sink);
3342                    assert!(
3343                        !regs_swapped,
3344                        "No existing way to encode a mem argument in the ModRM r/m field."
3345                    );
3346                    // N.B.: bytes_at_end == 1, because of the `imm` byte below.
3347                    emit_std_reg_mem(sink, prefix, opcode, len, dst, addr, rex, 1);
3348                }
3349            }
3350            sink.put1(*imm);
3351        }
3352
        Inst::XmmUninitializedValue { .. } => {
            // This instruction format only exists to declare a register as a `def`; no code is
            // emitted, so nothing is written to `sink` here.
        }
3357
3358        Inst::XmmMovRM { op, src, dst } => {
3359            let src = src.to_reg();
3360            let dst = dst.clone();
3361
3362            let (prefix, opcode) = match op {
3363                SseOpcode::Movaps => (LegacyPrefixes::None, 0x0F29),
3364                SseOpcode::Movapd => (LegacyPrefixes::_66, 0x0F29),
3365                SseOpcode::Movdqu => (LegacyPrefixes::_F3, 0x0F7F),
3366                SseOpcode::Movss => (LegacyPrefixes::_F3, 0x0F11),
3367                SseOpcode::Movsd => (LegacyPrefixes::_F2, 0x0F11),
3368                SseOpcode::Movups => (LegacyPrefixes::None, 0x0F11),
3369                SseOpcode::Movupd => (LegacyPrefixes::_66, 0x0F11),
3370                _ => unimplemented!("Opcode {:?} not implemented", op),
3371            };
3372            let dst = &dst.finalize(state.frame_layout(), sink);
3373            emit_std_reg_mem(sink, prefix, opcode, 2, src, dst, RexFlags::clear_w(), 0);
3374        }
3375
3376        Inst::XmmMovRMImm { op, src, dst, imm } => {
3377            let src = src.to_reg();
3378            let dst = dst.clone();
3379
3380            let (w, prefix, opcode) = match op {
3381                SseOpcode::Pextrb => (false, LegacyPrefixes::_66, 0x0F3A14),
3382                SseOpcode::Pextrw => (false, LegacyPrefixes::_66, 0x0F3A15),
3383                SseOpcode::Pextrd => (false, LegacyPrefixes::_66, 0x0F3A16),
3384                SseOpcode::Pextrq => (true, LegacyPrefixes::_66, 0x0F3A16),
3385                _ => unimplemented!("Opcode {:?} not implemented", op),
3386            };
3387            let rex = if w {
3388                RexFlags::set_w()
3389            } else {
3390                RexFlags::clear_w()
3391            };
3392            let dst = &dst.finalize(state.frame_layout(), sink);
3393            emit_std_reg_mem(sink, prefix, opcode, 3, src, dst, rex, 1);
3394            sink.put1(*imm);
3395        }
3396
3397        Inst::XmmToGpr {
3398            op,
3399            src,
3400            dst,
3401            dst_size,
3402        } => {
3403            let src = src.to_reg();
3404            let dst = dst.to_reg().to_reg();
3405
3406            let (prefix, opcode, dst_first) = match op {
3407                SseOpcode::Cvttss2si => (LegacyPrefixes::_F3, 0x0F2C, true),
3408                SseOpcode::Cvttsd2si => (LegacyPrefixes::_F2, 0x0F2C, true),
3409                // Movd and movq use the same opcode; the presence of the REX prefix (set below)
3410                // actually determines which is used.
3411                SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F7E, false),
3412                SseOpcode::Movmskps => (LegacyPrefixes::None, 0x0F50, true),
3413                SseOpcode::Movmskpd => (LegacyPrefixes::_66, 0x0F50, true),
3414                SseOpcode::Pmovmskb => (LegacyPrefixes::_66, 0x0FD7, true),
3415                _ => panic!("unexpected opcode {op:?}"),
3416            };
3417            let rex = RexFlags::from(*dst_size);
3418            let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
3419
3420            emit_std_reg_reg(sink, prefix, opcode, 2, src, dst, rex);
3421        }
3422
3423        Inst::XmmToGprImm { op, src, dst, imm } => {
3424            use OperandSize as OS;
3425
3426            let src = src.to_reg();
3427            let dst = dst.to_reg().to_reg();
3428
3429            let (prefix, opcode, opcode_bytes, dst_size, dst_first) = match op {
3430                SseOpcode::Pextrb => (LegacyPrefixes::_66, 0x0F3A14, 3, OS::Size32, false),
3431                SseOpcode::Pextrw => (LegacyPrefixes::_66, 0x0FC5, 2, OS::Size32, true),
3432                SseOpcode::Pextrd => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size32, false),
3433                SseOpcode::Pextrq => (LegacyPrefixes::_66, 0x0F3A16, 3, OS::Size64, false),
3434                _ => panic!("unexpected opcode {op:?}"),
3435            };
3436            let rex = RexFlags::from(dst_size);
3437            let (src, dst) = if dst_first { (dst, src) } else { (src, dst) };
3438
3439            emit_std_reg_reg(sink, prefix, opcode, opcode_bytes, src, dst, rex);
3440            sink.put1(*imm);
3441        }
3442
3443        Inst::GprToXmm {
3444            op,
3445            src: src_e,
3446            dst: reg_g,
3447            src_size,
3448        } => {
3449            let reg_g = reg_g.to_reg().to_reg();
3450            let src_e = src_e.clone().to_reg_mem().clone();
3451
3452            let (prefix, opcode) = match op {
3453                // Movd and movq use the same opcode; the presence of the REX prefix (set below)
3454                // actually determines which is used.
3455                SseOpcode::Movd | SseOpcode::Movq => (LegacyPrefixes::_66, 0x0F6E),
3456                _ => panic!("unexpected opcode {op:?}"),
3457            };
3458            let rex = RexFlags::from(*src_size);
3459            match src_e {
3460                RegMem::Reg { reg: reg_e } => {
3461                    emit_std_reg_reg(sink, prefix, opcode, 2, reg_g, reg_e, rex);
3462                }
3463                RegMem::Mem { addr } => {
3464                    let addr = &addr.finalize(state.frame_layout(), sink);
3465                    emit_std_reg_mem(sink, prefix, opcode, 2, reg_g, addr, rex, 0);
3466                }
3467            }
3468        }
3469
3470        Inst::XmmCmpRmR { op, src1, src2 } => {
3471            let src1 = src1.to_reg();
3472            let src2 = src2.clone().to_reg_mem().clone();
3473
3474            let rex = RexFlags::clear_w();
3475            let (prefix, opcode, len) = match op {
3476                SseOpcode::Ptest => (LegacyPrefixes::_66, 0x0F3817, 3),
3477                SseOpcode::Ucomisd => (LegacyPrefixes::_66, 0x0F2E, 2),
3478                SseOpcode::Ucomiss => (LegacyPrefixes::None, 0x0F2E, 2),
3479                _ => unimplemented!("Emit xmm cmp rm r"),
3480            };
3481
3482            match src2 {
3483                RegMem::Reg { reg } => {
3484                    emit_std_reg_reg(sink, prefix, opcode, len, src1, reg, rex);
3485                }
3486                RegMem::Mem { addr } => {
3487                    let addr = &addr.finalize(state.frame_layout(), sink);
3488                    emit_std_reg_mem(sink, prefix, opcode, len, src1, addr, rex, 0);
3489                }
3490            }
3491        }
3492
3493        Inst::CvtIntToFloat {
3494            op,
3495            src1,
3496            src2,
3497            dst,
3498            src2_size,
3499        } => {
3500            let src1 = src1.to_reg();
3501            let dst = dst.to_reg().to_reg();
3502            assert_eq!(src1, dst);
3503            let src2 = src2.clone().to_reg_mem().clone();
3504
3505            let (prefix, opcode) = match op {
3506                SseOpcode::Cvtsi2ss => (LegacyPrefixes::_F3, 0x0F2A),
3507                SseOpcode::Cvtsi2sd => (LegacyPrefixes::_F2, 0x0F2A),
3508                _ => panic!("unexpected opcode {op:?}"),
3509            };
3510            let rex = RexFlags::from(*src2_size);
3511            match src2 {
3512                RegMem::Reg { reg: src2 } => {
3513                    emit_std_reg_reg(sink, prefix, opcode, 2, dst, src2, rex);
3514                }
3515                RegMem::Mem { addr } => {
3516                    let addr = &addr.finalize(state.frame_layout(), sink);
3517                    emit_std_reg_mem(sink, prefix, opcode, 2, dst, addr, rex, 0);
3518                }
3519            }
3520        }
3521
3522        Inst::CvtIntToFloatVex {
3523            op,
3524            src1,
3525            src2,
3526            dst,
3527            src2_size,
3528        } => {
3529            let dst = dst.to_reg().to_reg();
3530            let src1 = src1.to_reg();
3531            let src2 = match src2.clone().to_reg_mem().clone() {
3532                RegMem::Reg { reg } => {
3533                    RegisterOrAmode::Register(reg.to_real_reg().unwrap().hw_enc().into())
3534                }
3535                RegMem::Mem { addr } => {
3536                    RegisterOrAmode::Amode(addr.finalize(state.frame_layout(), sink))
3537                }
3538            };
3539
3540            let (prefix, map, opcode) = match op {
3541                AvxOpcode::Vcvtsi2ss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x2A),
3542                AvxOpcode::Vcvtsi2sd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x2A),
3543                _ => unimplemented!("Opcode {:?} not implemented", op),
3544            };
3545            let w = match src2_size {
3546                OperandSize::Size64 => true,
3547                _ => false,
3548            };
3549            VexInstruction::new()
3550                .length(VexVectorLength::V128)
3551                .w(w)
3552                .prefix(prefix)
3553                .map(map)
3554                .opcode(opcode)
3555                .rm(src2)
3556                .reg(dst.to_real_reg().unwrap().hw_enc())
3557                .vvvv(src1.to_real_reg().unwrap().hw_enc())
3558                .encode(sink);
3559        }
3560
3561        Inst::CvtUint64ToFloatSeq {
3562            dst_size,
3563            src,
3564            dst,
3565            tmp_gpr1,
3566            tmp_gpr2,
3567        } => {
3568            let src = src.to_reg();
3569            let dst = dst.to_writable_reg();
3570            let tmp_gpr1 = tmp_gpr1.to_writable_reg();
3571            let tmp_gpr2 = tmp_gpr2.to_writable_reg();
3572
3573            // Note: this sequence is specific to 64-bit mode; a 32-bit mode would require a
3574            // different sequence.
3575            //
3576            // Emit the following sequence:
3577            //
3578            //  cmp 0, %src
3579            //  jl handle_negative
3580            //
3581            //  ;; handle positive, which can't overflow
3582            //  cvtsi2sd/cvtsi2ss %src, %dst
3583            //  j done
3584            //
3585            //  ;; handle negative: see below for an explanation of what it's doing.
3586            //  handle_negative:
3587            //  mov %src, %tmp_gpr1
3588            //  shr $1, %tmp_gpr1
3589            //  mov %src, %tmp_gpr2
3590            //  and $1, %tmp_gpr2
3591            //  or %tmp_gpr1, %tmp_gpr2
3592            //  cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst
3593            //  addsd/addss %dst, %dst
3594            //
3595            //  done:
3596
3597            assert_ne!(src, tmp_gpr1.to_reg());
3598            assert_ne!(src, tmp_gpr2.to_reg());
3599
3600            let handle_negative = sink.get_label();
3601            let done = sink.get_label();
3602
3603            // If x seen as a signed int64 is not negative, a signed-conversion will do the right
3604            // thing.
3605            // TODO use tst src, src here.
3606            let inst = Inst::cmp_rmi_r(OperandSize::Size64, src, RegMemImm::imm(0));
3607            inst.emit(sink, info, state);
3608
3609            one_way_jmp(sink, CC::L, handle_negative);
3610
3611            // Handle a positive int64, which is the "easy" case: a signed conversion will do the
3612            // right thing.
3613            emit_signed_cvt(
3614                sink,
3615                info,
3616                state,
3617                src,
3618                dst,
3619                *dst_size == OperandSize::Size64,
3620            );
3621
3622            let inst = Inst::jmp_known(done);
3623            inst.emit(sink, info, state);
3624
3625            sink.bind_label(handle_negative, state.ctrl_plane_mut());
3626
3627            // Divide x by two to get it in range for the signed conversion, keep the LSB, and
3628            // scale it back up on the FP side.
3629            let inst = Inst::gen_move(tmp_gpr1, src, types::I64);
3630            inst.emit(sink, info, state);
3631
3632            // tmp_gpr1 := src >> 1
3633            let inst = Inst::shift_r(
3634                OperandSize::Size64,
3635                ShiftKind::ShiftRightLogical,
3636                Imm8Gpr::unwrap_new(Imm8Reg::Imm8 { imm: 1 }),
3637                tmp_gpr1.to_reg(),
3638                tmp_gpr1,
3639            );
3640            inst.emit(sink, info, state);
3641
3642            let inst = Inst::gen_move(tmp_gpr2, src, types::I64);
3643            inst.emit(sink, info, state);
3644
3645            let inst = Inst::alu_rmi_r(
3646                OperandSize::Size64,
3647                AluRmiROpcode::And,
3648                RegMemImm::imm(1),
3649                tmp_gpr2,
3650            );
3651            inst.emit(sink, info, state);
3652
3653            let inst = Inst::alu_rmi_r(
3654                OperandSize::Size64,
3655                AluRmiROpcode::Or,
3656                RegMemImm::reg(tmp_gpr1.to_reg()),
3657                tmp_gpr2,
3658            );
3659            inst.emit(sink, info, state);
3660
3661            emit_signed_cvt(
3662                sink,
3663                info,
3664                state,
3665                tmp_gpr2.to_reg(),
3666                dst,
3667                *dst_size == OperandSize::Size64,
3668            );
3669
3670            let add_op = if *dst_size == OperandSize::Size64 {
3671                SseOpcode::Addsd
3672            } else {
3673                SseOpcode::Addss
3674            };
3675            let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), dst);
3676            inst.emit(sink, info, state);
3677
3678            sink.bind_label(done, state.ctrl_plane_mut());
3679        }
3680
3681        Inst::CvtFloatToSintSeq {
3682            src_size,
3683            dst_size,
3684            is_saturating,
3685            src,
3686            dst,
3687            tmp_gpr,
3688            tmp_xmm,
3689        } => {
3690            let src = src.to_reg();
3691            let dst = dst.to_writable_reg();
3692            let tmp_gpr = tmp_gpr.to_writable_reg();
3693            let tmp_xmm = tmp_xmm.to_writable_reg();
3694
3695            // Emits the following common sequence:
3696            //
3697            // cvttss2si/cvttsd2si %src, %dst
3698            // cmp %dst, 1
3699            // jno done
3700            //
3701            // Then, for saturating conversions:
3702            //
3703            // ;; check for NaN
3704            // cmpss/cmpsd %src, %src
3705            // jnp not_nan
3706            // xor %dst, %dst
3707            //
3708            // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is
3709            // ;; already in %dst.
3710            // xorpd %tmp_xmm, %tmp_xmm
3711            // cmpss/cmpsd %src, %tmp_xmm
3712            // jnb done
3713            // mov/movaps $INT_MAX, %dst
3714            //
3715            // done:
3716            //
3717            // Then, for non-saturating conversions:
3718            //
3719            // ;; check for NaN
3720            // cmpss/cmpsd %src, %src
3721            // jnp not_nan
3722            // ud2 trap BadConversionToInteger
3723            //
3724            // ;; check if INT_MIN was the correct result, against a magic constant:
3725            // not_nan:
3726            // movaps/mov $magic, %tmp_gpr
3727            // movq/movd %tmp_gpr, %tmp_xmm
3728            // cmpss/cmpsd %tmp_xmm, %src
3729            // jnb/jnbe $check_positive
3730            // ud2 trap IntegerOverflow
3731            //
3732            // ;; if positive, it was a real overflow
3733            // check_positive:
3734            // xorpd %tmp_xmm, %tmp_xmm
3735            // cmpss/cmpsd %src, %tmp_xmm
3736            // jnb done
3737            // ud2 trap IntegerOverflow
3738            //
3739            // done:
3740
3741            let (cast_op, cmp_op, trunc_op) = match src_size {
3742                OperandSize::Size64 => (SseOpcode::Movq, SseOpcode::Ucomisd, SseOpcode::Cvttsd2si),
3743                OperandSize::Size32 => (SseOpcode::Movd, SseOpcode::Ucomiss, SseOpcode::Cvttss2si),
3744                _ => unreachable!(),
3745            };
3746
3747            let done = sink.get_label();
3748
3749            // The truncation.
3750            let inst = Inst::xmm_to_gpr(trunc_op, src, dst, *dst_size);
3751            inst.emit(sink, info, state);
3752
3753            // Compare against 1, in case of overflow the dst operand was INT_MIN.
3754            let inst = Inst::cmp_rmi_r(*dst_size, dst.to_reg(), RegMemImm::imm(1));
3755            inst.emit(sink, info, state);
3756
3757            one_way_jmp(sink, CC::NO, done); // no overflow => done
3758
3759            // Check for NaN.
3760
3761            let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(src));
3762            inst.emit(sink, info, state);
3763
3764            if *is_saturating {
3765                let not_nan = sink.get_label();
3766                one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
3767
3768                // For NaN, emit 0.
3769                let inst = Inst::alu_rmi_r(
3770                    *dst_size,
3771                    AluRmiROpcode::Xor,
3772                    RegMemImm::reg(dst.to_reg()),
3773                    dst,
3774                );
3775                inst.emit(sink, info, state);
3776
3777                let inst = Inst::jmp_known(done);
3778                inst.emit(sink, info, state);
3779
3780                sink.bind_label(not_nan, state.ctrl_plane_mut());
3781
3782                // If the input was positive, saturate to INT_MAX.
3783
3784                // Zero out tmp_xmm.
3785                let inst = Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), tmp_xmm);
3786                inst.emit(sink, info, state);
3787
3788                let inst = Inst::xmm_cmp_rm_r(cmp_op, tmp_xmm.to_reg(), RegMem::reg(src));
3789                inst.emit(sink, info, state);
3790
3791                // Jump if >= to done.
3792                one_way_jmp(sink, CC::NB, done);
3793
3794                // Otherwise, put INT_MAX.
3795                if *dst_size == OperandSize::Size64 {
3796                    let inst = Inst::imm(OperandSize::Size64, 0x7fffffffffffffff, dst);
3797                    inst.emit(sink, info, state);
3798                } else {
3799                    let inst = Inst::imm(OperandSize::Size32, 0x7fffffff, dst);
3800                    inst.emit(sink, info, state);
3801                }
3802            } else {
3803                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
3804                inst.emit(sink, info, state);
3805
3806                // Check if INT_MIN was the correct result: determine the smallest floating point
3807                // number that would convert to INT_MIN, put it in a temporary register, and compare
3808                // against the src register.
3809                // If the src register is less (or in some cases, less-or-equal) than the threshold,
3810                // trap!
3811
3812                let mut no_overflow_cc = CC::NB; // >=
3813                let output_bits = dst_size.to_bits();
3814                match *src_size {
3815                    OperandSize::Size32 => {
3816                        let cst = (-Ieee32::pow2(output_bits - 1)).bits();
3817                        let inst = Inst::imm(OperandSize::Size32, cst as u64, tmp_gpr);
3818                        inst.emit(sink, info, state);
3819                    }
3820                    OperandSize::Size64 => {
3821                        // An f64 can represent `i32::min_value() - 1` exactly with precision to spare,
3822                        // so there are values less than -2^(N-1) that convert correctly to INT_MIN.
3823                        let cst = if output_bits < 64 {
3824                            no_overflow_cc = CC::NBE; // >
3825                            Ieee64::fcvt_to_sint_negative_overflow(output_bits)
3826                        } else {
3827                            -Ieee64::pow2(output_bits - 1)
3828                        };
3829                        let inst = Inst::imm(OperandSize::Size64, cst.bits(), tmp_gpr);
3830                        inst.emit(sink, info, state);
3831                    }
3832                    _ => unreachable!(),
3833                }
3834
3835                let inst =
3836                    Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, tmp_xmm);
3837                inst.emit(sink, info, state);
3838
3839                let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(tmp_xmm.to_reg()));
3840                inst.emit(sink, info, state);
3841
3842                // no trap if src >= or > threshold
3843                let inst = Inst::trap_if(no_overflow_cc.invert(), TrapCode::INTEGER_OVERFLOW);
3844                inst.emit(sink, info, state);
3845
3846                // If positive, it was a real overflow.
3847
3848                // Zero out the tmp_xmm register.
3849                let inst = Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), tmp_xmm);
3850                inst.emit(sink, info, state);
3851
3852                let inst = Inst::xmm_cmp_rm_r(cmp_op, tmp_xmm.to_reg(), RegMem::reg(src));
3853                inst.emit(sink, info, state);
3854
3855                // no trap if 0 >= src
3856                let inst = Inst::trap_if(CC::B, TrapCode::INTEGER_OVERFLOW);
3857                inst.emit(sink, info, state);
3858            }
3859
3860            sink.bind_label(done, state.ctrl_plane_mut());
3861        }
3862
3863        Inst::CvtFloatToUintSeq {
3864            src_size,
3865            dst_size,
3866            is_saturating,
3867            src,
3868            dst,
3869            tmp_gpr,
3870            tmp_xmm,
3871            tmp_xmm2,
3872        } => {
3873            let src = src.to_reg();
3874            let dst = dst.to_writable_reg();
3875            let tmp_gpr = tmp_gpr.to_writable_reg();
3876            let tmp_xmm = tmp_xmm.to_writable_reg();
3877            let tmp_xmm2 = tmp_xmm2.to_writable_reg();
3878
3879            // The only difference in behavior between saturating and non-saturating is how we
3880            // handle errors. Emits the following sequence:
3881            //
3882            // movaps/mov 2**(int_width - 1), %tmp_gpr
3883            // movq/movd %tmp_gpr, %tmp_xmm
3884            // cmpss/cmpsd %tmp_xmm, %src
3885            // jnb is_large
3886            //
3887            // ;; check for NaN inputs
3888            // jnp not_nan
3889            // -- non-saturating: ud2 trap BadConversionToInteger
3890            // -- saturating: xor %dst, %dst; j done
3891            //
3892            // not_nan:
3893            // cvttss2si/cvttsd2si %src, %dst
3894            // cmp 0, %dst
3895            // jnl done
3896            // -- non-saturating: ud2 trap IntegerOverflow
3897            // -- saturating: xor %dst, %dst; j done
3898            //
3899            // is_large:
3900            // mov %src, %tmp_xmm2
3901            // subss/subsd %tmp_xmm, %tmp_xmm2
3902            // cvttss2si/cvttss2sd %tmp_x, %dst
3903            // cmp 0, %dst
3904            // jnl next_is_large
3905            // -- non-saturating: ud2 trap IntegerOverflow
3906            // -- saturating: movaps $UINT_MAX, %dst; j done
3907            //
3908            // next_is_large:
3909            // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers
3910            //
3911            // done:
3912
3913            assert_ne!(tmp_xmm.to_reg(), src, "tmp_xmm clobbers src!");
3914
3915            let (sub_op, cast_op, cmp_op, trunc_op) = match src_size {
3916                OperandSize::Size32 => (
3917                    SseOpcode::Subss,
3918                    SseOpcode::Movd,
3919                    SseOpcode::Ucomiss,
3920                    SseOpcode::Cvttss2si,
3921                ),
3922                OperandSize::Size64 => (
3923                    SseOpcode::Subsd,
3924                    SseOpcode::Movq,
3925                    SseOpcode::Ucomisd,
3926                    SseOpcode::Cvttsd2si,
3927                ),
3928                _ => unreachable!(),
3929            };
3930
3931            let done = sink.get_label();
3932
3933            let cst = match src_size {
3934                OperandSize::Size32 => Ieee32::pow2(dst_size.to_bits() - 1).bits() as u64,
3935                OperandSize::Size64 => Ieee64::pow2(dst_size.to_bits() - 1).bits(),
3936                _ => unreachable!(),
3937            };
3938
3939            let inst = Inst::imm(*src_size, cst, tmp_gpr);
3940            inst.emit(sink, info, state);
3941
3942            let inst = Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, tmp_xmm);
3943            inst.emit(sink, info, state);
3944
3945            let inst = Inst::xmm_cmp_rm_r(cmp_op, src, RegMem::reg(tmp_xmm.to_reg()));
3946            inst.emit(sink, info, state);
3947
3948            let handle_large = sink.get_label();
3949            one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
3950
3951            if *is_saturating {
3952                // If not NaN jump over this 0-return, otherwise return 0
3953                let not_nan = sink.get_label();
3954                one_way_jmp(sink, CC::NP, not_nan);
3955                let inst = Inst::alu_rmi_r(
3956                    *dst_size,
3957                    AluRmiROpcode::Xor,
3958                    RegMemImm::reg(dst.to_reg()),
3959                    dst,
3960                );
3961                inst.emit(sink, info, state);
3962
3963                let inst = Inst::jmp_known(done);
3964                inst.emit(sink, info, state);
3965                sink.bind_label(not_nan, state.ctrl_plane_mut());
3966            } else {
3967                // Trap.
3968                let inst = Inst::trap_if(CC::P, TrapCode::BAD_CONVERSION_TO_INTEGER);
3969                inst.emit(sink, info, state);
3970            }
3971
3972            // Actual truncation for small inputs: if the result is not positive, then we had an
3973            // overflow.
3974
3975            let inst = Inst::xmm_to_gpr(trunc_op, src, dst, *dst_size);
3976            inst.emit(sink, info, state);
3977
3978            let inst = Inst::cmp_rmi_r(*dst_size, dst.to_reg(), RegMemImm::imm(0));
3979            inst.emit(sink, info, state);
3980
3981            one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
3982
3983            if *is_saturating {
3984                // The input was "small" (< 2**(width -1)), so the only way to get an integer
3985                // overflow is because the input was too small: saturate to the min value, i.e. 0.
3986                let inst = Inst::alu_rmi_r(
3987                    *dst_size,
3988                    AluRmiROpcode::Xor,
3989                    RegMemImm::reg(dst.to_reg()),
3990                    dst,
3991                );
3992                inst.emit(sink, info, state);
3993
3994                let inst = Inst::jmp_known(done);
3995                inst.emit(sink, info, state);
3996            } else {
3997                // Trap.
3998                let inst = Inst::trap(TrapCode::INTEGER_OVERFLOW);
3999                inst.emit(sink, info, state);
4000            }
4001
4002            // Now handle large inputs.
4003
4004            sink.bind_label(handle_large, state.ctrl_plane_mut());
4005
4006            let inst = Inst::gen_move(tmp_xmm2, src, types::F64);
4007            inst.emit(sink, info, state);
4008
4009            let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), tmp_xmm2);
4010            inst.emit(sink, info, state);
4011
4012            let inst = Inst::xmm_to_gpr(trunc_op, tmp_xmm2.to_reg(), dst, *dst_size);
4013            inst.emit(sink, info, state);
4014
4015            let inst = Inst::cmp_rmi_r(*dst_size, dst.to_reg(), RegMemImm::imm(0));
4016            inst.emit(sink, info, state);
4017
4018            if *is_saturating {
4019                let next_is_large = sink.get_label();
4020                one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
4021
4022                // The input was "large" (>= 2**(width -1)), so the only way to get an integer
4023                // overflow is because the input was too large: saturate to the max value.
4024                let inst = Inst::imm(
4025                    OperandSize::Size64,
4026                    if *dst_size == OperandSize::Size64 {
4027                        u64::max_value()
4028                    } else {
4029                        u32::max_value() as u64
4030                    },
4031                    dst,
4032                );
4033                inst.emit(sink, info, state);
4034
4035                let inst = Inst::jmp_known(done);
4036                inst.emit(sink, info, state);
4037                sink.bind_label(next_is_large, state.ctrl_plane_mut());
4038            } else {
4039                let inst = Inst::trap_if(CC::L, TrapCode::INTEGER_OVERFLOW);
4040                inst.emit(sink, info, state);
4041            }
4042
4043            if *dst_size == OperandSize::Size64 {
4044                let inst = Inst::imm(OperandSize::Size64, 1 << 63, tmp_gpr);
4045                inst.emit(sink, info, state);
4046
4047                let inst = Inst::alu_rmi_r(
4048                    OperandSize::Size64,
4049                    AluRmiROpcode::Add,
4050                    RegMemImm::reg(tmp_gpr.to_reg()),
4051                    dst,
4052                );
4053                inst.emit(sink, info, state);
4054            } else {
4055                let inst = Inst::alu_rmi_r(
4056                    OperandSize::Size32,
4057                    AluRmiROpcode::Add,
4058                    RegMemImm::imm(1 << 31),
4059                    dst,
4060                );
4061                inst.emit(sink, info, state);
4062            }
4063
4064            sink.bind_label(done, state.ctrl_plane_mut());
4065        }
4066
4067        Inst::LoadExtName {
4068            dst,
4069            name,
4070            offset,
4071            distance,
4072        } => {
4073            let dst = dst.to_reg();
4074
4075            if info.flags.is_pic() {
4076                // Generates: movq symbol@GOTPCREL(%rip), %dst
4077                let enc_dst = int_reg_enc(dst);
4078                sink.put1(0x48 | ((enc_dst >> 3) & 1) << 2);
4079                sink.put1(0x8B);
4080                sink.put1(0x05 | ((enc_dst & 7) << 3));
4081                emit_reloc(sink, Reloc::X86GOTPCRel4, name, -4);
4082                sink.put4(0);
4083                // Offset in the relocation above applies to the address of the *GOT entry*, not
4084                // the loaded address; so we emit a separate add or sub instruction if needed.
4085                if *offset < 0 {
4086                    assert!(*offset >= -i32::MAX as i64);
4087                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
4088                    sink.put1(0x81);
4089                    sink.put1(0xe8 | (enc_dst & 7));
4090                    sink.put4((-*offset) as u32);
4091                } else if *offset > 0 {
4092                    assert!(*offset <= i32::MAX as i64);
4093                    sink.put1(0x48 | ((enc_dst >> 3) & 1));
4094                    sink.put1(0x81);
4095                    sink.put1(0xc0 | (enc_dst & 7));
4096                    sink.put4(*offset as u32);
4097                }
4098            } else if distance == &RelocDistance::Near {
4099                // If we know the distance to the name is within 2GB (e.g., a module-local function),
4100                // we can generate a RIP-relative address, with a relocation.
4101                // Generates: lea $name(%rip), $dst
4102                let enc_dst = int_reg_enc(dst);
4103                sink.put1(0x48 | ((enc_dst >> 3) & 1) << 2);
4104                sink.put1(0x8D);
4105                sink.put1(0x05 | ((enc_dst & 7) << 3));
4106                emit_reloc(sink, Reloc::X86CallPCRel4, name, -4);
4107                sink.put4(0);
4108            } else {
4109                // The full address can be encoded in the register, with a relocation.
4110                // Generates: movabsq $name, %dst
4111                let enc_dst = int_reg_enc(dst);
4112                sink.put1(0x48 | ((enc_dst >> 3) & 1));
4113                sink.put1(0xB8 | (enc_dst & 7));
4114                emit_reloc(sink, Reloc::Abs8, name, *offset);
4115                sink.put8(0);
4116            }
4117        }
4118
4119        Inst::LockCmpxchg {
4120            ty,
4121            replacement,
4122            expected,
4123            mem,
4124            dst_old,
4125        } => {
4126            let replacement = *replacement;
4127            let expected = *expected;
4128            let dst_old = dst_old.to_reg();
4129            let mem = mem.clone();
4130
4131            debug_assert_eq!(expected, regs::rax());
4132            debug_assert_eq!(dst_old, regs::rax());
4133
4134            // lock cmpxchg{b,w,l,q} %replacement, (mem)
4135            // Note that 0xF0 is the Lock prefix.
4136            let (prefix, opcodes) = match *ty {
4137                types::I8 => (LegacyPrefixes::_F0, 0x0FB0),
4138                types::I16 => (LegacyPrefixes::_66F0, 0x0FB1),
4139                types::I32 => (LegacyPrefixes::_F0, 0x0FB1),
4140                types::I64 => (LegacyPrefixes::_F0, 0x0FB1),
4141                _ => unreachable!(),
4142            };
4143            let rex = RexFlags::from((OperandSize::from_ty(*ty), replacement));
4144            let amode = mem.finalize(state.frame_layout(), sink);
4145            emit_std_reg_mem(sink, prefix, opcodes, 2, replacement, &amode, rex, 0);
4146        }
4147
4148        Inst::LockCmpxchg16b {
4149            replacement_low,
4150            replacement_high,
4151            expected_low,
4152            expected_high,
4153            mem,
4154            dst_old_low,
4155            dst_old_high,
4156        } => {
4157            let mem = mem.clone();
4158            debug_assert_eq!(*replacement_low, regs::rbx());
4159            debug_assert_eq!(*replacement_high, regs::rcx());
4160            debug_assert_eq!(*expected_low, regs::rax());
4161            debug_assert_eq!(*expected_high, regs::rdx());
4162            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
4163            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
4164
4165            let amode = mem.finalize(state.frame_layout(), sink);
4166            // lock cmpxchg16b (mem)
4167            // Note that 0xF0 is the Lock prefix.
4168            emit_std_enc_mem(
4169                sink,
4170                LegacyPrefixes::_F0,
4171                0x0FC7,
4172                2,
4173                1,
4174                &amode,
4175                RexFlags::set_w(),
4176                0,
4177            );
4178        }
4179
4180        Inst::LockXadd {
4181            size,
4182            operand,
4183            mem,
4184            dst_old,
4185        } => {
4186            debug_assert_eq!(dst_old.to_reg(), *operand);
4187            // lock xadd{b,w,l,q} %operand, (mem)
4188            // Note that 0xF0 is the Lock prefix.
4189            let (prefix, opcodes) = match size {
4190                OperandSize::Size8 => (LegacyPrefixes::_F0, 0x0FC0),
4191                OperandSize::Size16 => (LegacyPrefixes::_66F0, 0x0FC1),
4192                OperandSize::Size32 => (LegacyPrefixes::_F0, 0x0FC1),
4193                OperandSize::Size64 => (LegacyPrefixes::_F0, 0x0FC1),
4194            };
4195            let rex = RexFlags::from((*size, *operand));
4196            let amode = mem.finalize(state.frame_layout(), sink);
4197            emit_std_reg_mem(sink, prefix, opcodes, 2, *operand, &amode, rex, 0);
4198        }
4199
4200        Inst::Xchg {
4201            size,
4202            operand,
4203            mem,
4204            dst_old,
4205        } => {
4206            debug_assert_eq!(dst_old.to_reg(), *operand);
4207            // xchg{b,w,l,q} %operand, (mem)
4208            let (prefix, opcodes) = match size {
4209                OperandSize::Size8 => (LegacyPrefixes::None, 0x86),
4210                OperandSize::Size16 => (LegacyPrefixes::_66, 0x87),
4211                OperandSize::Size32 => (LegacyPrefixes::None, 0x87),
4212                OperandSize::Size64 => (LegacyPrefixes::None, 0x87),
4213            };
4214            let rex = RexFlags::from((*size, *operand));
4215            let amode = mem.finalize(state.frame_layout(), sink);
4216            emit_std_reg_mem(sink, prefix, opcodes, 1, *operand, &amode, rex, 0);
4217        }
4218
4219        Inst::AtomicRmwSeq {
4220            ty,
4221            op,
4222            mem,
4223            operand,
4224            temp,
4225            dst_old,
4226        } => {
4227            let operand = *operand;
4228            let temp = *temp;
4229            let dst_old = *dst_old;
4230            debug_assert_eq!(dst_old.to_reg(), regs::rax());
4231            let mem = mem.finalize(state.frame_layout(), sink).clone();
4232
4233            // Emit this:
4234            //    mov{zbq,zwq,zlq,q}     (%r_address), %rax    // rax = old value
4235            //  again:
4236            //    movq                   %rax, %r_temp         // rax = old value, r_temp = old value
4237            //    `op`q                  %r_operand, %r_temp   // rax = old value, r_temp = new value
4238            //    lock cmpxchg{b,w,l,q}  %r_temp, (%r_address) // try to store new value
4239            //    jnz again // If this is taken, rax will have a "revised" old value
4240            //
4241            // Operand conventions: IN:  %r_address, %r_operand OUT: %rax (old
4242            //    value), %r_temp (trashed), %rflags (trashed)
4243            let again_label = sink.get_label();
4244
4245            // mov{zbq,zwq,zlq,q} (%r_address), %rax
4246            // No need to call `add_trap` here, since the `i1` emit will do that.
4247            let i1 = Inst::load(*ty, mem.clone(), dst_old, ExtKind::ZeroExtend);
4248            i1.emit(sink, info, state);
4249
4250            // again:
4251            sink.bind_label(again_label, state.ctrl_plane_mut());
4252
4253            // movq %rax, %r_temp
4254            let i2 = Inst::mov_r_r(OperandSize::Size64, dst_old.to_reg(), temp);
4255            i2.emit(sink, info, state);
4256
4257            let operand_rmi = RegMemImm::reg(operand);
4258            use AtomicRmwSeqOp as RmwOp;
4259            match op {
4260                RmwOp::Nand => {
4261                    // andq %r_operand, %r_temp
4262                    let i3 =
4263                        Inst::alu_rmi_r(OperandSize::Size64, AluRmiROpcode::And, operand_rmi, temp);
4264                    i3.emit(sink, info, state);
4265
4266                    // notq %r_temp
4267                    let i4 = Inst::not(OperandSize::Size64, temp);
4268                    i4.emit(sink, info, state);
4269                }
4270                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
4271                    // cmp %r_temp, %r_operand
4272                    let i3 = Inst::cmp_rmi_r(
4273                        OperandSize::from_ty(*ty),
4274                        operand,
4275                        RegMemImm::reg(temp.to_reg()),
4276                    );
4277                    i3.emit(sink, info, state);
4278
4279                    // cmovcc %r_operand, %r_temp
4280                    let cc = match op {
4281                        RmwOp::Umin => CC::BE,
4282                        RmwOp::Umax => CC::NB,
4283                        RmwOp::Smin => CC::LE,
4284                        RmwOp::Smax => CC::NL,
4285                        _ => unreachable!(),
4286                    };
4287                    let i4 = Inst::cmove(OperandSize::Size64, cc, RegMem::reg(operand), temp);
4288                    i4.emit(sink, info, state);
4289                }
4290                RmwOp::And | RmwOp::Or | RmwOp::Xor => {
4291                    // opq %r_operand, %r_temp
4292                    let alu_op = match op {
4293                        RmwOp::And => AluRmiROpcode::And,
4294                        RmwOp::Or => AluRmiROpcode::Or,
4295                        RmwOp::Xor => AluRmiROpcode::Xor,
4296                        _ => unreachable!(),
4297                    };
4298                    let i3 = Inst::alu_rmi_r(OperandSize::Size64, alu_op, operand_rmi, temp);
4299                    i3.emit(sink, info, state);
4300                }
4301            }
4302
4303            // lock cmpxchg{b,w,l,q} %r_temp, (%r_address)
4304            // No need to call `add_trap` here, since the `i4` emit will do that.
4305            let i4 = Inst::LockCmpxchg {
4306                ty: *ty,
4307                replacement: temp.to_reg(),
4308                expected: dst_old.to_reg(),
4309                mem: mem.into(),
4310                dst_old,
4311            };
4312            i4.emit(sink, info, state);
4313
4314            // jnz again
4315            one_way_jmp(sink, CC::NZ, again_label);
4316        }
4317
4318        Inst::Atomic128RmwSeq {
4319            op,
4320            mem,
4321            operand_low,
4322            operand_high,
4323            temp_low,
4324            temp_high,
4325            dst_old_low,
4326            dst_old_high,
4327        } => {
4328            let operand_low = *operand_low;
4329            let operand_high = *operand_high;
4330            let temp_low = *temp_low;
4331            let temp_high = *temp_high;
4332            let dst_old_low = *dst_old_low;
4333            let dst_old_high = *dst_old_high;
4334            debug_assert_eq!(temp_low.to_reg(), regs::rbx());
4335            debug_assert_eq!(temp_high.to_reg(), regs::rcx());
4336            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
4337            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
4338            let mem = mem.finalize(state.frame_layout(), sink).clone();
4339
4340            let again_label = sink.get_label();
4341
4342            // Load the initial value.
4343            Inst::load(types::I64, mem.clone(), dst_old_low, ExtKind::ZeroExtend)
4344                .emit(sink, info, state);
4345            Inst::load(types::I64, mem.offset(8), dst_old_high, ExtKind::ZeroExtend)
4346                .emit(sink, info, state);
4347
4348            // again:
4349            sink.bind_label(again_label, state.ctrl_plane_mut());
4350
4351            // Move old value to temp registers.
4352            Inst::mov_r_r(OperandSize::Size64, dst_old_low.to_reg(), temp_low)
4353                .emit(sink, info, state);
4354            Inst::mov_r_r(OperandSize::Size64, dst_old_high.to_reg(), temp_high)
4355                .emit(sink, info, state);
4356
4357            // Perform the operation.
4358            let operand_low_rmi = RegMemImm::reg(operand_low);
4359            let operand_high_rmi = RegMemImm::reg(operand_high);
4360            use Atomic128RmwSeqOp as RmwOp;
4361            match op {
4362                RmwOp::Nand => {
4363                    // temp &= operand
4364                    Inst::alu_rmi_r(
4365                        OperandSize::Size64,
4366                        AluRmiROpcode::And,
4367                        operand_low_rmi,
4368                        temp_low,
4369                    )
4370                    .emit(sink, info, state);
4371                    Inst::alu_rmi_r(
4372                        OperandSize::Size64,
4373                        AluRmiROpcode::And,
4374                        operand_high_rmi,
4375                        temp_high,
4376                    )
4377                    .emit(sink, info, state);
4378
4379                    // temp = !temp
4380                    Inst::not(OperandSize::Size64, temp_low).emit(sink, info, state);
4381                    Inst::not(OperandSize::Size64, temp_high).emit(sink, info, state);
4382                }
4383                RmwOp::Umin | RmwOp::Umax | RmwOp::Smin | RmwOp::Smax => {
4384                    // Do a comparison with LHS temp and RHS operand.
4385                    // `cmp_rmi_r` and `alu_rmi_r` have opposite argument orders.
4386                    Inst::cmp_rmi_r(OperandSize::Size64, temp_low.to_reg(), operand_low_rmi)
4387                        .emit(sink, info, state);
4388                    // This will clobber `temp_high`
4389                    Inst::alu_rmi_r(
4390                        OperandSize::Size64,
4391                        AluRmiROpcode::Sbb,
4392                        operand_high_rmi,
4393                        temp_high,
4394                    )
4395                    .emit(sink, info, state);
4396                    // Restore the clobbered value
4397                    Inst::mov_r_r(OperandSize::Size64, dst_old_high.to_reg(), temp_high)
4398                        .emit(sink, info, state);
4399                    let cc = match op {
4400                        RmwOp::Umin => CC::NB,
4401                        RmwOp::Umax => CC::B,
4402                        RmwOp::Smin => CC::NL,
4403                        RmwOp::Smax => CC::L,
4404                        _ => unreachable!(),
4405                    };
4406                    Inst::cmove(OperandSize::Size64, cc, operand_low.into(), temp_low)
4407                        .emit(sink, info, state);
4408                    Inst::cmove(OperandSize::Size64, cc, operand_high.into(), temp_high)
4409                        .emit(sink, info, state);
4410                }
4411                RmwOp::Add | RmwOp::Sub | RmwOp::And | RmwOp::Or | RmwOp::Xor => {
4412                    // temp op= operand
4413                    let (op_low, op_high) = match op {
4414                        RmwOp::Add => (AluRmiROpcode::Add, AluRmiROpcode::Adc),
4415                        RmwOp::Sub => (AluRmiROpcode::Sub, AluRmiROpcode::Sbb),
4416                        RmwOp::And => (AluRmiROpcode::And, AluRmiROpcode::And),
4417                        RmwOp::Or => (AluRmiROpcode::Or, AluRmiROpcode::Or),
4418                        RmwOp::Xor => (AluRmiROpcode::Xor, AluRmiROpcode::Xor),
4419                        _ => unreachable!(),
4420                    };
4421                    Inst::alu_rmi_r(OperandSize::Size64, op_low, operand_low_rmi, temp_low)
4422                        .emit(sink, info, state);
4423                    Inst::alu_rmi_r(OperandSize::Size64, op_high, operand_high_rmi, temp_high)
4424                        .emit(sink, info, state);
4425                }
4426            }
4427
4428            // cmpxchg16b (mem)
4429            Inst::LockCmpxchg16b {
4430                replacement_low: temp_low.to_reg(),
4431                replacement_high: temp_high.to_reg(),
4432                expected_low: dst_old_low.to_reg(),
4433                expected_high: dst_old_high.to_reg(),
4434                mem: Box::new(mem.into()),
4435                dst_old_low,
4436                dst_old_high,
4437            }
4438            .emit(sink, info, state);
4439
4440            // jnz again
4441            one_way_jmp(sink, CC::NZ, again_label);
4442        }
4443
4444        Inst::Atomic128XchgSeq {
4445            mem,
4446            operand_low,
4447            operand_high,
4448            dst_old_low,
4449            dst_old_high,
4450        } => {
4451            let operand_low = *operand_low;
4452            let operand_high = *operand_high;
4453            let dst_old_low = *dst_old_low;
4454            let dst_old_high = *dst_old_high;
4455            debug_assert_eq!(operand_low, regs::rbx());
4456            debug_assert_eq!(operand_high, regs::rcx());
4457            debug_assert_eq!(dst_old_low.to_reg(), regs::rax());
4458            debug_assert_eq!(dst_old_high.to_reg(), regs::rdx());
4459            let mem = mem.finalize(state.frame_layout(), sink).clone();
4460
4461            let again_label = sink.get_label();
4462
4463            // Load the initial value.
4464            Inst::load(types::I64, mem.clone(), dst_old_low, ExtKind::ZeroExtend)
4465                .emit(sink, info, state);
4466            Inst::load(types::I64, mem.offset(8), dst_old_high, ExtKind::ZeroExtend)
4467                .emit(sink, info, state);
4468
4469            // again:
4470            sink.bind_label(again_label, state.ctrl_plane_mut());
4471
4472            // cmpxchg16b (mem)
4473            Inst::LockCmpxchg16b {
4474                replacement_low: operand_low,
4475                replacement_high: operand_high,
4476                expected_low: dst_old_low.to_reg(),
4477                expected_high: dst_old_high.to_reg(),
4478                mem: Box::new(mem.into()),
4479                dst_old_low,
4480                dst_old_high,
4481            }
4482            .emit(sink, info, state);
4483
4484            // jnz again
4485            one_way_jmp(sink, CC::NZ, again_label);
4486        }
4487
4488        Inst::Fence { kind } => {
4489            sink.put1(0x0F);
4490            sink.put1(0xAE);
4491            match kind {
4492                FenceKind::MFence => sink.put1(0xF0), // mfence = 0F AE F0
4493                FenceKind::LFence => sink.put1(0xE8), // lfence = 0F AE E8
4494                FenceKind::SFence => sink.put1(0xF8), // sfence = 0F AE F8
4495            }
4496        }
4497
4498        Inst::Hlt => {
4499            sink.put1(0xcc);
4500        }
4501
4502        Inst::Ud2 { trap_code } => {
4503            sink.add_trap(*trap_code);
4504            sink.put_data(Inst::TRAP_OPCODE);
4505        }
4506
4507        Inst::Nop { len } => {
4508            // These encodings can all be found in Intel's architecture manual, at the NOP
4509            // instruction description.
4510            let mut len = *len;
4511            while len != 0 {
4512                let emitted = u8::min(len, 9);
4513                match emitted {
4514                    0 => {}
4515                    1 => sink.put1(0x90), // NOP
4516                    2 => {
4517                        // 66 NOP
4518                        sink.put1(0x66);
4519                        sink.put1(0x90);
4520                    }
4521                    3 => {
4522                        // NOP [EAX]
4523                        sink.put1(0x0F);
4524                        sink.put1(0x1F);
4525                        sink.put1(0x00);
4526                    }
4527                    4 => {
4528                        // NOP 0(EAX), with 0 a 1-byte immediate.
4529                        sink.put1(0x0F);
4530                        sink.put1(0x1F);
4531                        sink.put1(0x40);
4532                        sink.put1(0x00);
4533                    }
4534                    5 => {
4535                        // NOP [EAX, EAX, 1]
4536                        sink.put1(0x0F);
4537                        sink.put1(0x1F);
4538                        sink.put1(0x44);
4539                        sink.put1(0x00);
4540                        sink.put1(0x00);
4541                    }
4542                    6 => {
4543                        // 66 NOP [EAX, EAX, 1]
4544                        sink.put1(0x66);
4545                        sink.put1(0x0F);
4546                        sink.put1(0x1F);
4547                        sink.put1(0x44);
4548                        sink.put1(0x00);
4549                        sink.put1(0x00);
4550                    }
4551                    7 => {
4552                        // NOP 0[EAX], but 0 is a 4 bytes immediate.
4553                        sink.put1(0x0F);
4554                        sink.put1(0x1F);
4555                        sink.put1(0x80);
4556                        sink.put1(0x00);
4557                        sink.put1(0x00);
4558                        sink.put1(0x00);
4559                        sink.put1(0x00);
4560                    }
4561                    8 => {
4562                        // NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate.
4563                        sink.put1(0x0F);
4564                        sink.put1(0x1F);
4565                        sink.put1(0x84);
4566                        sink.put1(0x00);
4567                        sink.put1(0x00);
4568                        sink.put1(0x00);
4569                        sink.put1(0x00);
4570                        sink.put1(0x00);
4571                    }
4572                    9 => {
4573                        // 66 NOP 0[EAX, EAX, 1], with 0 a 4 bytes immediate.
4574                        sink.put1(0x66);
4575                        sink.put1(0x0F);
4576                        sink.put1(0x1F);
4577                        sink.put1(0x84);
4578                        sink.put1(0x00);
4579                        sink.put1(0x00);
4580                        sink.put1(0x00);
4581                        sink.put1(0x00);
4582                        sink.put1(0x00);
4583                    }
4584                    _ => unreachable!(),
4585                }
4586                len -= emitted;
4587            }
4588        }
4589
4590        Inst::ElfTlsGetAddr { symbol, dst } => {
4591            let dst = dst.to_reg().to_reg();
4592            debug_assert_eq!(dst, regs::rax());
4593
4594            // N.B.: Must be exactly this byte sequence; the linker requires it,
4595            // because it must know how to rewrite the bytes.
4596
4597            // data16 lea gv@tlsgd(%rip),%rdi
4598            sink.put1(0x66); // data16
4599            sink.put1(0b01001000); // REX.W
4600            sink.put1(0x8d); // LEA
4601            sink.put1(0x3d); // ModRM byte
4602            emit_reloc(sink, Reloc::ElfX86_64TlsGd, symbol, -4);
4603            sink.put4(0); // offset
4604
4605            // data16 data16 callq __tls_get_addr-4
4606            sink.put1(0x66); // data16
4607            sink.put1(0x66); // data16
4608            sink.put1(0b01001000); // REX.W
4609            sink.put1(0xe8); // CALL
4610            emit_reloc(
4611                sink,
4612                Reloc::X86CallPLTRel4,
4613                &ExternalName::LibCall(LibCall::ElfTlsGetAddr),
4614                -4,
4615            );
4616            sink.put4(0); // offset
4617        }
4618
4619        Inst::MachOTlsGetAddr { symbol, dst } => {
4620            let dst = dst.to_reg().to_reg();
4621            debug_assert_eq!(dst, regs::rax());
4622
4623            // movq gv@tlv(%rip), %rdi
4624            sink.put1(0x48); // REX.w
4625            sink.put1(0x8b); // MOV
4626            sink.put1(0x3d); // ModRM byte
4627            emit_reloc(sink, Reloc::MachOX86_64Tlv, symbol, -4);
4628            sink.put4(0); // offset
4629
4630            // callq *(%rdi)
4631            sink.put1(0xff);
4632            sink.put1(0x17);
4633        }
4634
4635        Inst::CoffTlsGetAddr { symbol, dst, tmp } => {
4636            let dst = dst.to_reg().to_reg();
4637            debug_assert_eq!(dst, regs::rax());
4638
4639            // tmp is used below directly as %rcx
4640            let tmp = tmp.to_reg().to_reg();
4641            debug_assert_eq!(tmp, regs::rcx());
4642
4643            // See: https://gcc.godbolt.org/z/M8or9x6ss
4644            // And: https://github.com/bjorn3/rustc_codegen_cranelift/issues/388#issuecomment-532930282
4645
4646            // Emit the following sequence
4647            // movl	(%rip), %eax          ; IMAGE_REL_AMD64_REL32	_tls_index
4648            // movq	%gs:88, %rcx
4649            // movq	(%rcx,%rax,8), %rax
4650            // leaq	(%rax), %rax          ; Reloc: IMAGE_REL_AMD64_SECREL	symbol
4651
4652            // Load TLS index for current thread
4653            // movl	(%rip), %eax
4654            sink.put1(0x8b); // mov
4655            sink.put1(0x05);
4656            emit_reloc(
4657                sink,
4658                Reloc::X86PCRel4,
4659                &ExternalName::KnownSymbol(KnownSymbol::CoffTlsIndex),
4660                -4,
4661            );
4662            sink.put4(0); // offset
4663
4664            // movq	%gs:88, %rcx
4665            // Load the TLS Storage Array pointer
4666            // The gs segment register refers to the base address of the TEB on x64.
4667            // 0x58 is the offset in the TEB for the ThreadLocalStoragePointer member on x64:
4668            sink.put_data(&[
4669                0x65, 0x48, // REX.W
4670                0x8b, // MOV
4671                0x0c, 0x25, 0x58, // 0x58 - ThreadLocalStoragePointer offset
4672                0x00, 0x00, 0x00,
4673            ]);
4674
4675            // movq	(%rcx,%rax,8), %rax
4676            // Load the actual TLS entry for this thread.
4677            // Computes ThreadLocalStoragePointer + _tls_index*8
4678            sink.put_data(&[0x48, 0x8b, 0x04, 0xc1]);
4679
4680            // leaq	(%rax), %rax
4681            sink.put1(0x48);
4682            sink.put1(0x8d);
4683            sink.put1(0x80);
4684            emit_reloc(sink, Reloc::X86SecRel, symbol, 0);
4685            sink.put4(0); // offset
4686        }
4687
4688        Inst::Unwind { inst } => {
4689            sink.add_unwind(inst.clone());
4690        }
4691
4692        Inst::DummyUse { .. } => {
4693            // Nothing.
4694        }
4695
4696        Inst::External { inst } => {
4697            let mut known_offsets = [0, 0];
4698            // These values are transcribed from what is happening in
4699            // `SyntheticAmode::finalize`. This, plus the `Into` logic
4700            // converting a `SyntheticAmode` to its external counterpart, are
4701            // necessary to communicate Cranelift's internal offsets to the
4702            // assembler; due to when Cranelift determines these offsets, this
4703            // happens quite late (i.e., here during emission).
4704            let frame = state.frame_layout();
4705            known_offsets[external::offsets::KEY_INCOMING_ARG] =
4706                i32::try_from(frame.tail_args_size + frame.setup_area_size).unwrap();
4707            known_offsets[external::offsets::KEY_SLOT_OFFSET] =
4708                i32::try_from(frame.outgoing_args_size).unwrap();
4709            inst.encode(sink, &known_offsets);
4710        }
4711    }
4712
4713    state.clear_post_insn();
4714}
4715
4716/// Emit the common sequence used for both direct and indirect tail calls:
4717///
4718/// * Copy the new frame's stack arguments over the top of our current frame.
4719///
4720/// * Restore the old frame pointer.
4721///
4722/// * Initialize the tail callee's stack pointer (simultaneously deallocating
4723///   the temporary stack space we allocated when creating the new frame's stack
4724///   arguments).
4725///
4726/// * Move the return address into its stack slot.
4727fn emit_return_call_common_sequence<T>(
4728    sink: &mut MachBuffer<Inst>,
4729    info: &EmitInfo,
4730    state: &mut EmitState,
4731    call_info: &ReturnCallInfo<T>,
4732) {
4733    assert!(
4734        info.flags.preserve_frame_pointers(),
4735        "frame pointers aren't fundamentally required for tail calls, \
4736                 but the current implementation relies on them being present"
4737    );
4738
4739    let tmp = call_info.tmp.to_writable_reg();
4740
4741    for inst in
4742        X64ABIMachineSpec::gen_clobber_restore(CallConv::Tail, &info.flags, state.frame_layout())
4743    {
4744        inst.emit(sink, info, state);
4745    }
4746
4747    for inst in X64ABIMachineSpec::gen_epilogue_frame_restore(
4748        CallConv::Tail,
4749        &info.flags,
4750        &info.isa_flags,
4751        state.frame_layout(),
4752    ) {
4753        inst.emit(sink, info, state);
4754    }
4755
4756    let incoming_args_diff = state.frame_layout().tail_args_size - call_info.new_stack_arg_size;
4757    if incoming_args_diff > 0 {
4758        // Move the saved return address up by `incoming_args_diff`
4759        Inst::mov64_m_r(Amode::imm_reg(0, regs::rsp()), tmp).emit(sink, info, state);
4760        Inst::mov_r_m(
4761            OperandSize::Size64,
4762            tmp.to_reg(),
4763            Amode::imm_reg(i32::try_from(incoming_args_diff).unwrap(), regs::rsp()),
4764        )
4765        .emit(sink, info, state);
4766
4767        // Increment the stack pointer to shrink the argument area for the new call.
4768        Inst::alu_rmi_r(
4769            OperandSize::Size64,
4770            AluRmiROpcode::Add,
4771            RegMemImm::imm(incoming_args_diff),
4772            Writable::from_reg(regs::rsp()),
4773        )
4774        .emit(sink, info, state);
4775    }
4776}