cranelift_assembler_x64/
fuzz.rs

1//! A fuzz testing oracle for roundtrip assembly-disassembly.
2//!
3//! This contains manual implementations of the `Arbitrary` trait for types
4//! throughout this crate to avoid depending on the `arbitrary` crate
5//! unconditionally (use the `fuzz` feature instead).
6
7use std::string::{String, ToString};
8use std::vec::Vec;
9use std::{format, println};
10
11use crate::{
12    AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,
13    KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,
14};
15use arbitrary::{Arbitrary, Result, Unstructured};
16use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};
17
18/// Take a random assembly instruction and check its encoding and
19/// pretty-printing against a known-good disassembler.
20///
21/// # Panics
22///
23/// This function panics to express failure as expected by the `arbitrary`
24/// fuzzer infrastructure. It may fail during assembly, disassembly, or when
25/// comparing the disassembled strings.
26pub fn roundtrip(inst: &Inst<FuzzRegs>) {
27    // Check that we can actually assemble this instruction.
28    let assembled = assemble(inst);
29    let expected = disassemble(&assembled, inst);
30
31    // Check that our pretty-printed output matches the known-good output. Trim
32    // off the instruction offset first.
33    let expected = expected.split_once(' ').unwrap().1;
34    let actual = inst.to_string();
35    if expected != actual && expected.trim() != fix_up(&actual) {
36        println!("> {inst}");
37        println!("  debug: {inst:x?}");
38        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
39        println!("  expected (capstone): {expected}");
40        println!("  actual (to_string):  {actual}");
41        assert_eq!(expected, &actual);
42    }
43}
44
45/// Use this assembler to emit machine code into a byte buffer.
46///
47/// This will skip any traps or label registrations, but this is fine for the
48/// single-instruction disassembly we're doing here.
49fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {
50    let mut sink = TestCodeSink::default();
51    inst.encode(&mut sink);
52    sink.patch_labels_as_if_they_referred_to_end();
53    sink.buf
54}
55
/// In-memory code sink used by the fuzz oracle.
#[derive(Default)]
struct TestCodeSink {
    /// Machine code emitted so far.
    buf: Vec<u8>,
    /// Byte offsets into `buf` at which a 4-byte label-relative field starts.
    offsets_using_label: Vec<usize>,
}

impl TestCodeSink {
    /// Rewrite every recorded label-relative field as if its label were bound
    /// to the end of `buf`.
    ///
    /// A reference to a label (e.g. RIP-relative addressing) is emitted with
    /// an adjustment already folded in: the distance from the 4-byte field to
    /// the end of the instruction, which is the point the displacement is
    /// relative to. Since we pretend every label lives at the end of `buf`,
    /// the number of bytes between the end of the field and the end of `buf`
    /// is added to whatever value is already stored in the field.
    ///
    /// This effectively undoes the `bytes_at_end` adjustment that is part of
    /// `Amode::RipRelative` addressing.
    ///
    /// # Panics
    ///
    /// Panics if a recorded offset does not leave four bytes in `buf`, or if
    /// a length or offset does not fit in an `i32`.
    fn patch_labels_as_if_they_referred_to_end(&mut self) {
        let end = i32::try_from(self.buf.len()).unwrap();
        for &start in &self.offsets_using_label {
            let field = self.buf[start..].first_chunk_mut::<4>().unwrap();
            // The displacement is measured from the byte just past the field.
            let after_field = i32::try_from(start).unwrap() + 4;
            let stored = i32::from_le_bytes(*field);
            *field = (stored + (end - after_field)).to_le_bytes();
        }
    }
}
87
88impl CodeSink for TestCodeSink {
89    fn put1(&mut self, v: u8) {
90        self.buf.extend_from_slice(&[v]);
91    }
92
93    fn put2(&mut self, v: u16) {
94        self.buf.extend_from_slice(&v.to_le_bytes());
95    }
96
97    fn put4(&mut self, v: u32) {
98        self.buf.extend_from_slice(&v.to_le_bytes());
99    }
100
101    fn put8(&mut self, v: u64) {
102        self.buf.extend_from_slice(&v.to_le_bytes());
103    }
104
105    fn add_trap(&mut self, _: TrapCode) {}
106
107    fn use_target(&mut self, _: DeferredTarget) {
108        let offset = self.buf.len();
109        self.offsets_using_label.push(offset);
110    }
111
112    fn known_offset(&self, target: KnownOffset) -> i32 {
113        panic!("unsupported known target {target:?}")
114    }
115}
116
117/// Building a new `Capstone` each time is suboptimal (TODO).
118fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {
119    let cs = Capstone::new()
120        .x86()
121        .mode(x86::ArchMode::Mode64)
122        .syntax(x86::ArchSyntax::Att)
123        .detail(true)
124        .build()
125        .expect("failed to create Capstone object");
126    let insts = cs
127        .disasm_all(assembled, 0x0)
128        .expect("failed to disassemble");
129
130    if insts.len() != 1 {
131        println!("> {original}");
132        println!("  debug: {original:x?}");
133        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
134        assert_eq!(insts.len(), 1, "not a single instruction");
135    }
136
137    let inst = insts.first().expect("at least one instruction");
138    if assembled.len() != inst.len() {
139        println!("> {original}");
140        println!("  debug: {original:x?}");
141        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
142        println!(
143            "  capstone-assembled: {}",
144            pretty_print_hexadecimal(inst.bytes())
145        );
146        assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");
147    }
148
149    inst.to_string()
150}
151
/// Render `hex` as a contiguous string of two-digit uppercase hexadecimal
/// bytes, without separators or prefix.
fn pretty_print_hexadecimal(hex: &[u8]) -> String {
    use core::fmt::Write;
    hex.iter()
        .fold(String::with_capacity(hex.len() * 2), |mut out, byte| {
            // Writing into a `String` cannot fail.
            write!(&mut out, "{byte:02X}").unwrap();
            out
        })
}
160
/// Parse `$hex` (hexadecimal digits, no prefix) as the unsigned type `$from`,
/// reinterpret it as the signed type `$to`, and print it as signed hex:
/// a leading `-` for negative values and a `0x` prefix unless the magnitude
/// is below 10. See `replace_signed_immediates`.
macro_rules! hex_print_signed_imm {
    ($hex:expr, $from:ty => $to:ty) => {{
        let value = <$from>::from_str_radix($hex, 16).unwrap() as $to;
        // `unsigned_abs` also covers the minimum value, whose magnitude does
        // not fit in the signed type.
        let magnitude = value.unsigned_abs();
        let sign = if value < 0 { "-" } else { "" };
        let radix_prefix = if magnitude < 10 { "" } else { "0x" };
        format!("{}{}{:x}", sign, radix_prefix, magnitude)
    }};
}
181
182/// Replace signed immediates in the disassembly with their unsigned hexadecimal
183/// equivalent. This is only necessary to match `capstone`'s complex
184/// pretty-printing rules; e.g. `capstone` will:
185/// - omit the `0x` prefix when printing `0x0` as `0`.
186/// - omit the `0x` prefix when print small values (less than 10)
187/// - print negative values as `-0x...` (signed hex) instead of `0xff...`
188///   (normal hex)
189/// - print `mov` immediates as base-10 instead of base-16 (?!).
190fn replace_signed_immediates(dis: &str) -> alloc::borrow::Cow<'_, str> {
191    match dis.find('$') {
192        None => dis.into(),
193        Some(idx) => {
194            let (prefix, rest) = dis.split_at(idx + 1); // Skip the '$'.
195            let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.
196            let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.
197            let n = rest.chars().take_while(char::is_ascii_hexdigit).count();
198            let (hex, rest) = rest.split_at(n); // Split at next non-hex character.
199            let simm = if dis.starts_with("mov") {
200                u64::from_str_radix(hex, 16).unwrap().to_string()
201            } else {
202                match hex.len() {
203                    1 | 2 => hex_print_signed_imm!(hex, u8 => i8),
204                    4 => hex_print_signed_imm!(hex, u16 => i16),
205                    8 => hex_print_signed_imm!(hex, u32 => i32),
206                    16 => hex_print_signed_imm!(hex, u64 => i64),
207                    _ => panic!("unexpected length for hex: {hex}"),
208                }
209            };
210            format!("{prefix}{simm}{rest}").into()
211        }
212    }
213}
214
// Split `pat` off the front of `s` when `s` starts with it, returning
// `(matched prefix, remainder)`; otherwise return `("", s)`.
// See `replace_signed_immediates`.
fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {
    match s.strip_prefix(pat) {
        Some(rest) => (&s[..pat.len()], rest),
        None => ("", s),
    }
}
223
// Checks `replace_signed_immediates` on negative 32- and 64-bit immediates, a
// positive immediate that must stay untouched, and a `mov` immediate that is
// printed in base 10.
#[test]
fn replace() {
    let cases = [
        ("andl $0xffffff9a, %r11d", "andl $-0x66, %r11d"),
        (
            "xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)",
            "xorq $-0x44, 0x7f139ecc(%r9)",
        ),
        (
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
        ),
        (
            "movq $0xffffffff864ae103, %rsi",
            "movq $18446744071667638531, %rsi",
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(replace_signed_immediates(input), expected);
    }
}
243
/// Cut the disassembly at its first semicolon and trim surrounding
/// whitespace from what remains. This is necessary to remove the implicit
/// operands we end up printing for Cranelift's sake.
///
/// Input without a semicolon is returned as-is, without trimming.
fn remove_after_semicolon(dis: &str) -> &str {
    match dis.split_once(';') {
        Some((before, _)) => before.trim(),
        None => dis,
    }
}
256
// Despite its name, this exercises `remove_after_semicolon`: the implicit
// operand annotation after the `;;` must be dropped.
#[test]
fn remove_after_parenthesis_test() {
    let printed = "imulb 0x7658eddd(%rcx) ;; implicit: %ax";
    let stripped = remove_after_semicolon(printed);
    assert_eq!(stripped, "imulb 0x7658eddd(%rcx)");
}
264
265/// Run some post-processing on the disassembly to make it match Capstone.
266fn fix_up(dis: &str) -> alloc::borrow::Cow<'_, str> {
267    let dis = remove_after_semicolon(dis);
268    replace_signed_immediates(&dis)
269}
270
271/// Fuzz-specific registers.
272///
273/// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].
274#[derive(Clone, Arbitrary, Debug)]
275pub struct FuzzRegs;
276
277impl Registers for FuzzRegs {
278    type ReadGpr = FuzzReg;
279    type ReadWriteGpr = FuzzReg;
280    type WriteGpr = FuzzReg;
281    type ReadXmm = FuzzReg;
282    type ReadWriteXmm = FuzzReg;
283    type WriteXmm = FuzzReg;
284}
285
286/// A simple `u8` register type for fuzzing only.
287#[derive(Clone, Copy, Debug, PartialEq)]
288pub struct FuzzReg(u8);
289
290impl<'a> Arbitrary<'a> for FuzzReg {
291    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
292        Ok(Self(u.int_in_range(0..=15)?))
293    }
294}
295
296impl AsReg for FuzzReg {
297    fn new(enc: u8) -> Self {
298        Self(enc)
299    }
300    fn enc(&self) -> u8 {
301        self.0
302    }
303}
304
305impl Arbitrary<'_> for AmodeOffset {
306    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
307        // Custom implementation to try to generate some "interesting" offsets.
308        // For example choose either an arbitrary 8-bit or 32-bit number as the
309        // base, and then optionally shift that number to the left to create
310        // multiples of constants. This can help stress some of the more
311        // interesting encodings in EVEX instructions for example.
312        let base = if u.arbitrary()? {
313            i32::from(u.arbitrary::<i8>()?)
314        } else {
315            u.arbitrary::<i32>()?
316        };
317        Ok(match u.int_in_range(0..=5)? {
318            0 => AmodeOffset::ZERO,
319            n => AmodeOffset::new(base << (n - 1)),
320        })
321    }
322}
323
324impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {
325    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
326        // For now, we don't generate offsets (TODO).
327        Ok(Self {
328            simm32: AmodeOffset::arbitrary(u)?,
329            offset: None,
330        })
331    }
332}
333
334impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {
335    fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {
336        Ok(Self::new(E))
337    }
338}
339
340impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {
341    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
342        use crate::gpr::enc::*;
343        let gpr = u.choose(&[
344            RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,
345        ])?;
346        Ok(Self::new(R::new(*gpr)))
347    }
348}
349impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {
350    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
351        Ok(Self(R::new(u.int_in_range(0..=15)?)))
352    }
353}
354impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {
355    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
356        Ok(Self(R::new(u.int_in_range(0..=15)?)))
357    }
358}
359
360/// Helper trait that's used to be the same as `Registers` except with an extra
361/// `for<'a> Arbitrary<'a>` bound on all of the associated types.
362pub trait RegistersArbitrary:
363    Registers<
364        ReadGpr: for<'a> Arbitrary<'a>,
365        ReadWriteGpr: for<'a> Arbitrary<'a>,
366        WriteGpr: for<'a> Arbitrary<'a>,
367        ReadXmm: for<'a> Arbitrary<'a>,
368        ReadWriteXmm: for<'a> Arbitrary<'a>,
369        WriteXmm: for<'a> Arbitrary<'a>,
370    >
371{
372}
373
374impl<R> RegistersArbitrary for R
375where
376    R: Registers,
377    R::ReadGpr: for<'a> Arbitrary<'a>,
378    R::ReadWriteGpr: for<'a> Arbitrary<'a>,
379    R::WriteGpr: for<'a> Arbitrary<'a>,
380    R::ReadXmm: for<'a> Arbitrary<'a>,
381    R::ReadWriteXmm: for<'a> Arbitrary<'a>,
382    R::WriteXmm: for<'a> Arbitrary<'a>,
383{
384}
385
#[cfg(test)]
mod test {
    use super::*;
    use arbtest::arbtest;
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Runs the `roundtrip` fuzzer on arbitrary instructions for one second.
    // To repeatably test a single input, append `.seed(0x<failing seed>)`
    // after `.budget_ms(...)`.
    #[test]
    fn smoke() {
        let executed = AtomicUsize::new(0);
        arbtest(|u| {
            let inst: Inst<FuzzRegs> = u.arbitrary()?;
            roundtrip(&inst);
            let index = executed.fetch_add(1, Ordering::SeqCst);
            println!("#{index}: {inst}");
            Ok(())
        })
        .budget_ms(1_000);
    }

    // Round-trips `callq` for every immediate in `-500..500`.
    #[test]
    fn callq() {
        for imm in -500..500 {
            println!("immediate: {imm}");
            let call = crate::inst::callq_d::new(imm);
            roundtrip(&call.into());
        }
    }
}