cranelift_assembler_x64/
fuzz.rs

1//! A fuzz testing oracle for roundtrip assembly-disassembly.
2//!
3//! This contains manual implementations of the `Arbitrary` trait for types
4//! throughout this crate to avoid depending on the `arbitrary` crate
5//! unconditionally (use the `fuzz` feature instead).
6
7use crate::{
8    AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,
9    KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,
10};
11use arbitrary::{Arbitrary, Result, Unstructured};
12use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};
13
14/// Take a random assembly instruction and check its encoding and
15/// pretty-printing against a known-good disassembler.
16///
17/// # Panics
18///
19/// This function panics to express failure as expected by the `arbitrary`
20/// fuzzer infrastructure. It may fail during assembly, disassembly, or when
21/// comparing the disassembled strings.
22pub fn roundtrip(inst: &Inst<FuzzRegs>) {
23    // Check that we can actually assemble this instruction.
24    let assembled = assemble(inst);
25    let expected = disassemble(&assembled, inst);
26
27    // Check that our pretty-printed output matches the known-good output. Trim
28    // off the instruction offset first.
29    let expected = expected.split_once(' ').unwrap().1;
30    let actual = inst.to_string();
31    if expected != actual && expected.trim() != fix_up(&actual) {
32        println!("> {inst}");
33        println!("  debug: {inst:x?}");
34        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
35        println!("  expected (capstone): {expected}");
36        println!("  actual (to_string):  {actual}");
37        assert_eq!(expected, &actual);
38    }
39}
40
41/// Use this assembler to emit machine code into a byte buffer.
42///
43/// This will skip any traps or label registrations, but this is fine for the
44/// single-instruction disassembly we're doing here.
45fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {
46    let mut sink = TestCodeSink::default();
47    inst.encode(&mut sink);
48    sink.patch_labels_as_if_they_referred_to_end();
49    sink.buf
50}
51
/// In-memory code sink used to collect the bytes of a single instruction.
#[derive(Default)]
struct TestCodeSink {
    /// The bytes emitted so far.
    buf: Vec<u8>,
    /// Offsets into `buf` of the 4-byte fields that refer to a label.
    offsets_using_label: Vec<usize>,
}

impl TestCodeSink {
    /// Rewrite every recorded label reference as if its label were bound at the
    /// end of `buf`.
    ///
    /// A reference to a label (e.g. RIP-relative addressing) is stored with an
    /// adjustment for the distance between the 4-byte relative field and the
    /// end of the instruction, which is what the field is relative to. To make
    /// the field truly relative to the end of the instruction, which is where
    /// we pretend all labels live, that adjustment has to be cancelled out.
    ///
    /// For each recorded offset, the distance from the end of the 4-byte field
    /// to the end of `buf` is added to the little-endian `i32` already present
    /// in the buffer. This effectively undoes the `bytes_at_end` adjustment
    /// that is part of `Amode::RipRelative` addressing.
    ///
    /// # Panics
    ///
    /// Panics if a recorded offset leaves fewer than four bytes in `buf` or if
    /// a length does not fit in an `i32`.
    fn patch_labels_as_if_they_referred_to_end(&mut self) {
        let end = i32::try_from(self.buf.len()).unwrap();
        for &field_start in &self.offsets_using_label {
            let field = self.buf[field_start..].first_chunk_mut::<4>().unwrap();
            // The relative field is four bytes wide.
            let field_end = i32::try_from(field_start).unwrap() + 4;
            let stored = i32::from_le_bytes(*field);
            *field = (stored + (end - field_end)).to_le_bytes();
        }
    }
}
83
84impl CodeSink for TestCodeSink {
85    fn put1(&mut self, v: u8) {
86        self.buf.extend_from_slice(&[v]);
87    }
88
89    fn put2(&mut self, v: u16) {
90        self.buf.extend_from_slice(&v.to_le_bytes());
91    }
92
93    fn put4(&mut self, v: u32) {
94        self.buf.extend_from_slice(&v.to_le_bytes());
95    }
96
97    fn put8(&mut self, v: u64) {
98        self.buf.extend_from_slice(&v.to_le_bytes());
99    }
100
101    fn add_trap(&mut self, _: TrapCode) {}
102
103    fn use_target(&mut self, _: DeferredTarget) {
104        let offset = self.buf.len();
105        self.offsets_using_label.push(offset);
106    }
107
108    fn known_offset(&self, target: KnownOffset) -> i32 {
109        panic!("unsupported known target {target:?}")
110    }
111}
112
113/// Building a new `Capstone` each time is suboptimal (TODO).
114fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {
115    let cs = Capstone::new()
116        .x86()
117        .mode(x86::ArchMode::Mode64)
118        .syntax(x86::ArchSyntax::Att)
119        .detail(true)
120        .build()
121        .expect("failed to create Capstone object");
122    let insts = cs
123        .disasm_all(assembled, 0x0)
124        .expect("failed to disassemble");
125
126    if insts.len() != 1 {
127        println!("> {original}");
128        println!("  debug: {original:x?}");
129        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
130        assert_eq!(insts.len(), 1, "not a single instruction");
131    }
132
133    let inst = insts.first().expect("at least one instruction");
134    if assembled.len() != inst.len() {
135        println!("> {original}");
136        println!("  debug: {original:x?}");
137        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
138        println!(
139            "  capstone-assembled: {}",
140            pretty_print_hexadecimal(inst.bytes())
141        );
142        assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");
143    }
144
145    inst.to_string()
146}
147
/// Render `hex` as an unseparated string of two-digit, upper-case hexadecimal
/// bytes; e.g. `[0x0f, 0xa0]` becomes `"0FA0"`.
fn pretty_print_hexadecimal(hex: &[u8]) -> String {
    use std::fmt::Write;
    hex.iter()
        .fold(String::with_capacity(hex.len() * 2), |mut out, byte| {
            // Writing into a `String` does not fail.
            write!(out, "{byte:02X}").unwrap();
            out
        })
}
156
/// Format the hexadecimal digits `$hex` as a signed immediate; see
/// `replace_signed_immediates`.
///
/// `$hex` is parsed as the unsigned type `$from` and then reinterpreted as the
/// signed type `$to` of the same width. The result is the magnitude in
/// hexadecimal, preceded by `-` when the value is negative and by `0x` unless
/// the value lies strictly between -10 and 10.
///
/// Panics if `$hex` is not a valid hexadecimal number that fits in `$from`.
macro_rules! hex_print_signed_imm {
    ($hex:expr, $from:ty => $to:ty) => {{
        let imm = <$from>::from_str_radix($hex, 16).unwrap() as $to;
        let sign = if imm < 0 { "-" } else { "" };
        // `unsigned_abs` is well-defined even for the minimum value, whose
        // magnitude does not fit in the signed type.
        let abs = imm.unsigned_abs();
        if imm > -10 && imm < 10 {
            format!("{}{:x}", sign, abs)
        } else {
            format!("{}0x{:x}", sign, abs)
        }
    }};
}

/// Rewrite the first `$`-prefixed immediate in the disassembly from plain
/// (unsigned) hexadecimal into the form `capstone` prints. This is only
/// necessary to match `capstone`'s complex pretty-printing rules; e.g.
/// `capstone` will:
/// - omit the `0x` prefix when printing `0x0` as `0`.
/// - omit the `0x` prefix when printing small values (less than 10)
/// - print negative values as `-0x...` (signed hex) instead of `0xff...`
///   (normal hex)
/// - print `mov` immediates as base-10 instead of base-16 (?!).
///
/// The width used to decide the sign is the narrowest of 8, 16, 32 or 64 bits
/// that holds all the hexadecimal digits found. If `dis` has no `$`, or the
/// `$` is not followed by 1 to 16 hexadecimal digits, `dis` is returned
/// unchanged, so that a caller comparing strings reports an ordinary mismatch
/// instead of hitting a panic in here.
///
/// NOTE(review): a `-` directly after the `$` is skipped and not re-emitted,
/// so an immediate that is already printed signed loses its sign; confirm this
/// is intended.
fn replace_signed_immediates(dis: &str) -> std::borrow::Cow<'_, str> {
    let idx = match dis.find('$') {
        Some(idx) => idx,
        None => return dis.into(),
    };

    let (prefix, rest) = dis.split_at(idx + 1); // Keep the '$' in the prefix.
    let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.
    let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.
    let n = rest.chars().take_while(char::is_ascii_hexdigit).count();
    let (hex, rest) = rest.split_at(n); // Split at next non-hex character.

    let simm = match hex.len() {
        1..=16 if dis.starts_with("mov") => u64::from_str_radix(hex, 16).unwrap().to_string(),
        1 | 2 => hex_print_signed_imm!(hex, u8 => i8),
        // An odd digit count cannot have the sign bit of the next wider type
        // set, so these always print as the same non-negative value.
        3 | 4 => hex_print_signed_imm!(hex, u16 => i16),
        5..=8 => hex_print_signed_imm!(hex, u32 => i32),
        9..=16 => hex_print_signed_imm!(hex, u64 => i64),
        // No digits at all, or more than fit in 64 bits: nothing to rewrite.
        _ => return dis.into(),
    };
    format!("{prefix}{simm}{rest}").into()
}

/// Split `pat` off the front of `s` if `s` starts with it, returning the
/// matched prefix and the remainder; otherwise return `""` and `s` untouched.
/// See `replace_signed_immediates`.
fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {
    if s.starts_with(pat) {
        s.split_at(pat.len())
    } else {
        ("", s)
    }
}
219
/// Check `replace_signed_immediates` on a table of (input, expected) pairs.
#[test]
fn replace() {
    let cases = [
        ("andl $0xffffff9a, %r11d", "andl $-0x66, %r11d"),
        (
            "xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)",
            "xorq $-0x44, 0x7f139ecc(%r9)",
        ),
        (
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
        ),
        (
            "movq $0xffffffff864ae103, %rsi",
            "movq $18446744071667638531, %rsi",
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(replace_signed_immediates(input), expected);
    }
}
239
/// Cut the disassembly at its first semicolon and trim surrounding whitespace
/// from what is left. This strips the implicit operands that are printed for
/// Cranelift's sake.
///
/// A string without a semicolon is returned as-is, without trimming.
fn remove_after_semicolon(dis: &str) -> &str {
    dis.split_once(';')
        .map_or(dis, |(before, _)| before.trim())
}
252
/// The implicit-operand suffix and the space before it are removed.
#[test]
fn remove_after_parenthesis_test() {
    let with_implicit_operand = "imulb 0x7658eddd(%rcx) ;; implicit: %ax";
    let stripped = remove_after_semicolon(with_implicit_operand);
    assert_eq!(stripped, "imulb 0x7658eddd(%rcx)");
}
260
261/// Run some post-processing on the disassembly to make it match Capstone.
262fn fix_up(dis: &str) -> std::borrow::Cow<'_, str> {
263    let dis = remove_after_semicolon(dis);
264    replace_signed_immediates(&dis)
265}
266
267/// Fuzz-specific registers.
268///
269/// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].
270#[derive(Arbitrary, Debug)]
271pub struct FuzzRegs;
272
273impl Registers for FuzzRegs {
274    type ReadGpr = FuzzReg;
275    type ReadWriteGpr = FuzzReg;
276    type WriteGpr = FuzzReg;
277    type ReadXmm = FuzzReg;
278    type ReadWriteXmm = FuzzReg;
279    type WriteXmm = FuzzReg;
280}
281
282/// A simple `u8` register type for fuzzing only.
283#[derive(Clone, Copy, Debug, PartialEq)]
284pub struct FuzzReg(u8);
285
286impl<'a> Arbitrary<'a> for FuzzReg {
287    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
288        Ok(Self(u.int_in_range(0..=15)?))
289    }
290}
291
292impl AsReg for FuzzReg {
293    fn new(enc: u8) -> Self {
294        Self(enc)
295    }
296    fn enc(&self) -> u8 {
297        self.0
298    }
299}
300
301impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {
302    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
303        // For now, we don't generate offsets (TODO).
304        Ok(Self {
305            simm32: AmodeOffset::arbitrary(u)?,
306            offset: None,
307        })
308    }
309}
310
311impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {
312    fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {
313        Ok(Self::new(E))
314    }
315}
316
317impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {
318    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
319        use crate::gpr::enc::*;
320        let gpr = u.choose(&[
321            RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,
322        ])?;
323        Ok(Self::new(R::new(*gpr)))
324    }
325}
326impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {
327    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
328        Ok(Self(R::new(u.int_in_range(0..=15)?)))
329    }
330}
331impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {
332    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
333        Ok(Self(R::new(u.int_in_range(0..=15)?)))
334    }
335}
336
337/// Helper trait that's used to be the same as `Registers` except with an extra
338/// `for<'a> Arbitrary<'a>` bound on all of the associated types.
339pub trait RegistersArbitrary:
340    Registers<
341        ReadGpr: for<'a> Arbitrary<'a>,
342        ReadWriteGpr: for<'a> Arbitrary<'a>,
343        WriteGpr: for<'a> Arbitrary<'a>,
344        ReadXmm: for<'a> Arbitrary<'a>,
345        ReadWriteXmm: for<'a> Arbitrary<'a>,
346        WriteXmm: for<'a> Arbitrary<'a>,
347    >
348{
349}
350
351impl<R> RegistersArbitrary for R
352where
353    R: Registers,
354    R::ReadGpr: for<'a> Arbitrary<'a>,
355    R::ReadWriteGpr: for<'a> Arbitrary<'a>,
356    R::WriteGpr: for<'a> Arbitrary<'a>,
357    R::ReadXmm: for<'a> Arbitrary<'a>,
358    R::ReadWriteXmm: for<'a> Arbitrary<'a>,
359    R::WriteXmm: for<'a> Arbitrary<'a>,
360{
361}
362
#[cfg(test)]
mod test {
    use super::*;
    use arbtest::arbtest;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Run the `roundtrip` fuzzer on arbitrary instructions for one second,
    /// printing each instruction that passes along with a running index.
    ///
    /// To repeatably test a single input, append `.seed(0x<failing seed>)` to
    /// the builder chain below.
    #[test]
    fn smoke() {
        let seen = AtomicUsize::new(0);
        arbtest(|u| {
            let inst = u.arbitrary::<Inst<FuzzRegs>>()?;
            roundtrip(&inst);
            let index = seen.fetch_add(1, Ordering::SeqCst);
            println!("#{}: {inst}", index);
            Ok(())
        })
        .budget_ms(1_000);
    }

    /// Round-trip `callq` with every immediate in `-500..500`.
    #[test]
    fn callq() {
        for i in -500..500 {
            println!("immediate: {i}");
            let call = crate::inst::callq_d::new(i);
            let inst: Inst<FuzzRegs> = call.into();
            roundtrip(&inst);
        }
    }
}
390        }
391    }
392}