cranelift_assembler_x64/
fuzz.rs

1//! A fuzz testing oracle for roundtrip assembly-disassembly.
2//!
3//! This contains manual implementations of the `Arbitrary` trait for types
4//! throughout this crate to avoid depending on the `arbitrary` crate
5//! unconditionally (use the `fuzz` feature instead).
6
7use crate::{
8    AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,
9    KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,
10};
11use arbitrary::{Arbitrary, Result, Unstructured};
12use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};
13
14/// Take a random assembly instruction and check its encoding and
15/// pretty-printing against a known-good disassembler.
16///
17/// # Panics
18///
19/// This function panics to express failure as expected by the `arbitrary`
20/// fuzzer infrastructure. It may fail during assembly, disassembly, or when
21/// comparing the disassembled strings.
22pub fn roundtrip(inst: &Inst<FuzzRegs>) {
23    // Check that we can actually assemble this instruction.
24    let assembled = assemble(inst);
25    let expected = disassemble(&assembled, inst);
26
27    // Check that our pretty-printed output matches the known-good output. Trim
28    // off the instruction offset first.
29    let expected = expected.split_once(' ').unwrap().1;
30    let actual = inst.to_string();
31    if expected != actual && expected.trim() != fix_up(&actual) {
32        println!("> {inst}");
33        println!("  debug: {inst:x?}");
34        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
35        println!("  expected (capstone): {expected}");
36        println!("  actual (to_string):  {actual}");
37        assert_eq!(expected, &actual);
38    }
39}
40
41/// Use this assembler to emit machine code into a byte buffer.
42///
43/// This will skip any traps or label registrations, but this is fine for the
44/// single-instruction disassembly we're doing here.
45fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {
46    let mut sink = TestCodeSink::default();
47    inst.encode(&mut sink);
48    sink.patch_labels_as_if_they_referred_to_end();
49    sink.buf
50}
51
/// A minimal in-memory code sink used to collect the bytes of one encoded
/// instruction.
#[derive(Default)]
struct TestCodeSink {
    /// The machine code emitted so far.
    buf: Vec<u8>,
    /// Positions in `buf` at which a 4-byte label-relative field begins.
    offsets_using_label: Vec<usize>,
}

impl TestCodeSink {
    /// Rewrite every recorded label reference as though its label were bound
    /// to the end of `buf`.
    ///
    /// A label reference (e.g. RIP-relative addressing) is stored together
    /// with an adjustment accounting for the distance between the 4-byte
    /// field and the end of the instruction, which is what the offset is
    /// relative to. To make the field truly relative to the end of the
    /// instruction, the distance from the end of the field (its start plus 4,
    /// the field width) to the end of `buf` is added to the little-endian
    /// `i32` already present in the buffer.
    ///
    /// This is effectively undoing the `bytes_at_end` adjustment that's part
    /// of `Amode::RipRelative` addressing.
    fn patch_labels_as_if_they_referred_to_end(&mut self) {
        let end = i32::try_from(self.buf.len()).unwrap();
        for &start in &self.offsets_using_label {
            let field = self.buf[start..].first_chunk_mut::<4>().unwrap();
            // Distance from just past the 4-byte field to the end of `buf`.
            let field_end = i32::try_from(start).unwrap() + 4;
            let patched = i32::from_le_bytes(*field) + (end - field_end);
            *field = patched.to_le_bytes();
        }
    }
}
83
84impl CodeSink for TestCodeSink {
85    fn put1(&mut self, v: u8) {
86        self.buf.extend_from_slice(&[v]);
87    }
88
89    fn put2(&mut self, v: u16) {
90        self.buf.extend_from_slice(&v.to_le_bytes());
91    }
92
93    fn put4(&mut self, v: u32) {
94        self.buf.extend_from_slice(&v.to_le_bytes());
95    }
96
97    fn put8(&mut self, v: u64) {
98        self.buf.extend_from_slice(&v.to_le_bytes());
99    }
100
101    fn add_trap(&mut self, _: TrapCode) {}
102
103    fn use_target(&mut self, _: DeferredTarget) {
104        let offset = self.buf.len();
105        self.offsets_using_label.push(offset);
106    }
107
108    fn known_offset(&self, target: KnownOffset) -> i32 {
109        panic!("unsupported known target {target:?}")
110    }
111}
112
113/// Building a new `Capstone` each time is suboptimal (TODO).
114fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {
115    let cs = Capstone::new()
116        .x86()
117        .mode(x86::ArchMode::Mode64)
118        .syntax(x86::ArchSyntax::Att)
119        .detail(true)
120        .build()
121        .expect("failed to create Capstone object");
122    let insts = cs
123        .disasm_all(assembled, 0x0)
124        .expect("failed to disassemble");
125
126    if insts.len() != 1 {
127        println!("> {original}");
128        println!("  debug: {original:x?}");
129        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
130        assert_eq!(insts.len(), 1, "not a single instruction");
131    }
132
133    let inst = insts.first().expect("at least one instruction");
134    if assembled.len() != inst.len() {
135        println!("> {original}");
136        println!("  debug: {original:x?}");
137        println!("  assembled: {}", pretty_print_hexadecimal(&assembled));
138        println!(
139            "  capstone-assembled: {}",
140            pretty_print_hexadecimal(inst.bytes())
141        );
142        assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");
143    }
144
145    inst.to_string()
146}
147
/// Render `hex` as one contiguous string of two-digit, upper-case hexadecimal
/// bytes (no separators, no prefix).
fn pretty_print_hexadecimal(hex: &[u8]) -> String {
    use std::fmt::Write;
    // Two output characters per input byte.
    let out = String::with_capacity(hex.len() * 2);
    hex.iter().fold(out, |mut acc, byte| {
        write!(acc, "{byte:02X}").unwrap();
        acc
    })
}
156
/// Parse `$hex` (hexadecimal digits without a prefix) as the unsigned type
/// `$from`, reinterpret it as the signed type `$to`, and print it the way
/// `replace_signed_immediates` needs: an optional leading `-`, then the
/// magnitude in lower-case hex, with the `0x` prefix omitted for values
/// strictly between -10 and 10.
///
/// See `replace_signed_immediates`.
macro_rules! hex_print_signed_imm {
    ($hex:expr, $from:ty => $to:ty) => {{
        let imm = <$from>::from_str_radix($hex, 16).unwrap() as $to;
        let sign = if imm < 0 { "-" } else { "" };
        // The unsigned magnitude also covers `MIN`, whose absolute value does
        // not fit in the signed type.
        let magnitude = imm.unsigned_abs();
        let prefix = if (-9..=9).contains(&imm) { "" } else { "0x" };
        format!("{}{}{:x}", sign, prefix, magnitude)
    }};
}
177
178/// Replace signed immediates in the disassembly with their unsigned hexadecimal
179/// equivalent. This is only necessary to match `capstone`'s complex
180/// pretty-printing rules; e.g. `capstone` will:
181/// - omit the `0x` prefix when printing `0x0` as `0`.
182/// - omit the `0x` prefix when print small values (less than 10)
183/// - print negative values as `-0x...` (signed hex) instead of `0xff...`
184///   (normal hex)
185/// - print `mov` immediates as base-10 instead of base-16 (?!).
186fn replace_signed_immediates(dis: &str) -> std::borrow::Cow<'_, str> {
187    match dis.find('$') {
188        None => dis.into(),
189        Some(idx) => {
190            let (prefix, rest) = dis.split_at(idx + 1); // Skip the '$'.
191            let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.
192            let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.
193            let n = rest.chars().take_while(char::is_ascii_hexdigit).count();
194            let (hex, rest) = rest.split_at(n); // Split at next non-hex character.
195            let simm = if dis.starts_with("mov") {
196                u64::from_str_radix(hex, 16).unwrap().to_string()
197            } else {
198                match hex.len() {
199                    1 | 2 => hex_print_signed_imm!(hex, u8 => i8),
200                    4 => hex_print_signed_imm!(hex, u16 => i16),
201                    8 => hex_print_signed_imm!(hex, u32 => i32),
202                    16 => hex_print_signed_imm!(hex, u64 => i64),
203                    _ => panic!("unexpected length for hex: {hex}"),
204                }
205            };
206            format!("{prefix}{simm}{rest}").into()
207        }
208    }
209}
210
// Split `pat` off the front of `s` when `s` starts with it, returning
// `(matched prefix, remainder)`; otherwise return `("", s)`.
//
// See `replace_signed_immediates`.
fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {
    match s.strip_prefix(pat) {
        Some(remainder) => (&s[..pat.len()], remainder),
        None => ("", s),
    }
}
219
// Table-driven check of `replace_signed_immediates`: each pair is
// `(input, expected output)`.
#[test]
fn replace() {
    let cases = [
        ("andl $0xffffff9a, %r11d", "andl $-0x66, %r11d"),
        (
            "xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)",
            "xorq $-0x44, 0x7f139ecc(%r9)",
        ),
        (
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
            "subl $0x3ca77a19, -0x1a030f40(%r14)",
        ),
        (
            "movq $0xffffffff864ae103, %rsi",
            "movq $18446744071667638531, %rsi",
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(replace_signed_immediates(input), expected);
    }
}
239
/// Cut the disassembly at the first semicolon and trim surrounding whitespace
/// from what is left. This is necessary to remove the implicit operands we end
/// up printing for Cranelift's sake.
///
/// Input without a semicolon is returned exactly as given (not trimmed).
fn remove_after_semicolon(dis: &str) -> &str {
    dis.split_once(';')
        .map_or(dis, |(before, _)| before.trim())
}
252
// Everything from the first `;` onwards (the implicit-operand note) must be
// dropped, along with the whitespace before it.
#[test]
fn remove_after_parenthesis_test() {
    let input = "imulb 0x7658eddd(%rcx) ;; implicit: %ax";
    let stripped = remove_after_semicolon(input);
    assert_eq!(stripped, "imulb 0x7658eddd(%rcx)");
}
260
261/// Run some post-processing on the disassembly to make it match Capstone.
262fn fix_up(dis: &str) -> std::borrow::Cow<'_, str> {
263    let dis = remove_after_semicolon(dis);
264    replace_signed_immediates(&dis)
265}
266
267/// Fuzz-specific registers.
268///
269/// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].
270#[derive(Clone, Arbitrary, Debug)]
271pub struct FuzzRegs;
272
273impl Registers for FuzzRegs {
274    type ReadGpr = FuzzReg;
275    type ReadWriteGpr = FuzzReg;
276    type WriteGpr = FuzzReg;
277    type ReadXmm = FuzzReg;
278    type ReadWriteXmm = FuzzReg;
279    type WriteXmm = FuzzReg;
280}
281
282/// A simple `u8` register type for fuzzing only.
283#[derive(Clone, Copy, Debug, PartialEq)]
284pub struct FuzzReg(u8);
285
286impl<'a> Arbitrary<'a> for FuzzReg {
287    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
288        Ok(Self(u.int_in_range(0..=15)?))
289    }
290}
291
292impl AsReg for FuzzReg {
293    fn new(enc: u8) -> Self {
294        Self(enc)
295    }
296    fn enc(&self) -> u8 {
297        self.0
298    }
299}
300
301impl Arbitrary<'_> for AmodeOffset {
302    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
303        // Custom implementation to try to generate some "interesting" offsets.
304        // For example choose either an arbitrary 8-bit or 32-bit number as the
305        // base, and then optionally shift that number to the left to create
306        // multiples of constants. This can help stress some of the more
307        // interesting encodings in EVEX instructions for example.
308        let base = if u.arbitrary()? {
309            i32::from(u.arbitrary::<i8>()?)
310        } else {
311            u.arbitrary::<i32>()?
312        };
313        Ok(match u.int_in_range(0..=5)? {
314            0 => AmodeOffset::ZERO,
315            n => AmodeOffset::new(base << (n - 1)),
316        })
317    }
318}
319
320impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {
321    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
322        // For now, we don't generate offsets (TODO).
323        Ok(Self {
324            simm32: AmodeOffset::arbitrary(u)?,
325            offset: None,
326        })
327    }
328}
329
330impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {
331    fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {
332        Ok(Self::new(E))
333    }
334}
335
336impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {
337    fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {
338        use crate::gpr::enc::*;
339        let gpr = u.choose(&[
340            RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,
341        ])?;
342        Ok(Self::new(R::new(*gpr)))
343    }
344}
345impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {
346    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
347        Ok(Self(R::new(u.int_in_range(0..=15)?)))
348    }
349}
350impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {
351    fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
352        Ok(Self(R::new(u.int_in_range(0..=15)?)))
353    }
354}
355
356/// Helper trait that's used to be the same as `Registers` except with an extra
357/// `for<'a> Arbitrary<'a>` bound on all of the associated types.
358pub trait RegistersArbitrary:
359    Registers<
360        ReadGpr: for<'a> Arbitrary<'a>,
361        ReadWriteGpr: for<'a> Arbitrary<'a>,
362        WriteGpr: for<'a> Arbitrary<'a>,
363        ReadXmm: for<'a> Arbitrary<'a>,
364        ReadWriteXmm: for<'a> Arbitrary<'a>,
365        WriteXmm: for<'a> Arbitrary<'a>,
366    >
367{
368}
369
370impl<R> RegistersArbitrary for R
371where
372    R: Registers,
373    R::ReadGpr: for<'a> Arbitrary<'a>,
374    R::ReadWriteGpr: for<'a> Arbitrary<'a>,
375    R::WriteGpr: for<'a> Arbitrary<'a>,
376    R::ReadXmm: for<'a> Arbitrary<'a>,
377    R::ReadWriteXmm: for<'a> Arbitrary<'a>,
378    R::WriteXmm: for<'a> Arbitrary<'a>,
379{
380}
381
#[cfg(test)]
mod test {
    use super::*;
    use arbtest::arbtest;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Run the `roundtrip` fuzzer on arbitrary instructions for one second.
    ///
    /// To repeatably test a single input, append `.seed(0x<failing seed>)`.
    #[test]
    fn smoke() {
        // Number of instructions checked so far; only used for logging.
        let count = AtomicUsize::new(0);
        arbtest(|u| {
            let inst = u.arbitrary::<Inst<FuzzRegs>>()?;
            roundtrip(&inst);
            let index = count.fetch_add(1, Ordering::SeqCst);
            println!("#{}: {inst}", index);
            Ok(())
        })
        .budget_ms(1_000);
    }

    /// Round-trip `callq` with every immediate in `-500..500`.
    #[test]
    fn callq() {
        (-500..500).for_each(|i| {
            println!("immediate: {i}");
            let inst = crate::inst::callq_d::new(i);
            roundtrip(&inst.into());
        });
    }
}