cranelift_codegen/
egraph.rs

1//! Support for egraphs represented in the DataFlowGraph.
2
3use crate::alias_analysis::{AliasAnalysis, LastStores};
4use crate::ctxhash::{CtxEq, CtxHash, NullCtx};
5use crate::cursor::{Cursor, CursorPosition, FuncCursor};
6use crate::dominator_tree::DominatorTree;
7use crate::egraph::elaborate::Elaborator;
8use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
9use crate::ir::pcc::Fact;
10use crate::ir::{
11    Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value, ValueDef,
12    ValueListPool,
13};
14use crate::loop_analysis::LoopAnalysis;
15use crate::opts::IsleContext;
16use crate::opts::generated_code::SkeletonInstSimplification;
17use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
18use crate::settings::Flags;
19use crate::take_and_replace::TakeAndReplace;
20use crate::trace;
21use alloc::vec::Vec;
22use core::cmp::Ordering;
23use core::hash::Hasher;
24use cranelift_control::ControlPlane;
25use cranelift_entity::SecondaryMap;
26use cranelift_entity::packed_option::ReservedValue;
27use rustc_hash::FxHashSet;
28use smallvec::SmallVec;
29
30mod cost;
31mod elaborate;
32
33/// Pass over a Function that does the whole aegraph thing.
34///
35/// - Removes non-skeleton nodes from the Layout.
36/// - Performs a GVN-and-rule-application pass over all Values
37///   reachable from the skeleton, potentially creating new Union
38///   nodes (i.e., an aegraph) so that some values have multiple
39///   representations.
40/// - Does "extraction" on the aegraph: selects the best value out of
41///   the tree-of-Union nodes for each used value.
42/// - Does "scoped elaboration" on the aegraph: chooses one or more
43///   locations for pure nodes to become instructions again in the
44///   layout, as forced by the skeleton.
45///
46/// At the beginning and end of this pass, the CLIF should be in a
47/// state that passes the verifier and, additionally, has no Union
48/// nodes. During the pass, Union nodes may exist, and instructions in
49/// the layout may refer to results of instructions that are not
50/// placed in the layout.
51pub struct EgraphPass<'a> {
52    /// The function we're operating on.
53    func: &'a mut Function,
54    /// Dominator tree for the CFG, used to visit blocks in pre-order
55    /// so we see value definitions before their uses, and also used for
56    /// O(1) dominance checks.
57    domtree: &'a DominatorTree,
58    /// Alias analysis, used during optimization.
59    alias_analysis: &'a mut AliasAnalysis<'a>,
60    /// Loop analysis results, used for built-in LICM during
61    /// elaboration.
62    loop_analysis: &'a LoopAnalysis,
63    /// Compiler flags.
64    flags: &'a Flags,
65    /// Chaos-mode control-plane so we can test that we still get
66    /// correct results when our heuristics make bad decisions.
67    ctrl_plane: &'a mut ControlPlane,
68    /// Which Values do we want to rematerialize in each block where
69    /// they're used?
70    remat_values: FxHashSet<Value>,
71    /// Stats collected while we run this pass.
72    pub(crate) stats: Stats,
73}
74
75/// The maximum number of rewrites we will take from a single call into ISLE.
76const MATCHES_LIMIT: usize = 5;
77
78/// The maximum number of enodes in any given eclass.
79const ECLASS_ENODE_LIMIT: usize = 5;
80
81/// Context passed through node insertion and optimization.
82pub(crate) struct OptimizeCtx<'opt, 'analysis>
83where
84    'analysis: 'opt,
85{
86    // Borrowed from EgraphPass:
87    pub(crate) func: &'opt mut Function,
88    pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
89    available_block: &'opt mut SecondaryMap<Value, Block>,
90    eclass_size: &'opt mut SecondaryMap<Value, u8>,
91    pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>,
92    pub(crate) gvn_map_blocks: &'opt Vec<Block>,
93    pub(crate) remat_values: &'opt mut FxHashSet<Value>,
94    pub(crate) stats: &'opt mut Stats,
95    domtree: &'opt DominatorTree,
96    pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
97    pub(crate) alias_analysis_state: &'opt mut LastStores,
98    flags: &'opt Flags,
99    ctrl_plane: &'opt mut ControlPlane,
100    // Held locally during optimization of one node (recursively):
101    pub(crate) rewrite_depth: usize,
102    pub(crate) subsume_values: FxHashSet<Value>,
103    optimized_values: SmallVec<[Value; MATCHES_LIMIT]>,
104    optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>,
105}
106
107/// For passing to `insert_pure_enode`. Sometimes the enode already
108/// exists as an Inst (from the original CLIF), and sometimes we're in
109/// the middle of creating it and want to avoid inserting it if
110/// possible until we know we need it.
111pub(crate) enum NewOrExistingInst {
112    New(InstructionData, Type),
113    Existing(Inst),
114}
115
116impl NewOrExistingInst {
117    fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
118        match self {
119            NewOrExistingInst::New(data, ty) => (*ty, *data),
120            NewOrExistingInst::Existing(inst) => {
121                let ty = dfg.ctrl_typevar(*inst);
122                (ty, dfg.insts[*inst])
123            }
124        }
125    }
126}
127
128impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
129where
130    'analysis: 'opt,
131{
132    /// Optimization of a single instruction.
133    ///
134    /// This does a few things:
135    /// - Looks up the instruction in the GVN deduplication map. If we
136    ///   already have the same instruction somewhere else, with the
137    ///   same args, then we can alias the original instruction's
138    ///   results and omit this instruction entirely.
139    /// - If the instruction is "new" (not deduplicated), then apply
140    ///   optimization rules:
141    ///   - All of the mid-end rules written in ISLE.
142    ///   - Store-to-load forwarding.
143    /// - Update the value-to-opt-value map, and update the eclass
144    ///   union-find, if we rewrote the value to different form(s).
145    pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
146        // Create the external context for looking up and updating the
147        // GVN map. This is necessary so that instructions themselves
148        // do not have to carry all the references or data for a full
149        // `Eq` or `Hash` impl.
150        let gvn_context = GVNContext {
151            value_lists: &self.func.dfg.value_lists,
152        };
153
154        self.stats.pure_inst += 1;
155        if let NewOrExistingInst::New(..) = inst {
156            self.stats.new_inst += 1;
157        }
158
159        // Does this instruction already exist? If so, add entries to
160        // the value-map to rewrite uses of its results to the results
161        // of the original (existing) instruction. If not, optimize
162        // the new instruction.
163        if let Some(&Some(orig_result)) = self
164            .gvn_map
165            .get(&gvn_context, &inst.get_inst_key(&self.func.dfg))
166        {
167            self.stats.pure_inst_deduped += 1;
168            if let NewOrExistingInst::Existing(inst) = inst {
169                debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
170                let result = self.func.dfg.first_result(inst);
171                self.value_to_opt_value[result] = orig_result;
172                self.available_block[result] = self.available_block[orig_result];
173                self.func.dfg.merge_facts(result, orig_result);
174            }
175            orig_result
176        } else {
177            // Now actually insert the InstructionData and attach
178            // result value (exactly one).
179            let (inst, result, ty) = match inst {
180                NewOrExistingInst::New(data, typevar) => {
181                    self.stats.pure_inst_insert_new += 1;
182                    let inst = self.func.dfg.make_inst(data);
183                    // TODO: reuse return value?
184                    self.func.dfg.make_inst_results(inst, typevar);
185                    let result = self.func.dfg.first_result(inst);
186                    // New inst. We need to do the analysis of its result.
187                    (inst, result, typevar)
188                }
189                NewOrExistingInst::Existing(inst) => {
190                    self.stats.pure_inst_insert_orig += 1;
191                    let result = self.func.dfg.first_result(inst);
192                    let ty = self.func.dfg.ctrl_typevar(inst);
193                    (inst, result, ty)
194                }
195            };
196
197            self.attach_constant_fact(inst, result, ty);
198
199            self.available_block[result] = self.get_available_block(inst);
200            let opt_value = self.optimize_pure_enode(inst);
201            log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}");
202
203            let gvn_context = GVNContext {
204                value_lists: &self.func.dfg.value_lists,
205            };
206            // Insert at level implied by args. This enables merging
207            // in LICM cases like:
208            //
209            // while (...) {
210            //   if (...) {
211            //     let x = loop_invariant_expr;
212            //   }
213            //   if (...) {
214            //     let x = loop_invariant_expr;
215            //   }
216            // }
217            //
218            // where the two instances of the expression otherwise
219            // wouldn't merge because each would be in a separate
220            // subscope of the scoped hashmap during traversal.
221            log::trace!(
222                "value {} is available at {}",
223                opt_value,
224                self.available_block[opt_value]
225            );
226            let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]);
227            self.gvn_map.insert_with_depth(
228                &gvn_context,
229                (ty, self.func.dfg.insts[inst]),
230                Some(opt_value),
231                depth,
232            );
233            self.value_to_opt_value[result] = opt_value;
234            opt_value
235        }
236    }
237
238    /// Find the block where a pure instruction first becomes available,
239    /// defined as the block that is closest to the root where all of
240    /// its arguments are available. In the unusual case where a pure
241    /// instruction has no arguments (e.g. get_return_address), we can
242    /// place it anywhere, so it is available in the entry block.
243    ///
244    /// This function does not compute available blocks recursively.
245    /// All of the instruction's arguments must have had their available
246    /// blocks assigned already.
247    fn get_available_block(&self, inst: Inst) -> Block {
248        // Side-effecting instructions have different rules for where
249        // they become available, so this function does not apply.
250        debug_assert!(is_pure_for_egraph(self.func, inst));
251
252        // Note that the def-point of all arguments to an instruction
253        // in SSA lie on a line of direct ancestors in the domtree, and
254        // so do their available-blocks. This means that for any pair of
255        // arguments, their available blocks are either the same or one
256        // strictly dominates the other. We just need to find any argument
257        // whose available block is deepest in the domtree.
258        self.func.dfg.insts[inst]
259            .arguments(&self.func.dfg.value_lists)
260            .iter()
261            .map(|&v| {
262                let block = self.available_block[v];
263                debug_assert!(!block.is_reserved_value());
264                block
265            })
266            .max_by(|&x, &y| {
267                if self.domtree.block_dominates(x, y) {
268                    Ordering::Less
269                } else {
270                    debug_assert!(self.domtree.block_dominates(y, x));
271                    Ordering::Greater
272                }
273            })
274            .unwrap_or(self.func.layout.entry_block().unwrap())
275    }
276
277    fn depth_of_block_in_gvn_map(&self, block: Block) -> usize {
278        log::trace!(
279            "finding depth of available block {} in domtree stack: {:?}",
280            block,
281            self.gvn_map_blocks
282        );
283        self.gvn_map_blocks
284            .iter()
285            .enumerate()
286            .rev()
287            .find(|&(_, b)| *b == block)
288            .unwrap()
289            .0
290    }
291
292    /// Optimizes an enode by applying any matching mid-end rewrite
293    /// rules (or store-to-load forwarding, which is a special case),
294    /// unioning together all possible optimized (or rewritten) forms
295    /// of this expression into an eclass and returning the `Value`
296    /// that represents that eclass.
297    fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
298        // A pure node always has exactly one result.
299        let orig_value = self.func.dfg.first_result(inst);
300
301        let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values);
302        let (ctx, optimized_values) = guard.get();
303
304        // Limit rewrite depth. When we apply optimization rules, they
305        // may create new nodes (values) and those are, recursively,
306        // optimized eagerly as soon as they are created. So we may
307        // have more than one ISLE invocation on the stack. (This is
308        // necessary so that as the toplevel builds the
309        // right-hand-side expression bottom-up, it uses the "latest"
310        // optimized values for all the constituent parts.) To avoid
311        // infinite or problematic recursion, we bound the rewrite
312        // depth to a small constant here.
313        const REWRITE_LIMIT: usize = 5;
314        if ctx.rewrite_depth > REWRITE_LIMIT {
315            ctx.stats.rewrite_depth_limit += 1;
316            return orig_value;
317        }
318        ctx.rewrite_depth += 1;
319        trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth);
320
321        // Invoke the ISLE toplevel constructor, getting all new
322        // values produced as equivalents to this value.
323        trace!("Calling into ISLE with original value {}", orig_value);
324        ctx.stats.rewrite_rule_invoked += 1;
325        debug_assert!(optimized_values.is_empty());
326        crate::opts::generated_code::constructor_simplify(
327            &mut IsleContext { ctx },
328            orig_value,
329            optimized_values,
330        );
331
332        ctx.stats.rewrite_rule_results += optimized_values.len() as u64;
333
334        // It's not supposed to matter what order `simplify` returns values in.
335        ctx.ctrl_plane.shuffle(optimized_values);
336
337        let num_matches = optimized_values.len();
338        if num_matches > MATCHES_LIMIT {
339            trace!(
340                "Reached maximum matches limit; too many optimized values \
341                 ({num_matches} > {MATCHES_LIMIT}); ignoring rest.",
342            );
343            optimized_values.truncate(MATCHES_LIMIT);
344        }
345
346        // Sort and deduplicate optimized values, in case multiple
347        // rules produced the same simplification.
348        optimized_values.sort_unstable();
349        optimized_values.dedup();
350
351        trace!("  -> returned from ISLE: {orig_value} -> {optimized_values:?}");
352
353        // Construct a union-node tree representing the new eclass
354        // that results from rewriting. If any returned value was
355        // marked "subsume", take only that value. Otherwise,
356        // sequentially build the chain over the original value and
357        // all returned values.
358        let result_value = if let Some(&subsuming_value) = optimized_values
359            .iter()
360            .find(|&value| ctx.subsume_values.contains(value))
361        {
362            optimized_values.clear();
363            ctx.stats.pure_inst_subsume += 1;
364            subsuming_value
365        } else {
366            let mut union_value = orig_value;
367            let mut eclass_size = ctx.eclass_size[orig_value] + 1;
368            for optimized_value in optimized_values.drain(..) {
369                trace!(
370                    "Returned from ISLE for {}, got {:?}",
371                    orig_value, optimized_value
372                );
373                if optimized_value == orig_value {
374                    trace!(" -> same as orig value; skipping");
375                    ctx.stats.pure_inst_rewrite_to_self += 1;
376                    continue;
377                }
378                let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1;
379                if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT {
380                    trace!(" -> reached eclass size limit");
381                    ctx.stats.eclass_size_limit += 1;
382                    break;
383                }
384                let old_union_value = union_value;
385                union_value = ctx.func.dfg.union(old_union_value, optimized_value);
386                eclass_size += rhs_eclass_size;
387                ctx.eclass_size[union_value] = eclass_size - 1;
388                ctx.stats.union += 1;
389                trace!(" -> union: now {}", union_value);
390                ctx.func.dfg.merge_facts(old_union_value, optimized_value);
391                ctx.available_block[union_value] =
392                    ctx.merge_availability(old_union_value, optimized_value);
393            }
394            union_value
395        };
396
397        ctx.rewrite_depth -= 1;
398        trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth);
399        if ctx.rewrite_depth == 0 {
400            ctx.subsume_values.clear();
401        }
402
403        debug_assert!(ctx.optimized_values.is_empty());
404        result_value
405    }
406
407    fn merge_availability(&self, a: Value, b: Value) -> Block {
408        let a = self.available_block[a];
409        let b = self.available_block[b];
410        if self.domtree.block_dominates(a, b) {
411            a
412        } else {
413            b
414        }
415    }
416
417    /// Optimize a "skeleton" instruction.
418    ///
419    /// Returns an optional command of how to continue processing the optimized
420    /// instruction (e.g. removing it or replacing it with a new instruction).
421    fn optimize_skeleton_inst(
422        &mut self,
423        inst: Inst,
424        block: Block,
425    ) -> Option<SkeletonInstSimplification> {
426        self.stats.skeleton_inst += 1;
427
428        // If we have a rewrite rule for this instruction, do that first, so
429        // that GVN and alias analysis only see simplified skeleton
430        // instructions.
431        if let Some(cmd) = self.simplify_skeleton_inst(inst) {
432            self.stats.skeleton_inst_simplified += 1;
433            return Some(cmd);
434        }
435
436        // First, can we try to deduplicate? We need to keep some copy
437        // of the instruction around because it's side-effecting, but
438        // we may be able to reuse an earlier instance of it.
439        if is_mergeable_for_egraph(self.func, inst) {
440            let result = self.func.dfg.inst_results(inst).get(0).copied();
441            trace!(" -> mergeable side-effecting op {}", inst);
442
443            // Does this instruction already exist? If so, add entries to
444            // the value-map to rewrite uses of its results to the results
445            // of the original (existing) instruction. If not, optimize
446            // the new instruction.
447            //
448            // Note that the GVN map is scoped, which is important
449            // here: because effectful ops are not removed from the
450            // skeleton (`Layout`), we need to be mindful of whether
451            // our current position is dominated by an instance of the
452            // instruction. (See #5796 for details.)
453            let ty = self.func.dfg.ctrl_typevar(inst);
454            match self
455                .gvn_map
456                .entry(&NullCtx, (ty, self.func.dfg.insts[inst]))
457            {
458                ScopedEntry::Occupied(o) => {
459                    let orig_result = *o.get();
460                    match (result, orig_result) {
461                        (Some(result), Some(orig_result)) => {
462                            // Hit in GVN map -- reuse value.
463                            self.stats.skeleton_inst_gvn += 1;
464                            self.value_to_opt_value[result] = orig_result;
465                            self.available_block[result] = self.available_block[orig_result];
466                            trace!(" -> merges result {} to {}", result, orig_result);
467                        }
468                        (None, None) => {
469                            // Hit in the GVN map, but the instruction doesn't
470                            // produce results, only side effects. Nothing else
471                            // to do here.
472                            self.stats.skeleton_inst_gvn += 1;
473                            trace!(" -> merges with dominating instruction");
474                        }
475                        (_, _) => unreachable!(),
476                    }
477                    Some(SkeletonInstSimplification::Remove)
478                }
479                ScopedEntry::Vacant(v) => {
480                    // Otherwise, insert it into the value-map.
481                    if let Some(result) = result {
482                        self.value_to_opt_value[result] = result;
483                        self.available_block[result] = block;
484                    }
485                    v.insert(result);
486                    trace!(" -> inserts as new (no GVN)");
487                    None
488                }
489            }
490        }
491        // Otherwise, if a load or store, process it with the alias
492        // analysis to see if we can optimize it (rewrite in terms of
493        // an earlier load or stored value).
494        else if let Some(new_result) =
495            self.alias_analysis
496                .process_inst(self.func, self.alias_analysis_state, inst)
497        {
498            self.stats.alias_analysis_removed += 1;
499            let result = self.func.dfg.first_result(inst);
500            trace!(
501                " -> inst {} has result {} replaced with {}",
502                inst, result, new_result
503            );
504            self.value_to_opt_value[result] = new_result;
505            self.available_block[result] = self.available_block[new_result];
506            self.func.dfg.merge_facts(result, new_result);
507            Some(SkeletonInstSimplification::Remove)
508        }
509        // Otherwise, generic side-effecting op -- always keep it, and
510        // set its results to identity-map to original values.
511        else {
512            // Set all results to identity-map to themselves
513            // in the value-to-opt-value map.
514            for &result in self.func.dfg.inst_results(inst) {
515                self.value_to_opt_value[result] = result;
516                self.available_block[result] = block;
517            }
518            None
519        }
520    }
521
522    /// Find the best simplification of the given skeleton instruction, if any,
523    /// by consulting our `simplify_skeleton` ISLE rules.
524    fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> {
525        // We cannot currently simplify terminators, or simplify into
526        // terminators. Anything that could change the control-flow graph is off
527        // limits.
528        //
529        // Consider the following CLIF snippet:
530        //
531        //     block0(v0: i64):
532        //         v1 = iconst.i32 0
533        //         trapz v1, user42
534        //         v2 = load.i32 v0
535        //         brif v1, block1, block2
536        //     block1:
537        //         return v2
538        //     block2:
539        //         v3 = iconst.i32 1
540        //         v4 = iadd v2, v3
541        //         return v4
542        //
543        // We would ideally like to perform simplifications like replacing the
544        // `trapz` with an unconditional `trap` and the conditional `brif`
545        // branch with an unconditional `jump`. Note, however, that blocks
546        // `block1` and `block2` are dominated by `block0` and therefore can and
547        // do use values defined in `block0`. This presents challenges:
548        //
549        // * If we replace the `brif` with a `jump`, then we've mutated the
550        //   control-flow graph and removed that domination property. The uses
551        //   of `v2` and `v3` in those blocks become invalid.
552        //
553        // * Even worse, if we turn the `trapz` into a `trap`, we are
554        //   introducing a terminator into the middle of the block, which leaves
555        //   us with two choices to fix up the IR so that there aren't any
556        //   instructions following the terminator in the block:
557        //
558        //   1. We can split the unreachable instructions off into a new
559        //      block. However, there is no control-flow edge from the current
560        //      block to this new block and so, again, the new block isn't
561        //      dominated by the current block, and therefore the can't use
562        //      values defined in this block or any dominating it. The `load`
563        //      instruction uses `v0` but is not dominated by `v0`'s
564        //      definition.
565        //
566        //   2. Alternatively, we can simply delete the trailing instructions,
567        //      since they are unreachable. But then not only are the old
568        //      instructions' uses no longer dominated by their definitions, but
569        //      the definitions do not exist at all anymore!
570        //
571        // Whatever approach we would take, we would invalidate value uses, and
572        // would need to track and fix them up.
573        if self.func.dfg.insts[inst].opcode().is_branch() {
574            return None;
575        }
576
577        let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts);
578        let (ctx, optimized_insts) = guard.get();
579
580        crate::opts::generated_code::constructor_simplify_skeleton(
581            &mut IsleContext { ctx },
582            inst,
583            optimized_insts,
584        );
585
586        let simplifications_len = optimized_insts.len();
587        log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)");
588        if simplifications_len > MATCHES_LIMIT {
589            log::trace!("      too many candidate simplifications; truncating to {MATCHES_LIMIT}");
590            optimized_insts.truncate(MATCHES_LIMIT);
591        }
592
593        // Find the best simplification, if any, from our candidates.
594        //
595        // Unlike simplifying pure values, we do not add side-effectful
596        // instructions to the egraph, nor do we extract the best version via
597        // dynamic programming and considering the costs of operands. Instead,
598        // we greedily choose the best simplification. This is because there is
599        // an impedance mismatch: the egraph and our pure rewrites are centered
600        // around *values*, but we don't represent side-effects with values, we
601        // represent them implicitly in their *instructions*.
602        //
603        // The initial best choice is "no simplification, just use the original
604        // instruction" which has the original instruction's cost.
605        let mut best = None;
606        let mut best_cost = cost::Cost::of_skeleton_op(
607            ctx.func.dfg.insts[inst].opcode(),
608            ctx.func.dfg.inst_args(inst).len(),
609        );
610        while let Some(simplification) = optimized_insts.pop() {
611            let (new_inst, new_val) = match simplification {
612                // We can't do better than completely removing the skeleton
613                // instruction, so short-cicuit the loop and eagerly return the
614                // `Remove*` simplifications.
615                SkeletonInstSimplification::Remove => {
616                    log::trace!(" -> simplify_skeleton: remove inst");
617                    debug_assert!(ctx.func.dfg.inst_results(inst).is_empty());
618                    return Some(simplification);
619                }
620                SkeletonInstSimplification::RemoveWithVal { val } => {
621                    log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result");
622                    if cfg!(debug_assertions) {
623                        let results = ctx.func.dfg.inst_results(inst);
624                        debug_assert_eq!(results.len(), 1);
625                        debug_assert_eq!(
626                            ctx.func.dfg.value_type(results[0]),
627                            ctx.func.dfg.value_type(val),
628                        );
629                    }
630                    return Some(simplification);
631                }
632
633                // For instruction replacement simplification, we want to check
634                // that the replacements define the same number and types of
635                // values as the original instruction, and also determine
636                // whether they are actually an improvement over (i.e. have
637                // lower cost than) the original instruction.
638                SkeletonInstSimplification::Replace { inst } => {
639                    log::trace!(
640                        " -> simplify_skeleton: replace inst with {inst}: {}",
641                        ctx.func.dfg.display_inst(inst)
642                    );
643                    (inst, None)
644                }
645                SkeletonInstSimplification::ReplaceWithVal { inst, val } => {
646                    log::trace!(
647                        " -> simplify_skeleton: replace inst with {val} and {inst}: {}",
648                        ctx.func.dfg.display_inst(inst)
649                    );
650                    (inst, Some(val))
651                }
652            };
653
654            if cfg!(debug_assertions) {
655                let opcode = ctx.func.dfg.insts[inst].opcode();
656                debug_assert!(
657                    !(opcode.is_terminator() || opcode.is_branch()),
658                    "simplifying control-flow instructions and terminators is not yet supported",
659                );
660
661                let old_vals = ctx.func.dfg.inst_results(inst);
662                let new_vals = if let Some(val) = new_val.as_ref() {
663                    std::slice::from_ref(val)
664                } else {
665                    ctx.func.dfg.inst_results(new_inst)
666                };
667                debug_assert_eq!(
668                    old_vals.len(),
669                    new_vals.len(),
670                    "skeleton simplification should result in the same number of result values",
671                );
672
673                for (old_val, new_val) in old_vals.iter().zip(new_vals) {
674                    let old_ty = ctx.func.dfg.value_type(*old_val);
675                    let new_ty = ctx.func.dfg.value_type(*new_val);
676                    debug_assert_eq!(
677                        old_ty, new_ty,
678                        "skeleton simplification should result in values of the correct type",
679                    );
680                }
681            }
682
683            // Our best simplification is the one with the least cost. Update
684            // `best` if necessary.
685            let cost = cost::Cost::of_skeleton_op(
686                ctx.func.dfg.insts[new_inst].opcode(),
687                ctx.func.dfg.inst_args(new_inst).len(),
688            );
689            if cost < best_cost {
690                best = Some(simplification);
691                best_cost = cost;
692            }
693        }
694
695        // Return the best simplification!
696        best
697    }
698
699    /// Helper to propagate facts on constant values: if PCC is
700    /// enabled, then unconditionally add a fact attesting to the
701    /// Value's concrete value.
702    fn attach_constant_fact(&mut self, inst: Inst, value: Value, ty: Type) {
703        if self.flags.enable_pcc() {
704            if let InstructionData::UnaryImm {
705                opcode: Opcode::Iconst,
706                imm,
707            } = self.func.dfg.insts[inst]
708            {
709                let imm: i64 = imm.into();
710                self.func.dfg.facts[value] =
711                    Some(Fact::constant(ty.bits().try_into().unwrap(), imm as u64));
712            }
713        }
714    }
715}
716
717impl<'a> EgraphPass<'a> {
718    /// Create a new EgraphPass.
719    pub fn new(
720        func: &'a mut Function,
721        domtree: &'a DominatorTree,
722        loop_analysis: &'a LoopAnalysis,
723        alias_analysis: &'a mut AliasAnalysis<'a>,
724        flags: &'a Flags,
725        ctrl_plane: &'a mut ControlPlane,
726    ) -> Self {
727        Self {
728            func,
729            domtree,
730            loop_analysis,
731            alias_analysis,
732            flags,
733            ctrl_plane,
734            stats: Stats::default(),
735            remat_values: FxHashSet::default(),
736        }
737    }
738
739    /// Run the process.
740    pub fn run(&mut self) {
741        self.remove_pure_and_optimize();
742
743        trace!("egraph built:\n{}\n", self.func.display());
744        if cfg!(feature = "trace-log") {
745            for (value, def) in self.func.dfg.values_and_defs() {
746                trace!(" -> {} = {:?}", value, def);
747                match def {
748                    ValueDef::Result(i, 0) => {
749                        trace!("  -> {} = {:?}", i, self.func.dfg.insts[i]);
750                    }
751                    _ => {}
752                }
753            }
754        }
755
756        self.elaborate();
757
758        log::trace!("stats: {:#?}", self.stats);
759    }
760
761    /// Remove pure nodes from the `Layout` of the function, ensuring
762    /// that only the "side-effect skeleton" remains, and also
763    /// optimize the pure nodes. This is the first step of
764    /// egraph-based processing and turns the pure CFG-based CLIF into
765    /// a CFG skeleton with a sea of (optimized) nodes tying it
766    /// together.
767    ///
768    /// As we walk through the code, we eagerly apply optimization
769    /// rules; at any given point we have a "latest version" of an
770    /// eclass of possible representations for a `Value` in the
771    /// original program, which is itself a `Value` at the root of a
772    /// union-tree. We keep a map from the original values to these
773    /// optimized values. When we encounter any instruction (pure or
774    /// side-effecting skeleton) we rewrite its arguments to capture
775    /// the "latest" optimized forms of these values. (We need to do
776    /// this as part of this pass, and not later using a finished map,
777    /// because the eclass can continue to be updated and we need to
778    /// only refer to its subset that exists at this stage, to
779    /// maintain acyclicity.)
780    fn remove_pure_and_optimize(&mut self) {
781        let mut cursor = FuncCursor::new(self.func);
782        let mut value_to_opt_value: SecondaryMap<Value, Value> =
783            SecondaryMap::with_default(Value::reserved_value());
784
785        // Map from instruction to value for hash-consing of pure ops
786        // into the egraph. This can be a standard (non-scoped)
787        // hashmap because pure ops have no location: they are
788        // "outside of" control flow.
789        //
790        // Note also that we keep the controlling typevar (the `Type`
791        // in the tuple below) because it may disambiguate
792        // instructions that are identical except for type.
793        //
794        // We store both skeleton and non-skeleton instructions in the
795        // GVN map; for skeleton instructions, we only store those
796        // that are idempotent, i.e., still eligible to GVN. Note that
797        // some skeleton instructions are idempotent but do not
798        // produce a value: e.g., traps on a given condition. To allow
799        // for both cases, we store an `Option<Value>` as the value in
800        // this map.
801        let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> =
802            ScopedHashMap::with_capacity(cursor.func.dfg.num_values());
803
804        // The block in the domtree preorder traversal at each level
805        // of the GVN map.
806        let mut gvn_map_blocks: Vec<Block> = vec![];
807
808        // To get the best possible merging and canonicalization, we
809        // track where a value is "available" at: this is the
810        // domtree-nearest-ancestor join of all args if the value
811        // itself is pure, otherwise the block where the value is
812        // defined. (And for union nodes, the
813        // domtree-highest-ancestor, i.e., the meet or the dual to the
814        // above join.)
815        let mut available_block: SecondaryMap<Value, Block> =
816            SecondaryMap::with_default(Block::reserved_value());
817
818        // To avoid blowing up eclasses too much, we track the size of
819        // each eclass reachable by a tree of union nodes from a given
820        // value ID, and we avoid union'ing additional values into an
821        // eclass when it reaches `ECLASS_ENODE_LIMIT`.
822        //
823        // For efficiency, this encodes size minus one: so a value of
824        // zero (which is cheap to bulk-initialize) means a singleton
825        // eclass of size one. This also allows us to avoid explicitly
826        // writing the size for any values that are not union nodes.
827        let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0);
828
829        // This is an initial guess at the size we'll need, but we add
830        // more values as we build simplified alternative expressions so
831        // this is likely to realloc again later.
832        available_block.resize(cursor.func.dfg.num_values());
833
834        // In domtree preorder, visit blocks. (TODO: factor out an
835        // iterator from this and elaborator.)
836        let root = cursor.layout().entry_block().unwrap();
837        enum StackEntry {
838            Visit(Block),
839            Pop,
840        }
841        let mut block_stack = vec![StackEntry::Visit(root)];
842        while let Some(entry) = block_stack.pop() {
843            match entry {
844                StackEntry::Visit(block) => {
845                    // We popped this block; push children
846                    // immediately, then process this block.
847                    block_stack.push(StackEntry::Pop);
848                    block_stack.extend(
849                        self.ctrl_plane
850                            .shuffled(self.domtree.children(block))
851                            .map(StackEntry::Visit),
852                    );
853                    gvn_map.increment_depth();
854                    gvn_map_blocks.push(block);
855
856                    trace!("Processing block {}", block);
857                    cursor.set_position(CursorPosition::Before(block));
858
859                    let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
860
861                    for &param in cursor.func.dfg.block_params(block) {
862                        trace!("creating initial singleton eclass for blockparam {}", param);
863                        value_to_opt_value[param] = param;
864                        available_block[param] = block;
865                    }
866                    while let Some(inst) = cursor.next_inst() {
867                        trace!(
868                            "Processing inst {inst}: {}",
869                            cursor.func.dfg.display_inst(inst),
870                        );
871
872                        // Rewrite args of *all* instructions using the
873                        // value-to-opt-value map.
874                        cursor.func.dfg.map_inst_values(inst, |arg| {
875                            let new_value = value_to_opt_value[arg];
876                            trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
877                            debug_assert_ne!(new_value, Value::reserved_value());
878                            new_value
879                        });
880
881                        // Build a context for optimization, with borrows of
882                        // state. We can't invoke a method on `self` because
883                        // we've borrowed `self.func` mutably (as
884                        // `cursor.func`) so we pull apart the pieces instead
885                        // here.
886                        let mut ctx = OptimizeCtx {
887                            func: cursor.func,
888                            value_to_opt_value: &mut value_to_opt_value,
889                            gvn_map: &mut gvn_map,
890                            gvn_map_blocks: &mut gvn_map_blocks,
891                            available_block: &mut available_block,
892                            eclass_size: &mut eclass_size,
893                            rewrite_depth: 0,
894                            subsume_values: FxHashSet::default(),
895                            remat_values: &mut self.remat_values,
896                            stats: &mut self.stats,
897                            domtree: &self.domtree,
898                            alias_analysis: self.alias_analysis,
899                            alias_analysis_state: &mut alias_analysis_state,
900                            flags: self.flags,
901                            ctrl_plane: self.ctrl_plane,
902                            optimized_values: Default::default(),
903                            optimized_insts: Default::default(),
904                        };
905
906                        if is_pure_for_egraph(ctx.func, inst) {
907                            // Insert into GVN map and optimize any new nodes
908                            // inserted (recursively performing this work for
909                            // any nodes the optimization rules produce).
910                            let inst = NewOrExistingInst::Existing(inst);
911                            ctx.insert_pure_enode(inst);
912                            // We've now rewritten all uses, or will when we
913                            // see them, and the instruction exists as a pure
914                            // enode in the eclass, so we can remove it.
915                            cursor.remove_inst_and_step_back();
916                        } else {
917                            if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) {
918                                Self::execute_skeleton_inst_simplification(
919                                    cmd,
920                                    &mut cursor,
921                                    &mut value_to_opt_value,
922                                    inst,
923                                );
924                            }
925                        }
926                    }
927                }
928                StackEntry::Pop => {
929                    gvn_map.decrement_depth();
930                    gvn_map_blocks.pop();
931                }
932            }
933        }
934    }
935
936    /// Execute a simplification of an instruction in the side-effectful
937    /// skeleton.
938    fn execute_skeleton_inst_simplification(
939        simplification: SkeletonInstSimplification,
940        cursor: &mut FuncCursor,
941        value_to_opt_value: &mut SecondaryMap<Value, Value>,
942        old_inst: Inst,
943    ) {
944        let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| {
945            cursor.func.dfg.change_to_alias(old_val, new_val);
946            value_to_opt_value[old_val] = new_val;
947        };
948
949        let (new_inst, new_val) = match simplification {
950            SkeletonInstSimplification::Remove => {
951                cursor.remove_inst_and_step_back();
952                return;
953            }
954            SkeletonInstSimplification::RemoveWithVal { val } => {
955                cursor.remove_inst_and_step_back();
956                let old_val = cursor.func.dfg.first_result(old_inst);
957                cursor.func.dfg.detach_inst_results(old_inst);
958                forward_val(cursor, old_val, val);
959                return;
960            }
961            SkeletonInstSimplification::Replace { inst } => (inst, None),
962            SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)),
963        };
964
965        // Replace the old instruction with the new one.
966        cursor.replace_inst(new_inst);
967        debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator());
968
969        // Redirect the old instruction's result values to our new instruction's
970        // result values.
971        let mut i = 0;
972        let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value {
973            if let Some(val) = new_val {
974                val
975            } else {
976                let val = dfg.inst_results(new_inst)[i];
977                i += 1;
978                val
979            }
980        };
981        for i in 0..cursor.func.dfg.inst_results(old_inst).len() {
982            let old_val = cursor.func.dfg.inst_results(old_inst)[i];
983            let new_val = next_new_val(&cursor.func.dfg);
984            forward_val(cursor, old_val, new_val);
985        }
986
987        // Back up so that the next iteration of the outer egraph loop will
988        // process the new instruction.
989        cursor.goto_inst(new_inst);
990        cursor.prev_inst();
991    }
992
993    /// Scoped elaboration: compute a final ordering of op computation
994    /// for each block and update the given Func body. After this
995    /// runs, the function body is back into the state where every
996    /// Inst with an used result is placed in the layout (possibly
997    /// duplicated, if our code-motion logic decides this is the best
998    /// option).
999    ///
1000    /// This works in concert with the domtree. We do a preorder
1001    /// traversal of the domtree, tracking a scoped map from Id to
1002    /// (new) Value. The map's scopes correspond to levels in the
1003    /// domtree.
1004    ///
1005    /// At each block, we iterate forward over the side-effecting
1006    /// eclasses, and recursively generate their arg eclasses, then
1007    /// emit the ops themselves.
1008    ///
1009    /// To use an eclass in a given block, we first look it up in the
1010    /// scoped map, and get the Value if already present. If not, we
1011    /// need to generate it. We emit the extracted enode for this
1012    /// eclass after recursively generating its args. Eclasses are
1013    /// thus computed "as late as possible", but then memoized into
1014    /// the Id-to-Value map and available to all dominated blocks and
1015    /// for the rest of this block. (This subsumes GVN.)
1016    fn elaborate(&mut self) {
1017        let mut elaborator = Elaborator::new(
1018            self.func,
1019            &self.domtree,
1020            self.loop_analysis,
1021            &self.remat_values,
1022            &mut self.stats,
1023            self.ctrl_plane,
1024        );
1025        elaborator.elaborate();
1026
1027        self.check_post_egraph();
1028    }
1029
1030    #[cfg(debug_assertions)]
1031    fn check_post_egraph(&self) {
1032        // Verify that no union nodes are reachable from inst args,
1033        // and that all inst args' defining instructions are in the
1034        // layout.
1035        for block in self.func.layout.blocks() {
1036            for inst in self.func.layout.block_insts(block) {
1037                self.func
1038                    .dfg
1039                    .inst_values(inst)
1040                    .for_each(|arg| match self.func.dfg.value_def(arg) {
1041                        ValueDef::Result(i, _) => {
1042                            debug_assert!(self.func.layout.inst_block(i).is_some());
1043                        }
1044                        ValueDef::Union(..) => {
1045                            panic!("egraph union node {arg} still reachable at {inst}!");
1046                        }
1047                        _ => {}
1048                    })
1049            }
1050        }
1051    }
1052
1053    #[cfg(not(debug_assertions))]
1054    fn check_post_egraph(&self) {}
1055}
1056
1057/// Implementation of external-context equality and hashing on
1058/// InstructionData. This allows us to deduplicate instructions given
1059/// some context that lets us see its value lists, so we don't need to
1060/// store arguments inline in the `InstuctionData` (or alongside it in
1061/// some newly-defined key type) in all cases.
1062struct GVNContext<'a> {
1063    value_lists: &'a ValueListPool,
1064}
1065
1066impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
1067    fn ctx_eq(
1068        &self,
1069        (a_ty, a_inst): &(Type, InstructionData),
1070        (b_ty, b_inst): &(Type, InstructionData),
1071    ) -> bool {
1072        a_ty == b_ty && a_inst.eq(b_inst, self.value_lists)
1073    }
1074}
1075
1076impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
1077    fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
1078        std::hash::Hash::hash(&ty, state);
1079        inst.hash(state, self.value_lists);
1080    }
1081}
1082
1083/// Statistics collected during egraph-based processing.
1084#[derive(Clone, Debug, Default)]
1085pub(crate) struct Stats {
1086    pub(crate) pure_inst: u64,
1087    pub(crate) pure_inst_deduped: u64,
1088    pub(crate) pure_inst_subsume: u64,
1089    pub(crate) pure_inst_rewrite_to_self: u64,
1090    pub(crate) pure_inst_insert_orig: u64,
1091    pub(crate) pure_inst_insert_new: u64,
1092    pub(crate) skeleton_inst: u64,
1093    pub(crate) skeleton_inst_simplified: u64,
1094    pub(crate) skeleton_inst_gvn: u64,
1095    pub(crate) alias_analysis_removed: u64,
1096    pub(crate) new_inst: u64,
1097    pub(crate) union: u64,
1098    pub(crate) subsume: u64,
1099    pub(crate) remat: u64,
1100    pub(crate) rewrite_rule_invoked: u64,
1101    pub(crate) rewrite_rule_results: u64,
1102    pub(crate) rewrite_depth_limit: u64,
1103    pub(crate) elaborate_visit_node: u64,
1104    pub(crate) elaborate_memoize_hit: u64,
1105    pub(crate) elaborate_memoize_miss: u64,
1106    pub(crate) elaborate_remat: u64,
1107    pub(crate) elaborate_licm_hoist: u64,
1108    pub(crate) elaborate_func: u64,
1109    pub(crate) elaborate_func_pre_insts: u64,
1110    pub(crate) elaborate_func_post_insts: u64,
1111    pub(crate) elaborate_best_cost_fixpoint_iters: u64,
1112    pub(crate) eclass_size_limit: u64,
1113}