// cranelift_codegen/egraph.rs
1//! Support for egraphs represented in the DataFlowGraph.
2
3use crate::alias_analysis::{AliasAnalysis, LastStores};
4use crate::ctxhash::{CtxEq, CtxHash, NullCtx};
5use crate::cursor::{Cursor, CursorPosition, FuncCursor};
6use crate::dominator_tree::{DominatorTree, DominatorTreePreorder};
7use crate::egraph::elaborate::Elaborator;
8use crate::inst_predicates::{is_mergeable_for_egraph, is_pure_for_egraph};
9use crate::ir::pcc::Fact;
10use crate::ir::{
11 Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value, ValueDef,
12 ValueListPool,
13};
14use crate::loop_analysis::LoopAnalysis;
15use crate::opts::IsleContext;
16use crate::opts::generated_code::SkeletonInstSimplification;
17use crate::scoped_hash_map::{Entry as ScopedEntry, ScopedHashMap};
18use crate::settings::Flags;
19use crate::take_and_replace::TakeAndReplace;
20use crate::trace;
21use alloc::vec::Vec;
22use core::cmp::Ordering;
23use core::hash::Hasher;
24use cranelift_control::ControlPlane;
25use cranelift_entity::SecondaryMap;
26use cranelift_entity::packed_option::ReservedValue;
27use rustc_hash::FxHashSet;
28use smallvec::SmallVec;
29
30mod cost;
31mod elaborate;
32
/// Pass over a Function that does the whole aegraph thing.
///
/// - Removes non-skeleton nodes from the Layout.
/// - Performs a GVN-and-rule-application pass over all Values
///   reachable from the skeleton, potentially creating new Union
///   nodes (i.e., an aegraph) so that some values have multiple
///   representations.
/// - Does "extraction" on the aegraph: selects the best value out of
///   the tree-of-Union nodes for each used value.
/// - Does "scoped elaboration" on the aegraph: chooses one or more
///   locations for pure nodes to become instructions again in the
///   layout, as forced by the skeleton.
///
/// At the beginning and end of this pass, the CLIF should be in a
/// state that passes the verifier and, additionally, has no Union
/// nodes. During the pass, Union nodes may exist, and instructions in
/// the layout may refer to results of instructions that are not
/// placed in the layout.
pub struct EgraphPass<'a> {
    /// The function we're operating on.
    func: &'a mut Function,
    /// Dominator tree for the CFG, used to visit blocks in pre-order
    /// so we see value definitions before their uses, and also used for
    /// O(1) dominance checks.
    domtree: DominatorTreePreorder,
    /// Alias analysis, used during optimization.
    alias_analysis: &'a mut AliasAnalysis<'a>,
    /// Loop analysis results, used for built-in LICM during
    /// elaboration.
    loop_analysis: &'a LoopAnalysis,
    /// Compiler flags.
    flags: &'a Flags,
    /// Chaos-mode control-plane so we can test that we still get
    /// correct results when our heuristics make bad decisions.
    ctrl_plane: &'a mut ControlPlane,
    /// Which Values do we want to rematerialize in each block where
    /// they're used?
    remat_values: FxHashSet<Value>,
    /// Stats collected while we run this pass.
    pub(crate) stats: Stats,
}
74
/// The maximum number of rewrites we will take from a single call into ISLE.
/// Any additional matches beyond this count are truncated and ignored.
const MATCHES_LIMIT: usize = 5;

/// The maximum number of enodes in any given eclass. Once an eclass
/// reaches this size, no further values are union'd into it.
const ECLASS_ENODE_LIMIT: usize = 5;
80
/// Context passed through node insertion and optimization.
pub(crate) struct OptimizeCtx<'opt, 'analysis>
where
    'analysis: 'opt,
{
    // Borrowed from EgraphPass:
    /// The function being optimized.
    pub(crate) func: &'opt mut Function,
    /// Map from each original `Value` to the optimized `Value` (the
    /// root of a union-tree) that replaces it.
    pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
    /// The block where each value becomes available: the
    /// domtree-nearest-ancestor join of its args for pure values,
    /// otherwise the defining block.
    available_block: &'opt mut SecondaryMap<Value, Block>,
    /// Size *minus one* of the eclass reachable through union nodes
    /// from each value; used to enforce `ECLASS_ENODE_LIMIT`.
    eclass_size: &'opt mut SecondaryMap<Value, u8>,
    /// Scoped hash-cons (GVN) map from (controlling typevar,
    /// instruction data) to the optional result value. The value is
    /// `None` for idempotent skeleton instructions that produce no
    /// result (e.g. conditional traps).
    pub(crate) gvn_map: &'opt mut ScopedHashMap<(Type, InstructionData), Option<Value>>,
    /// The block pushed at each scope level of `gvn_map` during the
    /// domtree preorder traversal.
    pub(crate) gvn_map_blocks: &'opt Vec<Block>,
    /// Values to rematerialize in each block where they're used.
    pub(crate) remat_values: &'opt mut FxHashSet<Value>,
    /// Stats collected while we run this pass.
    pub(crate) stats: &'opt mut Stats,
    /// Domtree preorder, for O(1) dominance checks.
    domtree: &'opt DominatorTreePreorder,
    /// Alias analysis, consulted when processing skeleton loads/stores.
    pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
    /// Alias-analysis state (last stores) at the current program point.
    pub(crate) alias_analysis_state: &'opt mut LastStores,
    /// Compiler flags.
    flags: &'opt Flags,
    /// Chaos-mode control-plane.
    ctrl_plane: &'opt mut ControlPlane,
    // Held locally during optimization of one node (recursively):
    /// Current ISLE rewrite recursion depth (bounded by `REWRITE_LIMIT`).
    pub(crate) rewrite_depth: usize,
    /// Values marked "subsume" by rewrite rules during the current
    /// toplevel rewrite; cleared when `rewrite_depth` returns to zero.
    pub(crate) subsume_values: FxHashSet<Value>,
    /// Scratch buffer for values returned from ISLE's `simplify`.
    optimized_values: SmallVec<[Value; MATCHES_LIMIT]>,
    /// Scratch buffer for commands returned from ISLE's
    /// `simplify_skeleton`.
    optimized_insts: SmallVec<[SkeletonInstSimplification; MATCHES_LIMIT]>,
}
106
/// For passing to `insert_pure_enode`. Sometimes the enode already
/// exists as an Inst (from the original CLIF), and sometimes we're in
/// the middle of creating it and want to avoid inserting it if
/// possible until we know we need it.
pub(crate) enum NewOrExistingInst {
    /// A not-yet-inserted instruction: its data plus its controlling
    /// typevar.
    New(InstructionData, Type),
    /// An instruction already present in the DFG.
    Existing(Inst),
}
115
116impl NewOrExistingInst {
117 fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
118 match self {
119 NewOrExistingInst::New(data, ty) => (*ty, *data),
120 NewOrExistingInst::Existing(inst) => {
121 let ty = dfg.ctrl_typevar(*inst);
122 (ty, dfg.insts[*inst])
123 }
124 }
125 }
126}
127
128impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
129where
130 'analysis: 'opt,
131{
    /// Optimization of a single instruction.
    ///
    /// This does a few things:
    /// - Looks up the instruction in the GVN deduplication map. If we
    ///   already have the same instruction somewhere else, with the
    ///   same args, then we can alias the original instruction's
    ///   results and omit this instruction entirely.
    /// - If the instruction is "new" (not deduplicated), then apply
    ///   optimization rules:
    ///   - All of the mid-end rules written in ISLE.
    ///   - Store-to-load forwarding.
    /// - Update the value-to-opt-value map, and update the eclass
    ///   union-find, if we rewrote the value to different form(s).
    ///
    /// Returns the optimized value (the root of the union-tree for
    /// the eclass) that callers should use in place of the original
    /// instruction's single result.
    pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
        // Create the external context for looking up and updating the
        // GVN map. This is necessary so that instructions themselves
        // do not have to carry all the references or data for a full
        // `Eq` or `Hash` impl.
        let gvn_context = GVNContext {
            value_lists: &self.func.dfg.value_lists,
        };

        self.stats.pure_inst += 1;
        if let NewOrExistingInst::New(..) = inst {
            self.stats.new_inst += 1;
        }

        // Does this instruction already exist? If so, add entries to
        // the value-map to rewrite uses of its results to the results
        // of the original (existing) instruction. If not, optimize
        // the new instruction.
        if let Some(&Some(orig_result)) = self
            .gvn_map
            .get(&gvn_context, &inst.get_inst_key(&self.func.dfg))
        {
            self.stats.pure_inst_deduped += 1;
            // Only an already-inserted instruction has results to
            // redirect; a `New` enode was never materialized, so
            // returning `orig_result` suffices.
            if let NewOrExistingInst::Existing(inst) = inst {
                debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
                let result = self.func.dfg.first_result(inst);
                self.value_to_opt_value[result] = orig_result;
                self.available_block[result] = self.available_block[orig_result];
                self.func.dfg.merge_facts(result, orig_result);
            }
            orig_result
        } else {
            // Now actually insert the InstructionData and attach
            // result value (exactly one).
            let (inst, result, ty) = match inst {
                NewOrExistingInst::New(data, typevar) => {
                    self.stats.pure_inst_insert_new += 1;
                    let inst = self.func.dfg.make_inst(data);
                    // TODO: reuse return value?
                    self.func.dfg.make_inst_results(inst, typevar);
                    let result = self.func.dfg.first_result(inst);
                    // New inst. We need to do the analysis of its result.
                    (inst, result, typevar)
                }
                NewOrExistingInst::Existing(inst) => {
                    self.stats.pure_inst_insert_orig += 1;
                    let result = self.func.dfg.first_result(inst);
                    let ty = self.func.dfg.ctrl_typevar(inst);
                    (inst, result, ty)
                }
            };

            // If PCC is enabled and this is an `iconst`, attach a
            // fact describing the concrete constant value.
            self.attach_constant_fact(inst, result, ty);

            self.available_block[result] = self.get_available_block(inst);
            let opt_value = self.optimize_pure_enode(inst);
            log::trace!("optimizing inst {inst} orig result {result} gave {opt_value}");

            // Re-borrow the value lists: `optimize_pure_enode` above
            // needed `&mut self`, invalidating the earlier context.
            let gvn_context = GVNContext {
                value_lists: &self.func.dfg.value_lists,
            };
            // Insert at level implied by args. This enables merging
            // in LICM cases like:
            //
            //     while (...) {
            //         if (...) {
            //             let x = loop_invariant_expr;
            //         }
            //         if (...) {
            //             let x = loop_invariant_expr;
            //         }
            //     }
            //
            // where the two instances of the expression otherwise
            // wouldn't merge because each would be in a separate
            // subscope of the scoped hashmap during traversal.
            log::trace!(
                "value {} is available at {}",
                opt_value,
                self.available_block[opt_value]
            );
            let depth = self.depth_of_block_in_gvn_map(self.available_block[opt_value]);
            self.gvn_map.insert_with_depth(
                &gvn_context,
                (ty, self.func.dfg.insts[inst]),
                Some(opt_value),
                depth,
            );
            self.value_to_opt_value[result] = opt_value;
            opt_value
        }
    }
237
238 /// Find the block where a pure instruction first becomes available,
239 /// defined as the block that is closest to the root where all of
240 /// its arguments are available. In the unusual case where a pure
241 /// instruction has no arguments (e.g. get_return_address), we can
242 /// place it anywhere, so it is available in the entry block.
243 ///
244 /// This function does not compute available blocks recursively.
245 /// All of the instruction's arguments must have had their available
246 /// blocks assigned already.
247 fn get_available_block(&self, inst: Inst) -> Block {
248 // Side-effecting instructions have different rules for where
249 // they become available, so this function does not apply.
250 debug_assert!(is_pure_for_egraph(self.func, inst));
251
252 // Note that the def-point of all arguments to an instruction
253 // in SSA lie on a line of direct ancestors in the domtree, and
254 // so do their available-blocks. This means that for any pair of
255 // arguments, their available blocks are either the same or one
256 // strictly dominates the other. We just need to find any argument
257 // whose available block is deepest in the domtree.
258 self.func.dfg.insts[inst]
259 .arguments(&self.func.dfg.value_lists)
260 .iter()
261 .map(|&v| {
262 let block = self.available_block[v];
263 debug_assert!(!block.is_reserved_value());
264 block
265 })
266 .max_by(|&x, &y| {
267 if self.domtree.dominates(x, y) {
268 Ordering::Less
269 } else {
270 debug_assert!(self.domtree.dominates(y, x));
271 Ordering::Greater
272 }
273 })
274 .unwrap_or(self.func.layout.entry_block().unwrap())
275 }
276
277 fn depth_of_block_in_gvn_map(&self, block: Block) -> usize {
278 log::trace!(
279 "finding depth of available block {} in domtree stack: {:?}",
280 block,
281 self.gvn_map_blocks
282 );
283 self.gvn_map_blocks
284 .iter()
285 .enumerate()
286 .rev()
287 .find(|&(_, b)| *b == block)
288 .unwrap()
289 .0
290 }
291
    /// Optimizes an enode by applying any matching mid-end rewrite
    /// rules (or store-to-load forwarding, which is a special case),
    /// unioning together all possible optimized (or rewritten) forms
    /// of this expression into an eclass and returning the `Value`
    /// that represents that eclass.
    fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
        // A pure node always has exactly one result.
        let orig_value = self.func.dfg.first_result(inst);

        // Temporarily take the scratch buffer out of `self` (restored
        // when `guard` drops) so we can pass `self` as `ctx` into
        // ISLE while also filling the buffer.
        let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_values);
        let (ctx, optimized_values) = guard.get();

        // Limit rewrite depth. When we apply optimization rules, they
        // may create new nodes (values) and those are, recursively,
        // optimized eagerly as soon as they are created. So we may
        // have more than one ISLE invocation on the stack. (This is
        // necessary so that as the toplevel builds the
        // right-hand-side expression bottom-up, it uses the "latest"
        // optimized values for all the constituent parts.) To avoid
        // infinite or problematic recursion, we bound the rewrite
        // depth to a small constant here.
        const REWRITE_LIMIT: usize = 5;
        if ctx.rewrite_depth > REWRITE_LIMIT {
            ctx.stats.rewrite_depth_limit += 1;
            return orig_value;
        }
        ctx.rewrite_depth += 1;
        trace!("Incrementing rewrite depth; now {}", ctx.rewrite_depth);

        // Invoke the ISLE toplevel constructor, getting all new
        // values produced as equivalents to this value.
        trace!("Calling into ISLE with original value {}", orig_value);
        ctx.stats.rewrite_rule_invoked += 1;
        debug_assert!(optimized_values.is_empty());
        crate::opts::generated_code::constructor_simplify(
            &mut IsleContext { ctx },
            orig_value,
            optimized_values,
        );

        ctx.stats.rewrite_rule_results += optimized_values.len() as u64;

        // It's not supposed to matter what order `simplify` returns values in.
        ctx.ctrl_plane.shuffle(optimized_values);

        let num_matches = optimized_values.len();
        if num_matches > MATCHES_LIMIT {
            trace!(
                "Reached maximum matches limit; too many optimized values \
                 ({num_matches} > {MATCHES_LIMIT}); ignoring rest.",
            );
            optimized_values.truncate(MATCHES_LIMIT);
        }

        // Sort and deduplicate optimized values, in case multiple
        // rules produced the same simplification.
        optimized_values.sort_unstable();
        optimized_values.dedup();

        trace!(" -> returned from ISLE: {orig_value} -> {optimized_values:?}");

        // Construct a union-node tree representing the new eclass
        // that results from rewriting. If any returned value was
        // marked "subsume", take only that value. Otherwise,
        // sequentially build the chain over the original value and
        // all returned values.
        let result_value = if let Some(&subsuming_value) = optimized_values
            .iter()
            .find(|&value| ctx.subsume_values.contains(value))
        {
            optimized_values.clear();
            ctx.stats.pure_inst_subsume += 1;
            subsuming_value
        } else {
            let mut union_value = orig_value;
            // `eclass_size` stores size-minus-one, hence the `+ 1`
            // here and the `- 1` when storing below.
            let mut eclass_size = ctx.eclass_size[orig_value] + 1;
            for optimized_value in optimized_values.drain(..) {
                trace!(
                    "Returned from ISLE for {}, got {:?}",
                    orig_value, optimized_value
                );
                if optimized_value == orig_value {
                    trace!(" -> same as orig value; skipping");
                    ctx.stats.pure_inst_rewrite_to_self += 1;
                    continue;
                }
                let rhs_eclass_size = ctx.eclass_size[optimized_value] + 1;
                // Compare in `usize` to avoid `u8` overflow.
                if usize::from(eclass_size) + usize::from(rhs_eclass_size) > ECLASS_ENODE_LIMIT {
                    trace!(" -> reached eclass size limit");
                    ctx.stats.eclass_size_limit += 1;
                    break;
                }
                let old_union_value = union_value;
                union_value = ctx.func.dfg.union(old_union_value, optimized_value);
                eclass_size += rhs_eclass_size;
                ctx.eclass_size[union_value] = eclass_size - 1;
                ctx.stats.union += 1;
                trace!(" -> union: now {}", union_value);
                ctx.func.dfg.merge_facts(old_union_value, optimized_value);
                // A union is available where both inputs are: the
                // domtree-higher of the two availability points.
                ctx.available_block[union_value] =
                    ctx.merge_availability(old_union_value, optimized_value);
            }
            union_value
        };

        ctx.rewrite_depth -= 1;
        trace!("Decrementing rewrite depth; now {}", ctx.rewrite_depth);
        // Subsume markings only apply within one toplevel rewrite.
        if ctx.rewrite_depth == 0 {
            ctx.subsume_values.clear();
        }

        debug_assert!(ctx.optimized_values.is_empty());
        result_value
    }
406
407 fn merge_availability(&self, a: Value, b: Value) -> Block {
408 let a = self.available_block[a];
409 let b = self.available_block[b];
410 if self.domtree.dominates(a, b) { a } else { b }
411 }
412
    /// Optimize a "skeleton" instruction.
    ///
    /// Returns an optional command of how to continue processing the optimized
    /// instruction (e.g. removing it or replacing it with a new instruction).
    fn optimize_skeleton_inst(
        &mut self,
        inst: Inst,
        block: Block,
    ) -> Option<SkeletonInstSimplification> {
        self.stats.skeleton_inst += 1;

        // If we have a rewrite rule for this instruction, do that first, so
        // that GVN and alias analysis only see simplified skeleton
        // instructions.
        if let Some(cmd) = self.simplify_skeleton_inst(inst) {
            self.stats.skeleton_inst_simplified += 1;
            return Some(cmd);
        }

        // First, can we try to deduplicate? We need to keep some copy
        // of the instruction around because it's side-effecting, but
        // we may be able to reuse an earlier instance of it.
        if is_mergeable_for_egraph(self.func, inst) {
            // Mergeable skeleton ops have at most one result; `None`
            // covers ops that exist only for their side effect.
            let result = self.func.dfg.inst_results(inst).get(0).copied();
            trace!(" -> mergeable side-effecting op {}", inst);

            // Does this instruction already exist? If so, add entries to
            // the value-map to rewrite uses of its results to the results
            // of the original (existing) instruction. If not, optimize
            // the new instruction.
            //
            // Note that the GVN map is scoped, which is important
            // here: because effectful ops are not removed from the
            // skeleton (`Layout`), we need to be mindful of whether
            // our current position is dominated by an instance of the
            // instruction. (See #5796 for details.)
            let ty = self.func.dfg.ctrl_typevar(inst);
            match self
                .gvn_map
                .entry(&NullCtx, (ty, self.func.dfg.insts[inst]))
            {
                ScopedEntry::Occupied(o) => {
                    let orig_result = *o.get();
                    match (result, orig_result) {
                        (Some(result), Some(orig_result)) => {
                            // Hit in GVN map -- reuse value.
                            self.stats.skeleton_inst_gvn += 1;
                            self.value_to_opt_value[result] = orig_result;
                            self.available_block[result] = self.available_block[orig_result];
                            trace!(" -> merges result {} to {}", result, orig_result);
                        }
                        (None, None) => {
                            // Hit in the GVN map, but the instruction doesn't
                            // produce results, only side effects. Nothing else
                            // to do here.
                            self.stats.skeleton_inst_gvn += 1;
                            trace!(" -> merges with dominating instruction");
                        }
                        // The same (type, inst-data) key must agree on
                        // whether it produces a result, so a mixed
                        // Some/None pairing cannot occur.
                        (_, _) => unreachable!(),
                    }
                    Some(SkeletonInstSimplification::Remove)
                }
                ScopedEntry::Vacant(v) => {
                    // Otherwise, insert it into the value-map.
                    if let Some(result) = result {
                        self.value_to_opt_value[result] = result;
                        self.available_block[result] = block;
                    }
                    v.insert(result);
                    trace!(" -> inserts as new (no GVN)");
                    None
                }
            }
        }
        // Otherwise, if a load or store, process it with the alias
        // analysis to see if we can optimize it (rewrite in terms of
        // an earlier load or stored value).
        else if let Some(new_result) =
            self.alias_analysis
                .process_inst(self.func, self.alias_analysis_state, inst)
        {
            self.stats.alias_analysis_removed += 1;
            let result = self.func.dfg.first_result(inst);
            trace!(
                " -> inst {} has result {} replaced with {}",
                inst, result, new_result
            );
            self.value_to_opt_value[result] = new_result;
            self.available_block[result] = self.available_block[new_result];
            self.func.dfg.merge_facts(result, new_result);
            Some(SkeletonInstSimplification::Remove)
        }
        // Otherwise, generic side-effecting op -- always keep it, and
        // set its results to identity-map to original values.
        else {
            // Set all results to identity-map to themselves
            // in the value-to-opt-value map.
            for &result in self.func.dfg.inst_results(inst) {
                self.value_to_opt_value[result] = result;
                self.available_block[result] = block;
            }
            None
        }
    }
517
    /// Find the best simplification of the given skeleton instruction, if any,
    /// by consulting our `simplify_skeleton` ISLE rules.
    fn simplify_skeleton_inst(&mut self, inst: Inst) -> Option<SkeletonInstSimplification> {
        // We cannot currently simplify terminators, or simplify into
        // terminators. Anything that could change the control-flow graph is off
        // limits.
        //
        // Consider the following CLIF snippet:
        //
        //     block0(v0: i64):
        //         v1 = iconst.i32 0
        //         trapz v1, user42
        //         v2 = load.i32 v0
        //         brif v1, block1, block2
        //     block1:
        //         return v2
        //     block2:
        //         v3 = iconst.i32 1
        //         v4 = iadd v2, v3
        //         return v4
        //
        // We would ideally like to perform simplifications like replacing the
        // `trapz` with an unconditional `trap` and the conditional `brif`
        // branch with an unconditional `jump`. Note, however, that blocks
        // `block1` and `block2` are dominated by `block0` and therefore can and
        // do use values defined in `block0`. This presents challenges:
        //
        // * If we replace the `brif` with a `jump`, then we've mutated the
        //   control-flow graph and removed that domination property. The uses
        //   of `v2` and `v3` in those blocks become invalid.
        //
        // * Even worse, if we turn the `trapz` into a `trap`, we are
        //   introducing a terminator into the middle of the block, which leaves
        //   us with two choices to fix up the IR so that there aren't any
        //   instructions following the terminator in the block:
        //
        //   1. We can split the unreachable instructions off into a new
        //      block. However, there is no control-flow edge from the current
        //      block to this new block and so, again, the new block isn't
        //      dominated by the current block, and therefore they can't use
        //      values defined in this block or any dominating it. The `load`
        //      instruction uses `v0` but is not dominated by `v0`'s
        //      definition.
        //
        //   2. Alternatively, we can simply delete the trailing instructions,
        //      since they are unreachable. But then not only are the old
        //      instructions' uses no longer dominated by their definitions, but
        //      the definitions do not exist at all anymore!
        //
        // Whatever approach we would take, we would invalidate value uses, and
        // would need to track and fix them up.
        if self.func.dfg.insts[inst].opcode().is_branch() {
            return None;
        }

        // Temporarily take the scratch buffer out of `self` (restored
        // when `guard` drops) so we can pass `self` as `ctx` into ISLE
        // while also filling the buffer.
        let mut guard = TakeAndReplace::new(self, |x| &mut x.optimized_insts);
        let (ctx, optimized_insts) = guard.get();

        crate::opts::generated_code::constructor_simplify_skeleton(
            &mut IsleContext { ctx },
            inst,
            optimized_insts,
        );

        let simplifications_len = optimized_insts.len();
        log::trace!(" -> simplify_skeleton: yielded {simplifications_len} simplification(s)");
        if simplifications_len > MATCHES_LIMIT {
            log::trace!(" too many candidate simplifications; truncating to {MATCHES_LIMIT}");
            optimized_insts.truncate(MATCHES_LIMIT);
        }

        // Find the best simplification, if any, from our candidates.
        //
        // Unlike simplifying pure values, we do not add side-effectful
        // instructions to the egraph, nor do we extract the best version via
        // dynamic programming and considering the costs of operands. Instead,
        // we greedily choose the best simplification. This is because there is
        // an impedance mismatch: the egraph and our pure rewrites are centered
        // around *values*, but we don't represent side-effects with values, we
        // represent them implicitly in their *instructions*.
        //
        // The initial best choice is "no simplification, just use the original
        // instruction" which has the original instruction's cost.
        let mut best = None;
        let mut best_cost = cost::Cost::of_skeleton_op(
            ctx.func.dfg.insts[inst].opcode(),
            ctx.func.dfg.inst_args(inst).len(),
        );
        while let Some(simplification) = optimized_insts.pop() {
            let (new_inst, new_val) = match simplification {
                // We can't do better than completely removing the skeleton
                // instruction, so short-circuit the loop and eagerly return the
                // `Remove*` simplifications.
                SkeletonInstSimplification::Remove => {
                    log::trace!(" -> simplify_skeleton: remove inst");
                    debug_assert!(ctx.func.dfg.inst_results(inst).is_empty());
                    return Some(simplification);
                }
                SkeletonInstSimplification::RemoveWithVal { val } => {
                    log::trace!(" -> simplify_skeleton: remove inst and use {val} as its result");
                    if cfg!(debug_assertions) {
                        let results = ctx.func.dfg.inst_results(inst);
                        debug_assert_eq!(results.len(), 1);
                        debug_assert_eq!(
                            ctx.func.dfg.value_type(results[0]),
                            ctx.func.dfg.value_type(val),
                        );
                    }
                    return Some(simplification);
                }

                // For instruction replacement simplification, we want to check
                // that the replacements define the same number and types of
                // values as the original instruction, and also determine
                // whether they are actually an improvement over (i.e. have
                // lower cost than) the original instruction.
                SkeletonInstSimplification::Replace { inst } => {
                    log::trace!(
                        " -> simplify_skeleton: replace inst with {inst}: {}",
                        ctx.func.dfg.display_inst(inst)
                    );
                    (inst, None)
                }
                SkeletonInstSimplification::ReplaceWithVal { inst, val } => {
                    log::trace!(
                        " -> simplify_skeleton: replace inst with {val} and {inst}: {}",
                        ctx.func.dfg.display_inst(inst)
                    );
                    (inst, Some(val))
                }
            };

            // Debug-only validation: a replacement must not introduce
            // control flow and must produce the same number and types
            // of results as the original instruction.
            if cfg!(debug_assertions) {
                let opcode = ctx.func.dfg.insts[inst].opcode();
                debug_assert!(
                    !(opcode.is_terminator() || opcode.is_branch()),
                    "simplifying control-flow instructions and terminators is not yet supported",
                );

                let old_vals = ctx.func.dfg.inst_results(inst);
                let new_vals = if let Some(val) = new_val.as_ref() {
                    std::slice::from_ref(val)
                } else {
                    ctx.func.dfg.inst_results(new_inst)
                };
                debug_assert_eq!(
                    old_vals.len(),
                    new_vals.len(),
                    "skeleton simplification should result in the same number of result values",
                );

                for (old_val, new_val) in old_vals.iter().zip(new_vals) {
                    let old_ty = ctx.func.dfg.value_type(*old_val);
                    let new_ty = ctx.func.dfg.value_type(*new_val);
                    debug_assert_eq!(
                        old_ty, new_ty,
                        "skeleton simplification should result in values of the correct type",
                    );
                }
            }

            // Our best simplification is the one with the least cost. Update
            // `best` if necessary.
            let cost = cost::Cost::of_skeleton_op(
                ctx.func.dfg.insts[new_inst].opcode(),
                ctx.func.dfg.inst_args(new_inst).len(),
            );
            if cost < best_cost {
                best = Some(simplification);
                best_cost = cost;
            }
        }

        // Return the best simplification!
        best
    }
694
695 /// Helper to propagate facts on constant values: if PCC is
696 /// enabled, then unconditionally add a fact attesting to the
697 /// Value's concrete value.
698 fn attach_constant_fact(&mut self, inst: Inst, value: Value, ty: Type) {
699 if self.flags.enable_pcc() {
700 if let InstructionData::UnaryImm {
701 opcode: Opcode::Iconst,
702 imm,
703 } = self.func.dfg.insts[inst]
704 {
705 let imm: i64 = imm.into();
706 self.func.dfg.facts[value] =
707 Some(Fact::constant(ty.bits().try_into().unwrap(), imm as u64));
708 }
709 }
710 }
711}
712
713impl<'a> EgraphPass<'a> {
714 /// Create a new EgraphPass.
715 pub fn new(
716 func: &'a mut Function,
717 raw_domtree: &'a DominatorTree,
718 loop_analysis: &'a LoopAnalysis,
719 alias_analysis: &'a mut AliasAnalysis<'a>,
720 flags: &'a Flags,
721 ctrl_plane: &'a mut ControlPlane,
722 ) -> Self {
723 let mut domtree = DominatorTreePreorder::new();
724 domtree.compute(raw_domtree);
725 Self {
726 func,
727 domtree,
728 loop_analysis,
729 alias_analysis,
730 flags,
731 ctrl_plane,
732 stats: Stats::default(),
733 remat_values: FxHashSet::default(),
734 }
735 }
736
737 /// Run the process.
738 pub fn run(&mut self) {
739 self.remove_pure_and_optimize();
740
741 trace!("egraph built:\n{}\n", self.func.display());
742 if cfg!(feature = "trace-log") {
743 for (value, def) in self.func.dfg.values_and_defs() {
744 trace!(" -> {} = {:?}", value, def);
745 match def {
746 ValueDef::Result(i, 0) => {
747 trace!(" -> {} = {:?}", i, self.func.dfg.insts[i]);
748 }
749 _ => {}
750 }
751 }
752 }
753
754 self.elaborate();
755
756 log::trace!("stats: {:#?}", self.stats);
757 }
758
759 /// Remove pure nodes from the `Layout` of the function, ensuring
760 /// that only the "side-effect skeleton" remains, and also
761 /// optimize the pure nodes. This is the first step of
762 /// egraph-based processing and turns the pure CFG-based CLIF into
763 /// a CFG skeleton with a sea of (optimized) nodes tying it
764 /// together.
765 ///
766 /// As we walk through the code, we eagerly apply optimization
767 /// rules; at any given point we have a "latest version" of an
768 /// eclass of possible representations for a `Value` in the
769 /// original program, which is itself a `Value` at the root of a
770 /// union-tree. We keep a map from the original values to these
771 /// optimized values. When we encounter any instruction (pure or
772 /// side-effecting skeleton) we rewrite its arguments to capture
773 /// the "latest" optimized forms of these values. (We need to do
774 /// this as part of this pass, and not later using a finished map,
775 /// because the eclass can continue to be updated and we need to
776 /// only refer to its subset that exists at this stage, to
777 /// maintain acyclicity.)
778 fn remove_pure_and_optimize(&mut self) {
779 let mut cursor = FuncCursor::new(self.func);
780 let mut value_to_opt_value: SecondaryMap<Value, Value> =
781 SecondaryMap::with_default(Value::reserved_value());
782
783 // Map from instruction to value for hash-consing of pure ops
784 // into the egraph. This can be a standard (non-scoped)
785 // hashmap because pure ops have no location: they are
786 // "outside of" control flow.
787 //
788 // Note also that we keep the controlling typevar (the `Type`
789 // in the tuple below) because it may disambiguate
790 // instructions that are identical except for type.
791 //
792 // We store both skeleton and non-skeleton instructions in the
793 // GVN map; for skeleton instructions, we only store those
794 // that are idempotent, i.e., still eligible to GVN. Note that
795 // some skeleton instructions are idempotent but do not
796 // produce a value: e.g., traps on a given condition. To allow
797 // for both cases, we store an `Option<Value>` as the value in
798 // this map.
799 let mut gvn_map: ScopedHashMap<(Type, InstructionData), Option<Value>> =
800 ScopedHashMap::with_capacity(cursor.func.dfg.num_values());
801
802 // The block in the domtree preorder traversal at each level
803 // of the GVN map.
804 let mut gvn_map_blocks: Vec<Block> = vec![];
805
806 // To get the best possible merging and canonicalization, we
807 // track where a value is "available" at: this is the
808 // domtree-nearest-ancestor join of all args if the value
809 // itself is pure, otherwise the block where the value is
810 // defined. (And for union nodes, the
811 // domtree-highest-ancestor, i.e., the meet or the dual to the
812 // above join.)
813 let mut available_block: SecondaryMap<Value, Block> =
814 SecondaryMap::with_default(Block::reserved_value());
815
816 // To avoid blowing up eclasses too much, we track the size of
817 // each eclass reachable by a tree of union nodes from a given
818 // value ID, and we avoid union'ing additional values into an
819 // eclass when it reaches `ECLASS_ENODE_LIMIT`.
820 //
821 // For efficiency, this encodes size minus one: so a value of
822 // zero (which is cheap to bulk-initialize) means a singleton
823 // eclass of size one. This also allows us to avoid explicitly
824 // writing the size for any values that are not union nodes.
825 let mut eclass_size: SecondaryMap<Value, u8> = SecondaryMap::with_default(0);
826
827 // This is an initial guess at the size we'll need, but we add
828 // more values as we build simplified alternative expressions so
829 // this is likely to realloc again later.
830 available_block.resize(cursor.func.dfg.num_values());
831
832 // In domtree preorder, visit blocks. (TODO: factor out an
833 // iterator from this and elaborator.)
834 let root = cursor.layout().entry_block().unwrap();
835 enum StackEntry {
836 Visit(Block),
837 Pop,
838 }
839 let mut block_stack = vec![StackEntry::Visit(root)];
840 while let Some(entry) = block_stack.pop() {
841 match entry {
842 StackEntry::Visit(block) => {
843 // We popped this block; push children
844 // immediately, then process this block.
845 block_stack.push(StackEntry::Pop);
846 block_stack.extend(
847 self.ctrl_plane
848 .shuffled(self.domtree.children(block))
849 .map(StackEntry::Visit),
850 );
851 gvn_map.increment_depth();
852 gvn_map_blocks.push(block);
853
854 trace!("Processing block {}", block);
855 cursor.set_position(CursorPosition::Before(block));
856
857 let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);
858
859 for ¶m in cursor.func.dfg.block_params(block) {
860 trace!("creating initial singleton eclass for blockparam {}", param);
861 value_to_opt_value[param] = param;
862 available_block[param] = block;
863 }
864 while let Some(inst) = cursor.next_inst() {
865 trace!(
866 "Processing inst {inst}: {}",
867 cursor.func.dfg.display_inst(inst),
868 );
869
870 // Rewrite args of *all* instructions using the
871 // value-to-opt-value map.
872 cursor.func.dfg.map_inst_values(inst, |arg| {
873 let new_value = value_to_opt_value[arg];
874 trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
875 debug_assert_ne!(new_value, Value::reserved_value());
876 new_value
877 });
878
879 // Build a context for optimization, with borrows of
880 // state. We can't invoke a method on `self` because
881 // we've borrowed `self.func` mutably (as
882 // `cursor.func`) so we pull apart the pieces instead
883 // here.
884 let mut ctx = OptimizeCtx {
885 func: cursor.func,
886 value_to_opt_value: &mut value_to_opt_value,
887 gvn_map: &mut gvn_map,
888 gvn_map_blocks: &mut gvn_map_blocks,
889 available_block: &mut available_block,
890 eclass_size: &mut eclass_size,
891 rewrite_depth: 0,
892 subsume_values: FxHashSet::default(),
893 remat_values: &mut self.remat_values,
894 stats: &mut self.stats,
895 domtree: &self.domtree,
896 alias_analysis: self.alias_analysis,
897 alias_analysis_state: &mut alias_analysis_state,
898 flags: self.flags,
899 ctrl_plane: self.ctrl_plane,
900 optimized_values: Default::default(),
901 optimized_insts: Default::default(),
902 };
903
904 if is_pure_for_egraph(ctx.func, inst) {
905 // Insert into GVN map and optimize any new nodes
906 // inserted (recursively performing this work for
907 // any nodes the optimization rules produce).
908 let inst = NewOrExistingInst::Existing(inst);
909 ctx.insert_pure_enode(inst);
910 // We've now rewritten all uses, or will when we
911 // see them, and the instruction exists as a pure
912 // enode in the eclass, so we can remove it.
913 cursor.remove_inst_and_step_back();
914 } else {
915 if let Some(cmd) = ctx.optimize_skeleton_inst(inst, block) {
916 Self::execute_skeleton_inst_simplification(
917 cmd,
918 &mut cursor,
919 &mut value_to_opt_value,
920 inst,
921 );
922 }
923 }
924 }
925 }
926 StackEntry::Pop => {
927 gvn_map.decrement_depth();
928 gvn_map_blocks.pop();
929 }
930 }
931 }
932 }
933
934 /// Execute a simplification of an instruction in the side-effectful
935 /// skeleton.
936 fn execute_skeleton_inst_simplification(
937 simplification: SkeletonInstSimplification,
938 cursor: &mut FuncCursor,
939 value_to_opt_value: &mut SecondaryMap<Value, Value>,
940 old_inst: Inst,
941 ) {
942 let mut forward_val = |cursor: &mut FuncCursor, old_val, new_val| {
943 cursor.func.dfg.change_to_alias(old_val, new_val);
944 value_to_opt_value[old_val] = new_val;
945 };
946
947 let (new_inst, new_val) = match simplification {
948 SkeletonInstSimplification::Remove => {
949 cursor.remove_inst_and_step_back();
950 return;
951 }
952 SkeletonInstSimplification::RemoveWithVal { val } => {
953 cursor.remove_inst_and_step_back();
954 let old_val = cursor.func.dfg.first_result(old_inst);
955 cursor.func.dfg.detach_inst_results(old_inst);
956 forward_val(cursor, old_val, val);
957 return;
958 }
959 SkeletonInstSimplification::Replace { inst } => (inst, None),
960 SkeletonInstSimplification::ReplaceWithVal { inst, val } => (inst, Some(val)),
961 };
962
963 // Replace the old instruction with the new one.
964 cursor.replace_inst(new_inst);
965 debug_assert!(!cursor.func.dfg.insts[new_inst].opcode().is_terminator());
966
967 // Redirect the old instruction's result values to our new instruction's
968 // result values.
969 let mut i = 0;
970 let mut next_new_val = |dfg: &crate::ir::DataFlowGraph| -> Value {
971 if let Some(val) = new_val {
972 val
973 } else {
974 let val = dfg.inst_results(new_inst)[i];
975 i += 1;
976 val
977 }
978 };
979 for i in 0..cursor.func.dfg.inst_results(old_inst).len() {
980 let old_val = cursor.func.dfg.inst_results(old_inst)[i];
981 let new_val = next_new_val(&cursor.func.dfg);
982 forward_val(cursor, old_val, new_val);
983 }
984
985 // Back up so that the next iteration of the outer egraph loop will
986 // process the new instruction.
987 cursor.goto_inst(new_inst);
988 cursor.prev_inst();
989 }
990
991 /// Scoped elaboration: compute a final ordering of op computation
992 /// for each block and update the given Func body. After this
993 /// runs, the function body is back into the state where every
994 /// Inst with an used result is placed in the layout (possibly
995 /// duplicated, if our code-motion logic decides this is the best
996 /// option).
997 ///
998 /// This works in concert with the domtree. We do a preorder
999 /// traversal of the domtree, tracking a scoped map from Id to
1000 /// (new) Value. The map's scopes correspond to levels in the
1001 /// domtree.
1002 ///
1003 /// At each block, we iterate forward over the side-effecting
1004 /// eclasses, and recursively generate their arg eclasses, then
1005 /// emit the ops themselves.
1006 ///
1007 /// To use an eclass in a given block, we first look it up in the
1008 /// scoped map, and get the Value if already present. If not, we
1009 /// need to generate it. We emit the extracted enode for this
1010 /// eclass after recursively generating its args. Eclasses are
1011 /// thus computed "as late as possible", but then memoized into
1012 /// the Id-to-Value map and available to all dominated blocks and
1013 /// for the rest of this block. (This subsumes GVN.)
1014 fn elaborate(&mut self) {
1015 let mut elaborator = Elaborator::new(
1016 self.func,
1017 &self.domtree,
1018 self.loop_analysis,
1019 &self.remat_values,
1020 &mut self.stats,
1021 self.ctrl_plane,
1022 );
1023 elaborator.elaborate();
1024
1025 self.check_post_egraph();
1026 }
1027
1028 #[cfg(debug_assertions)]
1029 fn check_post_egraph(&self) {
1030 // Verify that no union nodes are reachable from inst args,
1031 // and that all inst args' defining instructions are in the
1032 // layout.
1033 for block in self.func.layout.blocks() {
1034 for inst in self.func.layout.block_insts(block) {
1035 self.func
1036 .dfg
1037 .inst_values(inst)
1038 .for_each(|arg| match self.func.dfg.value_def(arg) {
1039 ValueDef::Result(i, _) => {
1040 debug_assert!(self.func.layout.inst_block(i).is_some());
1041 }
1042 ValueDef::Union(..) => {
1043 panic!("egraph union node {arg} still reachable at {inst}!");
1044 }
1045 _ => {}
1046 })
1047 }
1048 }
1049 }
1050
    /// Release-build no-op: the post-egraph invariant checks are only
    /// compiled in when `debug_assertions` are enabled.
    #[cfg(not(debug_assertions))]
    fn check_post_egraph(&self) {}
1053}
1054
/// Implementation of external-context equality and hashing on
/// `InstructionData`. This allows us to deduplicate instructions given
/// some context that lets us see its value lists, so we don't need to
/// store arguments inline in the `InstructionData` (or alongside it in
/// some newly-defined key type) in all cases.
struct GVNContext<'a> {
    /// The function's value-list pool, needed to resolve the value
    /// lists referenced by `InstructionData` when comparing and hashing.
    value_lists: &'a ValueListPool,
}
1063
1064impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
1065 fn ctx_eq(
1066 &self,
1067 (a_ty, a_inst): &(Type, InstructionData),
1068 (b_ty, b_inst): &(Type, InstructionData),
1069 ) -> bool {
1070 a_ty == b_ty && a_inst.eq(b_inst, self.value_lists)
1071 }
1072}
1073
1074impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
1075 fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
1076 std::hash::Hash::hash(&ty, state);
1077 inst.hash(state, self.value_lists);
1078 }
1079}
1080
/// Statistics collected during egraph-based processing.
///
/// All fields are plain event counters, incremented at various points
/// in this module and its submodules; grouping comments below are by
/// apparent phase of the pass (field names — confirm against the
/// increment sites).
#[derive(Clone, Debug, Default)]
pub(crate) struct Stats {
    // Pure-instruction handling (GVN-map insertion and dedup).
    pub(crate) pure_inst: u64,
    pub(crate) pure_inst_deduped: u64,
    pub(crate) pure_inst_subsume: u64,
    pub(crate) pure_inst_rewrite_to_self: u64,
    pub(crate) pure_inst_insert_orig: u64,
    pub(crate) pure_inst_insert_new: u64,
    // Side-effecting ("skeleton") instruction handling.
    pub(crate) skeleton_inst: u64,
    pub(crate) skeleton_inst_simplified: u64,
    pub(crate) skeleton_inst_gvn: u64,
    pub(crate) alias_analysis_removed: u64,
    // General egraph-construction events.
    pub(crate) new_inst: u64,
    pub(crate) union: u64,
    pub(crate) subsume: u64,
    pub(crate) remat: u64,
    // Rewrite-rule application.
    pub(crate) rewrite_rule_invoked: u64,
    pub(crate) rewrite_rule_results: u64,
    pub(crate) rewrite_depth_limit: u64,
    // Elaboration phase.
    pub(crate) elaborate_visit_node: u64,
    pub(crate) elaborate_memoize_hit: u64,
    pub(crate) elaborate_memoize_miss: u64,
    pub(crate) elaborate_remat: u64,
    pub(crate) elaborate_licm_hoist: u64,
    pub(crate) elaborate_func: u64,
    pub(crate) elaborate_func_pre_insts: u64,
    pub(crate) elaborate_func_post_insts: u64,
    pub(crate) elaborate_best_cost_fixpoint_iters: u64,
    // Eclass size-limit events (see `eclass_size` tracking above).
    pub(crate) eclass_size_limit: u64,
}