diff --git a/src/cwe_checker_lib/src/analysis/graph.rs b/src/cwe_checker_lib/src/analysis/graph.rs
index 86451da8e..ef734bd6c 100644
--- a/src/cwe_checker_lib/src/analysis/graph.rs
+++ b/src/cwe_checker_lib/src/analysis/graph.rs
@@ -66,7 +66,9 @@ use petgraph::{
     visit::{EdgeRef, IntoNodeReferences},
 };
 
-mod intraprocedural_cfg;
+pub mod algo;
+pub mod call;
+pub mod intraprocedural_cfg;
 
 /// The graph type of an interprocedural control flow graph
 pub type Graph<'a> = DiGraph<Node<'a>, Edge<'a>>;
diff --git a/src/cwe_checker_lib/src/analysis/graph/algo.rs b/src/cwe_checker_lib/src/analysis/graph/algo.rs
new file mode 100644
index 000000000..5ea2a10ff
--- /dev/null
+++ b/src/cwe_checker_lib/src/analysis/graph/algo.rs
@@ -0,0 +1,38 @@
+//! Some simple graph algorithms.
+
+use std::collections::hash_map::{Entry, HashMap};
+
+use petgraph::prelude::*;
+use petgraph::unionfind::UnionFind;
+use petgraph::visit::{IntoEdgeReferences, NodeCompactIndexable};
+
+/// Returns the weakly connected components of the graph `g`.
+///
+/// Edge direction is ignored: the two endpoints of every edge are merged into
+/// the same set. Each returned vector lists the nodes of one component in
+/// ascending index order, and a node without edges forms its own component.
+/// The order of the components themselves is unspecified.
+pub fn components<G>(g: &G) -> Vec<Vec<G::NodeId>>
+where
+    G: IntoEdgeReferences + NodeCompactIndexable,
+{
+    let mut vertex_sets = UnionFind::new(g.node_bound());
+    for e in g.edge_references() {
+        let (h, t) = (e.target(), e.source());
+        vertex_sets.union(g.to_index(h), g.to_index(t));
+    }
+
+    // `representatives[i]` is the label of the set that contains node `i`.
+    let representatives = vertex_sets.into_labeling();
+    let mut sets: HashMap<usize, Vec<G::NodeId>> = HashMap::new();
+    for (index, repr) in representatives.iter().enumerate() {
+        match sets.entry(*repr) {
+            Entry::Vacant(e) => {
+                e.insert(vec![g.from_index(index)]);
+            }
+            Entry::Occupied(e) => e.into_mut().push(g.from_index(index)),
+        }
+    }
+
+    sets.into_values().collect()
+}
diff --git a/src/cwe_checker_lib/src/analysis/graph/call.rs b/src/cwe_checker_lib/src/analysis/graph/call.rs
new file mode 100644
index 000000000..901b6b7b4
--- /dev/null
+++ b/src/cwe_checker_lib/src/analysis/graph/call.rs
@@ -0,0 +1,123 @@
+//! Call graphs.
+use crate::analysis::graph::intraprocedural_cfg::IntraproceduralCfg;
+use crate::intermediate_representation::{Jmp, Program, Sub, Term, Tid};
+
+use std::collections::HashMap;
+
+use petgraph::graph::{DiGraph, NodeIndex};
+use petgraph::visit::EdgeRef;
+use petgraph::Direction;
+
+/// Whole-program call graph.
+pub struct CallGraph<'a> {
+    graph: DiGraph<CgNode<'a>, CgEdge<'a>>,
+    fn_tid_to_idx_map: HashMap<&'a Tid, NodeIndex>,
+}
+
+impl<'a> CallGraph<'a> {
+    /// Constructs the call graph of the program `p`.
+    pub fn new(p: &'a Program) -> Self {
+        CallGraphBuilder::new(p).build()
+    }
+
+    /// Returns the node index of the function `f`.
+    ///
+    /// Panics if `f` has no entry in the function index map.
+    fn fn_idx(&self, f: &Tid) -> NodeIndex {
+        *self
+            .fn_tid_to_idx_map
+            .get(f)
+            .expect("function is not part of the call graph")
+    }
+
+    /// Returns an iterator over all callers of the function `f`.
+    ///
+    /// Each item pairs the calling node with the edge leading to `f`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `f` is not a function of this call graph.
+    pub fn callers<'b>(
+        &'b self,
+        f: &Tid,
+    ) -> impl Iterator<Item = (&'b CgNode<'a>, &'b CgEdge<'a>)> + 'b {
+        let fn_idx = self.fn_idx(f);
+
+        self.graph
+            .edges_directed(fn_idx, Direction::Incoming)
+            .map(move |e_ref| (&self.graph[e_ref.source()], e_ref.weight()))
+    }
+
+    /// Returns an iterator over all callees of the function `f`.
+    ///
+    /// Each item pairs the called node with the edge leading to it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `f` is not a function of this call graph.
+    pub fn callees<'b>(
+        &'b self,
+        f: &Tid,
+    ) -> impl Iterator<Item = (&'b CgNode<'a>, &'b CgEdge<'a>)> + 'b {
+        let fn_idx = self.fn_idx(f);
+
+        self.graph
+            .edges_directed(fn_idx, Direction::Outgoing)
+            .map(move |e_ref| (&self.graph[e_ref.target()], e_ref.weight()))
+    }
+}
+
+/// Call graph node.
+///
+/// Nodes in a call graph correspond to internal or external (aka. imported)
+/// functions. Each function has exactly one node.
+pub enum CgNode<'a> {
+    /// An internal function: its term together with its intraprocedural CFG.
+    Function(&'a Term<Sub>, Box<IntraproceduralCfg<'a>>),
+    /// An external (imported) function.
+    ExtFunction,
+}
+
+impl<'a> CgNode<'a> {
+    /// Returns true iff this node corresponds to an external function.
+    pub fn is_external(&self) -> bool {
+        matches!(self, CgNode::ExtFunction)
+    }
+}
+
+/// Call graph edge.
+///
+/// If function `f` may, directly or indirectly, call function `g` the call
+/// graph has exactly one edge `f -> g`. Thus, callers can be determined by
+/// iterating incoming edges, and callees by iterating outgoing edges.
+/// Furthermore, edges include all potential call sites in the caller.
+pub struct CgEdge<'a> {
+    direct_call_sites: Vec<CallSite<'a>>,
+    indirect_call_sites: Vec<CallSite<'a>>,
+}
+
+impl<'a> CgEdge<'a> {
+    /// Returns an iterator over the direct call sites of this edge.
+    pub fn direct_call_sites<'b>(&'b self) -> impl Iterator<Item = &'b CallSite<'a>> + 'b {
+        self.direct_call_sites.iter()
+    }
+
+    /// Returns an iterator over the indirect call sites of this edge.
+    pub fn indirect_call_sites<'b>(&'b self) -> impl Iterator<Item = &'b CallSite<'a>> + 'b {
+        self.indirect_call_sites.iter()
+    }
+}
+
+/// Call site.
+pub struct CallSite<'a> {
+    indirect: bool,
+    insn: &'a Term<Jmp>,
+}
+
+impl<'a> CallSite<'a> {
+    /// Returns true iff this is an indirect call.
+    pub fn is_indirect(&self) -> bool {
+        self.indirect
+    }
+
+    /// Returns the call instruction.
+    pub fn insn(&self) -> &'a Term<Jmp> {
+        self.insn
+    }
+}
+
+struct CallGraphBuilder<'a> {
+    _pd: core::marker::PhantomData<&'a u32>,
+}
+
+impl<'a> CallGraphBuilder<'a> {
+    fn new(_p: &'a Program) -> Self {
+        todo!()
+    }
+
+    fn build(self) -> CallGraph<'a> {
+        todo!()
+    }
+}
diff --git a/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg.rs b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg.rs
index 60900ac1b..9f17a2ed6 100644
--- a/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg.rs
+++ b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg.rs
@@ -1,16 +1,26 @@
-#![allow(unreachable_code)]
-#![allow(dead_code)]
-#![allow(unused_imports)]
-
-use crate::intermediate_representation::{Blk, Jmp, Program, Sub as Function, Term, Tid};
-
+//! Intraprocedural control flow graphs.
+//!
+//! Intraprocedural CFGs use the same nodes and edges as their big brother,
+//! the [interprocedural CFG]. They are useful for tasks where it is not
+//! necessary to construct a full-blown whole-program CFG. Reusing the same
+//! types also allows us to use the same dataflow analysis infrastructure for
+//! both kinds of CFGs. It may also allow us to merge multiple intraprocedural
+//! CFGs into an interprocedural CFG in the future.
+//!
+//! [interprocedural CFG]: super::Graph
 use crate::analysis::graph::{Edge, Graph as Cfg, Node, NodeIndex};
-use crate::intermediate_representation::SinkType;
+use crate::intermediate_representation::{Blk, Jmp, Program, SinkType, Sub as Function, Term, Tid};
+
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 
-use std::collections::{BTreeMap, HashMap, HashSet};
+mod dom;
+mod natural_loops;
+mod properties;
+use dom::*;
+use natural_loops::*;
 
 /// Pair of block start and block end nodes for a single basic block.
-type BlockIdxs = (NodeIndex, NodeIndex);
+pub type BlockIdxs = (NodeIndex, NodeIndex);
 
 /// Builder for an intraprocedural CFG.
 struct IntraproceduralCfgBuilder<'a> {
@@ -85,6 +95,8 @@ impl<'a> IntraproceduralCfgBuilder<'a> {
             calls: self.calls,
             ext_calls: self.ext_calls,
             sinks: self.sinks,
+            dominators: None,
+            natural_loops: None,
         }
     }
 
@@ -240,6 +252,8 @@ impl<'a> IntraproceduralCfgBuilder<'a> {
     }
 }
 
+/// An intraprocedural control flow graph.
+#[allow(dead_code)]
 pub struct IntraproceduralCfg<'a> {
     graph: Cfg<'a>,
     blk_tid_to_idx_map: HashMap<&'a Tid, BlockIdxs>,
@@ -247,10 +261,105 @@ pub struct IntraproceduralCfg<'a> {
     calls: Vec<BlockIdxs>,
     ext_calls: Vec<BlockIdxs>,
     sinks: Vec<(SinkType, BlockIdxs)>,
+    dominators: Option<BTreeMap<&'a Tid, BTreeSet<&'a Tid>>>,
+    natural_loops: Option<Vec<NaturalLoop<'a>>>,
 }
 
 impl<'a> IntraproceduralCfg<'a> {
+    /// Returns the intraprocedural CFG of the given function `f`.
     pub fn new(program: &'a Program, f: &'a Term<Function>) -> Self {
         IntraproceduralCfgBuilder::new(program, f).build()
     }
+
+    /// Returns a reference to the underlying graph object.
+    pub fn graph(&self) -> &Cfg<'a> {
+        &self.graph
+    }
+
+    /// Returns the indices of the nodes of the function entry point.
+    pub fn entry(&self) -> BlockIdxs {
+        self.entry
+    }
+
+    /// Returns all blocks that contain __direct__ function calls to
+    /// __internal__ and __external__ functions.
+    pub fn call_sites<'b>(&'b self) -> impl Iterator<Item = BlockIdxs> + 'b {
+        self.calls.iter().chain(self.ext_calls.iter()).copied()
+    }
+
+    /// Returns a map that takes all __directly__ called __internal__ and
+    /// __external__ functions to the number of times that they are called.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a call-site block has no jumps or if its first jump is not
+    /// a direct call.
+    pub fn callees(&self) -> BTreeMap<&'a Tid, u32> {
+        let mut callees = BTreeMap::new();
+
+        for (blk_start, _) in self.call_sites() {
+            // NOTE(review): relies on the call being the first jump of every
+            // block recorded as a call site - confirm against the builder.
+            let Jmp::Call { target, .. } = &self.graph[blk_start].get_block().jmps[0].term else {
+                panic!("first jump of a call-site block is not a direct call");
+            };
+            *callees.entry(target).or_insert(0) += 1;
+        }
+
+        callees
+    }
+
+    /// Returns the number of basic blocks in this CFG.
+    ///
+    /// Note that this is not the number of nodes due to block-splitting and
+    /// artificial nodes around function calls.
+    pub fn num_blocks(&self) -> usize {
+        self.blk_tid_to_idx_map.len()
+    }
+
+    /// Returns the start and end index of this block.
+    ///
+    /// Returns `None` if no block with the given [`Tid`] is known.
+    pub fn blk_tid_to_idx(&self, blk_tid: &Tid) -> Option<&BlockIdxs> {
+        self.blk_tid_to_idx_map.get(blk_tid)
+    }
+
+    /// Returns the block term of the block with the given [`Tid`].
+    pub fn blk_tid_to_term(&self, blk_tid: &Tid) -> Option<&'a Term<Blk>> {
+        self.blk_tid_to_idx(blk_tid)
+            .map(|idx| self.graph()[idx.0].get_block())
+    }
+
+    /// Returns the block [`Tid`] for block start and end nodes.
+    ///
+    /// Returns `None` for nodes from which no block can be obtained.
+    pub fn idx_to_blk_tid(&self, idx: NodeIndex) -> Option<&'a Tid> {
+        self.graph()[idx].try_get_block().map(|b| &b.tid)
+    }
+
+    /// Computes the dominator relation of this CFG.
+    ///
+    /// Noop if the dominators were already computed.
+    pub fn compute_dominators(&mut self) {
+        if self.dominators.is_none() {
+            self.dominators = Some(compute_dominators(self));
+        }
+    }
+
+    /// Returns the dominator relation of this CFG.
+    ///
+    /// The map takes each block to the set of its dominators. Returns `None`
+    /// if the relation has not been computed yet.
+    pub fn get_dominators(&self) -> Option<&BTreeMap<&'a Tid, BTreeSet<&'a Tid>>> {
+        self.dominators.as_ref()
+    }
+
+    /// Computes the natural loops in this CFG.
+    ///
+    /// Computes the dominators first if needed. Noop if loops already exist.
+    pub fn compute_natural_loops(&mut self) {
+        if self.natural_loops.is_none() {
+            self.compute_dominators();
+            self.natural_loops = Some(compute_natural_loops(self));
+        }
+    }
+
+    /// Returns the natural loops in this CFG, `None` if not yet computed.
+    pub fn get_natural_loops(&self) -> Option<&Vec<NaturalLoop<'a>>> {
+        self.natural_loops.as_ref()
+    }
 }
diff --git a/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/dom.rs b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/dom.rs
new file mode 100644
index 000000000..217af0659
--- /dev/null
+++ b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/dom.rs
@@ -0,0 +1,267 @@
+//! Dominator computation.
+use crate::abstract_domain::{AbstractDomain, CheapToClone};
+use crate::analysis::forward_interprocedural_fixpoint;
+use crate::analysis::forward_interprocedural_fixpoint::create_computation;
+use crate::analysis::graph::intraprocedural_cfg::{BlockIdxs, IntraproceduralCfg};
+use crate::analysis::graph::{Graph, Node, NodeIndex};
+use crate::analysis::interprocedural_fixpoint_generic::NodeValue;
+use crate::intermediate_representation::{Blk, Def, Expression, Jmp, Term, Tid};
+
+use std::collections::{BTreeMap, BTreeSet};
+use std::sync::Arc;
+
+struct Context<'a> {
+    graph: &'a IntraproceduralCfg<'a>,
+}
+
+/// Returns a mapping that takes each basic block to the set of its dominators.
+///
+/// Standard dataflow analysis:
+/// - Direction: Forward
+/// - Property space: Powerset of all blocks.
+/// - Ordering: Reverse inclusion.
+/// - Initial values: The node itself for the entry node. All nodes for all
+///   other nodes.
+pub fn compute_dominators<'a>(
+    graph: &IntraproceduralCfg<'a>,
+) -> BTreeMap<&'a Tid, BTreeSet<&'a Tid>> {
+    let ctx = Context { graph };
+    let mut cmp = create_computation(ctx, None);
+    let entry = graph.entry();
+
+    cmp.set_node_value(entry.0, NodeValue::Value(Dominators::new_single(entry)));
+    cmp.compute_with_max_steps(100);
+
+    if !cmp.has_stabilized() {
+        panic!("Dominator computation has not stabilized.");
+    }
+
+    cmp.node_values()
+        .iter()
+        .filter_map(|(idx, doms)| match doms {
+            NodeValue::CallFlowCombinator { .. } => None,
+            NodeValue::Value(doms) => {
+                // FIXME: A missing block means we have a `NodeValue::Value`
+                // at an artificial node. Not good. Investigate.
+                let dominee = graph.idx_to_blk_tid(*idx)?;
+                let dominators = graph
+                    .graph()
+                    .node_indices()
+                    .filter(|idx| doms.is_dominator(*idx))
+                    // FIXME: Same as above, nodes without a block are
+                    // silently skipped. Investigate.
+                    .filter_map(|idx| graph.idx_to_blk_tid(idx))
+                    .collect::<BTreeSet<&'a Tid>>();
+
+                Some((dominee, dominators))
+            }
+        })
+        .collect::<BTreeMap<&'a Tid, BTreeSet<&'a Tid>>>()
+}
+
+impl<'a> forward_interprocedural_fixpoint::Context<'a> for Context<'a> {
+    type Value = Dominators;
+
+    fn get_graph(&self) -> &Graph<'a> {
+        self.graph.graph()
+    }
+
+    fn merge(&self, value1: &Self::Value, value2: &Self::Value) -> Self::Value {
+        value1.merge(value2)
+    }
+
+    fn update_def(&self, value: &Self::Value, _def: &Term<Def>) -> Option<Self::Value> {
+        Some(value.clone())
+    }
+
+    fn update_jump(
+        &self,
+        value: &Self::Value,
+        _jump: &Term<Jmp>,
+        _untaken_conditional: Option<&Term<Jmp>>,
+        target: &Term<Blk>,
+    ) -> Option<Self::Value> {
+        let mut new_value = value.clone();
+
+        // Target block is dominated by itself.
+        new_value.insert(*self.graph.blk_tid_to_idx(&target.tid).unwrap());
+
+        Some(new_value)
+    }
+
+    fn update_call(
+        &self,
+        _value: &Self::Value,
+        _call: &Term<Jmp>,
+        _target: &Node,
+        _calling_convention: &Option<String>,
+    ) -> Option<Self::Value> {
+        None
+    }
+
+    fn update_return(
+        &self,
+        _value: Option<&Self::Value>,
+        value_before_call: Option<&Self::Value>,
+        call_term: &Term<Jmp>,
+        _return_term: &Term<Jmp>,
+        _calling_convention: &Option<String>,
+    ) -> Option<Self::Value> {
+        // Without a value at the call site there is nothing to propagate to
+        // the return-to block, so do not panic on a missing value.
+        let mut new_value = value_before_call?.clone();
+
+        // Return-to block is dominated by itself.
+        new_value.insert(match &call_term.term {
+            Jmp::Call {
+                return_: Some(ret_to_tid),
+                ..
+            } => *self.graph.blk_tid_to_idx(ret_to_tid).unwrap(),
+            // Normalization passes ensure that each call returns to somewhere.
+            _ => core::unreachable!(),
+        });
+
+        Some(new_value)
+    }
+
+    fn update_call_stub(&self, value: &Self::Value, call: &Term<Jmp>) -> Option<Self::Value> {
+        let mut new_value = value.clone();
+
+        // Return-to block is dominated by itself.
+        new_value.insert(match &call.term {
+            Jmp::Call {
+                return_: Some(ret_to_tid),
+                ..
+            }
+            | Jmp::CallInd {
+                return_: Some(ret_to_tid),
+                ..
+            }
+            | Jmp::CallOther {
+                return_: Some(ret_to_tid),
+                ..
+            } => *self.graph.blk_tid_to_idx(ret_to_tid).unwrap(),
+            // Framework should only call this function for the above edge
+            // types.
+            _ => core::unreachable!(),
+        });
+
+        Some(new_value)
+    }
+
+    fn specialize_conditional(
+        &self,
+        value: &Self::Value,
+        _condition: &Expression,
+        _block_before_condition: &Term<Blk>,
+        _is_true: bool,
+    ) -> Option<Self::Value> {
+        Some(value.clone())
+    }
+}
+
+/// The dominators of a node.
+#[derive(Clone, Eq, PartialEq, Default, Debug)]
+struct Dominators {
+    /// The dominators of a node.
+    inner: Arc<DominatorsInner>,
+}
+
+impl CheapToClone for Dominators {}
+
+/// The dominators of a basic block.
+#[derive(Clone, Eq, PartialEq, Default, Debug)]
+enum DominatorsInner {
+    /// Basic block is dominated by all basic blocks.
+    #[default]
+    Bottom,
+    /// Basic block is dominated by a subset of all basic blocks.
+    Doms(BTreeSet<BlockIdxs>),
+    /// Basic block has no dominators. (Should never happen as each block is
+    /// dominated by itself.)
+    Top,
+}
+
+impl AbstractDomain for Dominators {
+    /// Intersects the two dominator sets.
+    ///
+    /// `Bottom` (all blocks) is the neutral element and `Top` (no blocks)
+    /// the absorbing element; an empty intersection is represented as `Top`.
+    fn merge(&self, other: &Self) -> Dominators {
+        if self == other {
+            return self.clone();
+        }
+
+        use DominatorsInner::*;
+        match (&*self.inner, &*other.inner) {
+            (_, Bottom) | (Top, _) => self.clone(),
+            (_, Top) | (Bottom, _) => other.clone(),
+            (Doms(a), Doms(b)) => {
+                let intersection = a.intersection(b).cloned().collect::<BTreeSet<_>>();
+
+                Dominators {
+                    inner: Arc::new(if intersection.is_empty() {
+                        Top
+                    } else {
+                        Doms(intersection)
+                    }),
+                }
+            }
+        }
+    }
+
+    fn is_top(&self) -> bool {
+        matches!(*self.inner, DominatorsInner::Top)
+    }
+}
+
+impl Dominators {
+    /// Returns a new dominator set that only includes the given block.
+    fn new_single(idx: BlockIdxs) -> Self {
+        Self {
+            inner: Arc::new(DominatorsInner::Doms(BTreeSet::from([idx]))),
+        }
+    }
+
+    /// Inserts the given block into the dominator set.
+    ///
+    /// `Bottom` already contains every block and is left untouched; `Top`
+    /// becomes the singleton set.
+    fn insert(&mut self, idx: BlockIdxs) {
+        use DominatorsInner::*;
+        match &*self.inner {
+            Bottom => (),
+            Top => {
+                self.inner = Arc::new(Doms(BTreeSet::from([idx])));
+            }
+            // Already present: avoid un-sharing the `Arc` for nothing.
+            Doms(doms) if doms.contains(&idx) => (),
+            // Technically incorrect since we do not handle the case when we
+            // arrive at `Bottom`.
+            Doms(_) => {
+                if let Doms(doms) = Arc::make_mut(&mut self.inner) {
+                    doms.insert(idx);
+                }
+            }
+        }
+    }
+
+    /// Returns true iff `idx` is the start index of a block that is in this
+    /// dominator set.
+    pub fn is_dominator(&self, idx: NodeIndex) -> bool {
+        use DominatorsInner::*;
+        match &*self.inner {
+            // `Bottom` stands for the set of all blocks.
+            Bottom => true,
+            Top => false,
+            Doms(doms) => doms.iter().any(|(blk_start, _)| *blk_start == idx),
+        }
+    }
+}
diff --git a/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/natural_loops.rs b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/natural_loops.rs
new file mode 100644
index 000000000..ce65246ba
--- /dev/null
+++ b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/natural_loops.rs
@@ -0,0 +1,118 @@
+//! Natural loops.
+use crate::analysis::graph::intraprocedural_cfg::IntraproceduralCfg;
+use crate::analysis::graph::{Edge, Graph};
+use crate::intermediate_representation::Tid;
+
+use std::collections::BTreeSet;
+use std::fmt;
+
+use petgraph::visit::EdgeRef;
+
+/// A natural loop in the CFG.
+pub struct NaturalLoop<'a> {
+    /// Block that controls the loop.
+    head: &'a Tid,
+    /// Blocks contained in the loop.
+    blocks: BTreeSet<&'a Tid>,
+}
+
+impl fmt::Display for NaturalLoop<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "head:{}, blocks:", self.head)?;
+        for b in self.blocks() {
+            write!(f, "{}, ", b)?;
+        }
+
+        Ok(())
+    }
+}
+
+impl<'a> NaturalLoop<'a> {
+    /// Returns the block that controls the loop.
+    pub fn head(&self) -> &'a Tid {
+        self.head
+    }
+
+    /// Returns the blocks inside the loop.
+    pub fn blocks<'b>(&'b self) -> impl Iterator<Item = &'a Tid> + 'b {
+        self.blocks.iter().copied()
+    }
+}
+
+/// An edge from a block to one of its dominators.
+///
+/// Such an edge defines a natural loop.
+struct BackEdge<'a> {
+    /// Block the edge originates from.
+    tail: &'a Tid,
+    /// Block that controls the loop.
+    head: &'a Tid,
+}
+
+impl<'a> BackEdge<'a> {
+    /// Computes the natural loop of this back edge.
+    fn natural_loop(&self, cfg: &IntraproceduralCfg<'a>, rev_cfg: &Graph<'_>) -> NaturalLoop<'a> {
+        // Seeding `visited` with the head stops the backwards walk there.
+        let mut visited = BTreeSet::new();
+        visited.insert(
+            cfg.blk_tid_to_idx(self.head)
+                .expect("head of a back edge is a block of the CFG")
+                .0,
+        );
+
+        // Walk the reversed CFG starting at the end node of the tail block.
+        let mut stack = vec![
+            cfg.blk_tid_to_idx(self.tail)
+                .expect("tail of a back edge is a block of the CFG")
+                .1,
+        ];
+        while let Some(idx) = stack.pop() {
+            // A node may have been pushed several times before it was
+            // expanded; expand it only once.
+            if !visited.insert(idx) {
+                continue;
+            }
+            stack.extend(rev_cfg.neighbors(idx).filter(|n| !visited.contains(n)));
+        }
+
+        NaturalLoop {
+            head: self.head,
+            blocks: visited
+                .into_iter()
+                // Also removes artificial nodes.
+                .filter_map(|idx| cfg.idx_to_blk_tid(idx))
+                .collect(),
+        }
+    }
+}
+
+/// Returns the natural loops of this CFG.
+///
+/// One loop is returned per back edge, i.e., per edge whose target block
+/// dominates its source block.
+///
+/// # Panics
+///
+/// Panics if dominator relation was not computed.
+pub fn compute_natural_loops<'a>(cfg: &IntraproceduralCfg<'a>) -> Vec<NaturalLoop<'a>> {
+    let doms = cfg
+        .get_dominators()
+        .expect("dominators must be computed before natural loops");
+    let back_edges: Vec<BackEdge<'a>> = cfg
+        .graph()
+        .edge_references()
+        .filter_map(|e| {
+            // Due to the way we split blocks into two nodes each `Block` edge
+            // would be a back edge.
+            if matches!(e.weight(), Edge::Block) {
+                return None;
+            }
+
+            // Edges that touch a node without a block cannot be back edges.
+            let tail = cfg.idx_to_blk_tid(e.source())?;
+            let head = cfg.idx_to_blk_tid(e.target())?;
+
+            doms.get(tail)
+                .is_some_and(|tail_doms| tail_doms.contains(head))
+                .then_some(BackEdge { head, tail })
+        })
+        .collect();
+
+    let mut rev_cfg = cfg.graph().clone();
+    rev_cfg.reverse();
+
+    back_edges
+        .into_iter()
+        .map(|be| be.natural_loop(cfg, &rev_cfg))
+        .collect()
+}
diff --git a/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/properties.rs b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/properties.rs
new file mode 100644
index 000000000..890d7ccdd
--- /dev/null
+++ b/src/cwe_checker_lib/src/analysis/graph/intraprocedural_cfg/properties.rs
@@ -0,0 +1,47 @@
+//! Some simple CFG properties.
+
+use crate::analysis::graph::intraprocedural_cfg::IntraproceduralCfg;
+
+use petgraph::algo::connected_components;
+
+impl<'a> IntraproceduralCfg<'a> {
+    /// Returns the cyclomatic complexity of the given CFG.
+    ///
+    /// Computed as `E - N + 2P` where `E`, `N` and `P` are the number of
+    /// edges, nodes and connected components of the underlying graph. The
+    /// result saturates at `u32::MAX`.
+    pub fn cyclomatic_complexity(&self) -> u32 {
+        let p = connected_components(self.graph());
+        let e = self.graph().edge_count();
+        let n = self.graph().node_count();
+
+        // Add before subtracting: `e - n` underflows whenever the graph has
+        // fewer edges than nodes, whereas `e + 2p >= n` always holds because
+        // every component with `k` nodes has at least `k - 1` edges.
+        let complexity = e + 2 * p - n;
+
+        complexity.min(u32::MAX as usize) as u32
+    }
+
+    /// Returns a number indicating the likeliness that this CFG was obfuscated
+    /// by control flow flattening.
+    ///
+    /// See this [blog post] for more information. The score is between 0 and
+    /// 1_000_000 inclusive. It is 0 if the CFG has no blocks or no loops.
+    ///
+    /// # Panics
+    ///
+    /// Expects that loops and dominators are computed.
+    ///
+    /// [blog post]: https://synthesis.to/2021/03/03/flattening_detection.html
+    pub fn flattening_score(&self) -> usize {
+        const MAX_SCORE: usize = 1_000_000;
+
+        let doms = self.get_dominators().expect("Compute dominators first.");
+        let loops = self.get_natural_loops().expect("Compute loops first.");
+
+        let num_blocks = self.num_blocks();
+        if num_blocks == 0 {
+            // Avoid a division by zero for an empty CFG.
+            return 0;
+        }
+
+        // Compute the maximum number of blocks dominated by a block that
+        // controls a natural loop.
+        let max_dominated = loops
+            .iter()
+            .map(|l| {
+                let head = l.head();
+                // The relation maps each block to its dominators, so the
+                // blocks dominated by `head` are those whose set contains it.
+                doms.values()
+                    .filter(|dominators| dominators.contains(head))
+                    .count()
+            })
+            .max()
+            // Score is 0 if there are no loops.
+            .unwrap_or(0);
+
+        ((max_dominated * MAX_SCORE) / num_blocks).min(MAX_SCORE)
+    }
+}