[test] baseline failures #15632

Closed · wants to merge 10 commits

Changes from 2 commits
consensus/consensus-types/src/block.rs (10 changes: 9 additions & 1 deletion)

@@ -9,7 +9,7 @@ use crate::{
 };
 use anyhow::{bail, ensure, format_err};
 use aptos_bitvec::BitVec;
-use aptos_crypto::{bls12381, hash::CryptoHash, HashValue};
+use aptos_crypto::{bls12381::{self, Signature}, hash::CryptoHash, HashValue};
 use aptos_infallible::duration_since_epoch;
 use aptos_types::{
     account_address::AccountAddress,
@@ -87,6 +87,14 @@ impl Block {
         self.is_opt
     }

+    pub fn set_timestamp(&mut self, timestamp: u64) {
+        self.block_data.set_timestamp(timestamp);
+    }
+
+    pub fn set_signature(&mut self, signature: Signature) {
+        self.signature = Some(signature);
+    }
+
     pub fn set_quorum_cert(&mut self, qc: QuorumCert) {
         self.block_data.set_quorum_cert(qc);
     }
consensus/consensus-types/src/block_data.rs (4 changes: 4 additions & 0 deletions)

@@ -112,6 +112,10 @@ impl CryptoHash for BlockData {
 }

 impl BlockData {
+    pub fn set_timestamp(&mut self, timestamp: u64) {
+        self.timestamp_usecs = timestamp;
+    }
+
     pub fn set_quorum_cert(&mut self, qc: QuorumCert) {
         self.quorum_cert = qc;
     }
consensus/consensus-types/src/proposal_msg.rs (9 changes: 9 additions & 0 deletions)

@@ -4,6 +4,7 @@

 use crate::{block::Block, common::Author, proof_of_store::ProofCache, sync_info::SyncInfo};
 use anyhow::{anyhow, ensure, format_err, Context, Ok, Result};
+use aptos_crypto::bls12381::Signature;
 use aptos_short_hex_str::AsShortHexStr;
 use aptos_types::validator_verifier::ValidatorVerifier;
 use serde::{Deserialize, Serialize};
@@ -26,6 +27,14 @@ impl ProposalMsg {
         }
     }

+    pub fn set_timestamp(&mut self, timestamp: u64) {
+        self.proposal.set_timestamp(timestamp);
+    }
+
+    pub fn set_signature(&mut self, signature: Signature) {
+        self.proposal.set_signature(signature);
+    }
+
     pub fn epoch(&self) -> u64 {
         self.proposal.epoch()
     }
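Taken together, the new setters support a mutate-and-re-sign flow: timestamp_usecs feeds into BlockData's CryptoHash, so changing the timestamp invalidates the existing proposal signature and the block must be re-signed. A minimal sketch of that flow, mirroring how the round_manager change below uses it (illustration only; proposal_msg and safety_rules are assumed to be in scope):

    // Sketch: bump the timestamp, then re-sign so the proposal verifies again.
    let mut msg = proposal_msg.clone();
    msg.set_timestamp(msg.proposal().block_data().timestamp_usecs() + 1);
    let signature = safety_rules
        .lock()
        .sign_proposal(msg.proposal().block_data())?; // signature over the new hash
    msg.set_signature(signature);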
consensus/src/round_manager.rs (43 changes: 42 additions & 1 deletion)

@@ -459,7 +459,7 @@ impl RoundManager {
             sync_info,
             network.clone(),
             proposal_generator,
-            safety_rules,
+            safety_rules.clone(),
             proposer_election,
             parent_id,
         )
@@ -474,9 +474,19 @@
         {
             if Self::check_whether_to_inject_reconfiguration_error() {
                 Self::attempt_to_inject_reconfiguration_error(
                     epoch_state.clone(),
                     network.clone(),
                     &proposal_msg,
                 )
                 .await?;
             }
+
+            if Self::check_whether_to_equivocate() {
+                Self::attempt_to_equivocate(
+                    epoch_state,
+                    network.clone(),
+                    &proposal_msg,
+                    safety_rules.clone()
+                )
+                .await?;
+            }
@@ -1922,6 +1932,12 @@ impl RoundManager {
         false
     }

+    #[cfg(feature = "failpoints")]
+    fn check_whether_to_equivocate() -> bool {
+        fail_point!("consensus::leader_equivocation", |_| true);
+        false
+    }
+
     /// Given R1 <- B2 if R1 has the reconfiguration txn, we inject error on B2 if R1.round + 1 = B2.round
     /// Direct suffix is checked by parent.has_reconfiguration && !parent.parent.has_reconfiguration
     /// The error is injected by sending proposals to half of the validators to force a timeout.
@@ -1956,4 +1972,29 @@
             Ok(())
         }
     }
+
+    #[cfg(feature = "failpoints")]
+    async fn attempt_to_equivocate(
+        epoch_state: Arc<EpochState>,
+        network: Arc<NetworkSender>,
+        proposal_msg: &ProposalMsg,
+        safety_rules: Arc<Mutex<MetricsSafetyRules>>,
+    ) -> anyhow::Result<()> {
+        info!("[Test] Leader of epoch {} round {} equivocates", epoch_state.epoch, proposal_msg.proposal().round());
+
+        let all_peers: Vec<_> = epoch_state
+            .verifier
+            .get_ordered_account_addresses_iter()
+            .collect();
+        let mut timestamp = proposal_msg.proposal().block_data().timestamp_usecs();
+        for peer in all_peers {
+            timestamp += 1;
+            let mut modified_proposal_msg = proposal_msg.clone();
+            modified_proposal_msg.set_timestamp(timestamp);
+            let signature = safety_rules.lock().sign_proposal(modified_proposal_msg.proposal().block_data())?;
+            modified_proposal_msg.set_signature(signature);
+            network.send_proposal(modified_proposal_msg.clone(), vec![peer]).await;
+        }
+        Err(anyhow::anyhow!("Injected leader equivocation"))
+    }
 }
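The fail_point!("consensus::leader_equivocation", |_| true) gate above is inert by default: check_whether_to_equivocate returns false until an action is configured for that failpoint at runtime. A hedged sketch of the runtime wiring, assuming the standard fail-rs API that the fail_point! macro comes from (not part of this diff):

    // Sketch: configuring an action makes the |_| true closure fire.
    fail::cfg("consensus::leader_equivocation", "return").unwrap();    // fire on every call
    fail::cfg("consensus::leader_equivocation", "50%return").unwrap(); // fire on ~half of calls
    fail::remove("consensus::leader_equivocation");                    // back to the default false path

The smoke test below configures the probabilistic "50%return" form; the performance test uses plain "return".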
testsuite/forge-cli/src/suites/realistic_environment.rs (5 changes: 2 additions & 3 deletions)

@@ -61,6 +61,7 @@ pub(crate) fn realistic_env_sweep_wrap(
         .with_initial_fullnode_count(num_fullnodes)
         .with_validator_override_node_config_fn(Arc::new(|config, _| {
             config.execution.processed_transactions_detailed_counters = true;
+            config.api.failpoints_enabled = true;
         }))
         .add_network_test(test)
         // Test inherits the main EmitJobRequest, so update here for more precise latency measurements
@@ -82,15 +83,13 @@ pub(crate) fn realistic_env_sweep_wrap(
 pub(crate) fn realistic_env_load_sweep_test() -> ForgeConfig {
     realistic_env_sweep_wrap(10, 5, LoadVsPerfBenchmark {
         test: Box::new(PerformanceBenchmark),
-        workloads: Workloads::TPS(vec![1000, 5000, 10000, 12000, 13000, 14000, 15000]),
+        workloads: Workloads::TPS(vec![1000, 2500, 5000, 7500, 10000]),
         criteria: [
             (95, 0.9, 1.1, 1.2, 0),
             (95, 0.9, 1.1, 1.2, 0),
             (95, 0.9, 1.1, 1.2, 0),
             (95, 0.9, 1.1, 1.2, 0),
             (95, 0.9, 1.1, 1.2, 0),
-            (95, 0.9, 1.1, 1.2, 0),
-            (95, 0.9, 1.1, 1.2, 0),
         ]
         .into_iter()
         .map(
testsuite/forge.py (2 changes: 1 addition & 1 deletion)

@@ -1560,7 +1560,7 @@ def test(
     asyncio.run(forge_cluster.write(context.shell))

     # These features and profile flags are set as strings
-    enable_failpoints = forge_enable_failpoints == "true"
+    enable_failpoints = True
     enable_performance_profile = forge_enable_performance == "true"

     # In the below, assume that the image is pushed to all registries
testsuite/smoke-test/src/consensus/consensus_fault_tolerance.rs (41 changes: 41 additions & 0 deletions)

@@ -347,6 +347,47 @@ async fn test_execution_retry() {
     .unwrap();
 }

+#[tokio::test]
+async fn test_fault_tolerance_of_leader_equivocation() {
+    let num_validators = 4;
+
+    let swarm = create_swarm(num_validators, 1).await;
+    let (validator_clients, public_info) = {
+        (
+            swarm.get_validator_clients_with_names(),
+            swarm.aptos_public_info(),
+        )
+    };
+    test_consensus_fault_tolerance(
+        validator_clients,
+        public_info,
+        3,
+        5.0,
+        1,
+        Box::new(FailPointFailureInjection::new(Box::new(move |cycle, _| {
+            (
+                vec![(
+                    cycle % num_validators,
+                    "consensus::leader_equivocation".to_string(),
+                    format!("{}%return", 50),
+                )],
+                true,
+            )
+        }))),
+        Box::new(
+            move |_, executed_epochs, executed_rounds, executed_transactions, _, _| {
+                successful_criteria(executed_epochs, executed_rounds, executed_transactions);
+                Ok(())
+            },
+        ),
+        true,
+        false,
+    )
+    .await
+    .unwrap();
+    panic!("test_fault_tolerance_of_leader_equivocation");
+}
+
 #[tokio::test]
 async fn test_fault_tolerance_of_network_send() {
     // Randomly increase network failure rate, until network halts, and check that it comes back afterwards.

Review comment (Graphite Reviewer), on the panic! line: This panic! statement causes the test to fail unconditionally, regardless of whether the test logic succeeds or fails. Since the test appears to be validating leader equivocation fault tolerance, the test should be allowed to complete normally and verify the expected behavior through its assertions.
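Following the comment, a minimal sketch of the test tail with the unconditional failure removed (assumed fix, not part of the PR):

        true,
        false,
    )
    .await
    .unwrap();
    // No panic! needed: test_consensus_fault_tolerance already applied
    // successful_criteria above, so reaching this point means the run passed.
}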
testsuite/testcases/src/performance_test.rs (35 changes: 33 additions & 2 deletions)

@@ -2,8 +2,11 @@
 // Parts of the project are originally copyright © Meta Platforms, Inc.
 // SPDX-License-Identifier: Apache-2.0

+use std::{sync::Arc, time::Duration};
+use anyhow::{anyhow, bail, Context};
+
 use crate::NetworkLoadTest;
-use aptos_forge::{NetworkContextSynchronizer, NetworkTest, Result, Test};
+use aptos_forge::{NetworkContextSynchronizer, NetworkTest, Result, Swarm, SwarmExt, Test, TestReport};
 use async_trait::async_trait;

 pub struct PerformanceBenchmark;
@@ -14,7 +17,35 @@ impl Test for PerformanceBenchmark {
     }
 }

-impl NetworkLoadTest for PerformanceBenchmark {}
+#[async_trait]
+impl NetworkLoadTest for PerformanceBenchmark {
+    async fn test(
+        &self,
+        swarm: Arc<tokio::sync::RwLock<Box<dyn Swarm>>>,
+        _report: &mut TestReport,
+        duration: Duration,
+    ) -> Result<()> {
+        let validators = { swarm.read().await.get_validator_clients_with_names() };
+        // 10 vals, test 1,2,3 failures
+        let num_bad_leaders = 3;
+        for (name, validator) in validators[..num_bad_leaders].iter() {
+            validator
+                .set_failpoint(
+                    "consensus::leader_equivocation".to_string(),
+                    "return".to_string(),
+                )
+                .await
+                .map_err(|e| {
+                    anyhow!(
+                        "set_failpoint to set consensus leader equivocation on {} failed, {:?}",
+                        name,
+                        e
+                    )
+                })?;
+        };
+        Ok(())
+    }
+}

 #[async_trait]
 impl NetworkTest for PerformanceBenchmark {

Review comment (Graphite Reviewer), on the new NetworkLoadTest::test: The test currently returns immediately after setting the failpoints, without waiting for the specified duration. This means the test may complete before the injected failures have time to manifest and be observed. Adding tokio::time::sleep(duration).await before returning would ensure the test runs for the intended duration while the failpoints are active.
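Per the comment, a hedged sketch of the suggested fix, keeping the failpoints active for the whole load window before returning (assumed change, not in the PR):

        // Let the injected equivocation run for the full test duration.
        tokio::time::sleep(duration).await;
        Ok(())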