Skip to content

Commit 74278b3

Browse files
ekrembalatacannmmtftrCopilot
authored
Better error observability (#1172)
* do not give metrics error if fetching stopped tasks fails * init better logging * do not return error from optional message fetcher * fmt * change log level * check number sent messages in aggregator rpc streams, log errors * change comment * catch more errors * fix wrong num needed sigs issue * better errors on nonce aggregation * add partial signature verification * verifty partial sigs only when env var is true * change var naming * update docs * add logging * show all partial verify errors, not only the first * fix tracing errors * fix typos * refactor, better err formatting * show verifier pks instead of index (because its ordered) in several error messages * Update core/src/rpc/aggregator.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * better error logging in aggregator setup() * adddress copilot comments * Update core/src/rpc/aggregator.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * ignore test * fix grammar * monitor operator deposit_sign * add observing errors of every verifier in opt_payout_sign, add timeout to op_payout_sign * Update core/src/rpc/verifier.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * cargo fmt * rename var * address comments * rename fn * refactor flatten_join_named_results * cargo fmt * adjust test * update test again * add serial to test * increase timeout * skip a compatibility test in debug mode --------- Co-authored-by: atacann <atacaniyidogan@gmail.com> Co-authored-by: Mehmet Efe Akça <efeakca@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: atacann <111396231+atacann@users.noreply.github.com>
1 parent bfad983 commit 74278b3

File tree

13 files changed

+610
-245
lines changed

13 files changed

+610
-245
lines changed

core/src/aggregator.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,9 @@ impl Aggregator {
429429
debug_span!("get_deposit_keys", id=%OperatorId(operator_xonly_pk)),
430430
)
431431
.await
432-
.wrap_err(Status::internal("Operator key retrieval failed"))?
432+
.wrap_err(Status::internal(format!(
433+
"Operator {operator_xonly_pk} key retrieval failed"
434+
)))?
433435
.into_inner();
434436

435437
// A send error means that all receivers are closed,

core/src/compatibility.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,8 +276,15 @@ mod tests {
276276
assert!(msg.contains("Security council mismatch"));
277277
}
278278

279-
#[tokio::test]
279+
// serial test because it calculates sha256 of the bitvm cache for all actors
280+
#[tokio::test(flavor = "multi_thread")]
280281
async fn test_get_compatibility_data_from_entities() {
282+
// Skip this test if running in debug mode, hashing bitvm cache is slow
283+
if cfg!(debug_assertions) {
284+
eprintln!("Skipping test_get_compatibility_data_from_entities in debug mode");
285+
return;
286+
}
287+
281288
let mut config = create_test_config_with_thread_name().await;
282289
let _regtest = create_regtest_rpc(&mut config).await;
283290
let actors = create_actors::<MockCitreaClient>(&config).await;

core/src/constants.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,20 @@ mod timeout {
6565
pub const OPERATOR_SIGS_STREAM_CREATION_TIMEOUT: Duration = Duration::from_secs(300); // 5 minutes
6666
pub const DEPOSIT_FINALIZE_STREAM_CREATION_TIMEOUT: Duration = Duration::from_secs(300); // 5 minutes
6767

68+
pub const SETUP_COMPLETION_TIMEOUT: Duration = Duration::from_secs(1800); // 30 minutes
69+
6870
pub const PIPELINE_COMPLETION_TIMEOUT: Duration = Duration::from_secs(3600); // 60 minutes
6971
pub const OPERATOR_SIGS_TIMEOUT: Duration = Duration::from_secs(1200); // 20 minutes
7072
pub const SEND_OPERATOR_SIGS_TIMEOUT: Duration = Duration::from_secs(600); // 10 minutes
7173
pub const DEPOSIT_FINALIZATION_TIMEOUT: Duration = Duration::from_secs(2400); // 40 minutes
7274

75+
pub const OPTIMISTIC_PAYOUT_TIMEOUT: Duration = Duration::from_secs(300); // 5 minutes
76+
7377
pub const RESTART_BACKGROUND_TASKS_TIMEOUT: Duration = Duration::from_secs(120);
7478

7579
pub const ENTITY_STATUS_POLL_TIMEOUT: Duration = Duration::from_secs(120);
7680

77-
pub const ENTITY_COMP_DATA_POLL_TIMEOUT: Duration = Duration::from_secs(120); // 2 minutes
81+
pub const ENTITY_COMP_DATA_POLL_TIMEOUT: Duration = Duration::from_secs(240); // 4 minutes
7882

7983
pub const PUBLIC_KEY_COLLECTION_TIMEOUT: Duration = Duration::from_secs(30);
8084

core/src/errors.rs

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -252,27 +252,22 @@ impl From<Status> for BridgeError {
252252

253253
impl From<BridgeError> for tonic::Status {
254254
fn from(val: BridgeError) -> Self {
255-
let eyre_report = val.into_eyre();
256-
257-
// eyre::Report can cast any error in the chain to a Status, so we use its downcast method to get the first Status.
258-
eyre_report.downcast::<Status>().unwrap_or_else(|report| {
259-
// We don't want this case to happen, all casts to Status should contain a Status that contains a user-facing error message.
260-
tracing::error!(
261-
"Returning internal error on RPC call, full error: {:?}",
262-
report
263-
);
264-
265-
let mut status = tonic::Status::internal(report.to_string());
266-
status.set_source(Into::into(
267-
Into::<Box<dyn std::error::Error + Send + Sync>>::into(report),
268-
));
269-
status
270-
})
255+
let err = format!("{val:#}");
256+
// delete escape characters
257+
let flattened = err
258+
.replace("\\n", " ") // remove escaped newlines
259+
.replace("\n", " ") // remove real newlines
260+
.replace("\"", "") // delete quotes
261+
.replace("\\", ""); // remove any remaining backslashes
262+
let whitespace_removed = flattened.split_whitespace().collect::<Vec<_>>().join(" ");
263+
tonic::Status::internal(whitespace_removed)
271264
}
272265
}
273266

274267
#[cfg(test)]
275268
mod tests {
269+
use eyre::Context;
270+
276271
use super::*;
277272
#[test]
278273
fn test_downcast() {
@@ -290,14 +285,18 @@ mod tests {
290285
}
291286

292287
#[test]
293-
fn test_status_in_chain_cast_properly() {
294-
let err: BridgeError = eyre::eyre!("Some problem")
295-
.wrap_err(tonic::Status::deadline_exceeded("Some timer expired"))
296-
.wrap_err("Something else went wrong")
297-
.into();
288+
fn test_status_shows_all_errors_in_chain() {
289+
let err: BridgeError = Err::<(), BridgeError>(BridgeError::BitcoinRPC(
290+
BitcoinRPCError::TransactionNotConfirmed,
291+
))
292+
.wrap_err(tonic::Status::deadline_exceeded("Error A"))
293+
.wrap_err("Error B")
294+
.unwrap_err()
295+
.into();
298296

299-
let status: Status = err.into_status();
300-
assert_eq!(status.code(), tonic::Code::DeadlineExceeded);
301-
assert_eq!(status.message(), "Some timer expired");
297+
let status: Status = err.into();
298+
assert!(status.message().contains("Error A"));
299+
assert!(status.message().contains("Error B"));
300+
assert!(status.message().contains("Bitcoin"));
302301
}
303302
}

core/src/musig2.rs

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//!
33
//! Helper functions for the MuSig2 signature scheme.
44
5-
use crate::{bitvm_client::SECP, errors::BridgeError};
5+
use crate::{aggregator::VerifierId, bitvm_client::SECP, errors::BridgeError};
66
use bitcoin::{
77
hashes::Hash,
88
key::Keypair,
@@ -195,6 +195,7 @@ pub fn aggregate_partial_signatures(
195195
.unwrap_or(false);
196196

197197
if enable_partial_sig_verification {
198+
let mut partial_sig_verification_errors = Vec::new();
198199
for ((partial_sig, pub_nonce), pub_key) in
199200
partial_sigs_and_nonces.into_iter().zip(pks.into_iter())
200201
{
@@ -205,16 +206,18 @@ pub fn aggregate_partial_signatures(
205206
*pub_nonce,
206207
to_secp_pk(pub_key),
207208
) {
208-
tracing::error!(
209-
"MuSig2 Error: partial signature verification failed for pub key: {}",
210-
pub_key
211-
);
212-
return Err(BridgeError::from(eyre::eyre!(
213-
"MuSig2 Error: partial signature verification failed for pub key: {}",
214-
pub_key
215-
)));
209+
let verifier_id = VerifierId(pub_key);
210+
partial_sig_verification_errors.push(format!("{verifier_id}"));
216211
}
217212
}
213+
if !partial_sig_verification_errors.is_empty() {
214+
let error_msg = format!(
215+
"MuSig2 Error: partial signature verification failed for verifiers: {}",
216+
partial_sig_verification_errors.join(", ")
217+
);
218+
tracing::error!("{error_msg}");
219+
return Err(BridgeError::from(eyre::eyre!(error_msg)));
220+
}
218221
}
219222
let final_sig = session.partial_sig_agg(&partial_sigs);
220223

core/src/operator.rs

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ use crate::errors::BridgeError;
2020
use crate::extended_bitcoin_rpc::ExtendedBitcoinRpc;
2121

2222
use crate::metrics::L1SyncStatusProvider;
23-
use crate::rpc::clementine::EntityStatus;
23+
use crate::rpc::clementine::{EntityStatus, StoppedTasks};
2424
use crate::task::entity_metric_publisher::{
2525
EntityMetricPublisher, ENTITY_METRIC_PUBLISHER_INTERVAL,
2626
};
@@ -225,7 +225,16 @@ where
225225
}
226226

227227
pub async fn get_current_status(&self) -> Result<EntityStatus, BridgeError> {
228-
let stopped_tasks = self.background_tasks.get_stopped_tasks().await?;
228+
let stopped_tasks = match self.background_tasks.get_stopped_tasks().await {
229+
Ok(stopped_tasks) => stopped_tasks,
230+
Err(e) => {
231+
tracing::error!("Failed to get stopped tasks: {:?}", e);
232+
StoppedTasks {
233+
stopped_tasks: vec![format!("Stopped tasks fetch failed {:?}", e)],
234+
}
235+
}
236+
};
237+
229238
// Determine if automation is enabled
230239
let automation_enabled = cfg!(feature = "automation");
231240

@@ -479,13 +488,14 @@ where
479488
pub async fn deposit_sign(
480489
&self,
481490
mut deposit_data: DepositData,
482-
) -> Result<mpsc::Receiver<schnorr::Signature>, BridgeError> {
491+
) -> Result<mpsc::Receiver<Result<schnorr::Signature, BridgeError>>, BridgeError> {
483492
self.citrea_client
484493
.check_nofn_correctness(deposit_data.get_nofn_xonly_pk()?)
485494
.await?;
486495

487496
let mut tweak_cache = TweakCache::default();
488497
let (sig_tx, sig_rx) = mpsc::channel(constants::DEFAULT_CHANNEL_SIZE);
498+
let monitor_err_sender = sig_tx.clone();
489499

490500
let deposit_blockhash = self
491501
.rpc
@@ -511,14 +521,15 @@ where
511521
Some(&mut tweak_cache),
512522
)?;
513523

514-
if sig_tx.send(sig).await.is_err() {
515-
break;
516-
}
524+
sig_tx
525+
.send(Ok(sig))
526+
.await
527+
.wrap_err("Failed to send signature in operator deposit sign")?;
517528
}
518529

519530
Ok::<(), BridgeError>(())
520531
});
521-
monitor_standalone_task(handle, "Operator deposit sign");
532+
monitor_standalone_task(handle, "Operator deposit sign", monitor_err_sender);
522533

523534
Ok(sig_rx)
524535
}

0 commit comments

Comments
 (0)