Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions pkg/tbtc/signer/benches/phase5_roast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,18 @@ fn ensure_benchmark_environment() {
std::env::set_var("TBTC_SIGNER_MAX_SESSIONS", "200000");
std::env::set_var("TBTC_SIGNER_ALLOW_BOOTSTRAP", "true");
std::env::set_var("TBTC_SIGNER_ALLOW_BENCH_RESTART_HOOK", "true");
// The signer treats a missing profile as production, and the default
// `env` state-key provider requires an encryption key for persistence.
// Seed both (and pin the provider) so the README-documented
// `cargo bench --features bench-restart-hook --bench phase5_roast`
// runs in a clean shell without any pre-set TBTC_SIGNER_* variables;
// otherwise the first RunDkg persist fails.
std::env::set_var("TBTC_SIGNER_PROFILE", "development");
std::env::set_var("TBTC_SIGNER_STATE_KEY_PROVIDER", "env");
std::env::set_var(
"TBTC_SIGNER_STATE_ENCRYPTION_KEY_HEX",
"0c9258935f0a30c065befcd746cb1564e9f3c91936c0f0f1c78853fa2d6713dc",
);
});
}

Expand Down
24 changes: 21 additions & 3 deletions pkg/tbtc/signer/src/api.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use serde::{Deserialize, Serialize};
use zeroize::Zeroizing;

#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
pub struct DkgParticipant {
Expand Down Expand Up @@ -129,16 +130,31 @@ pub struct NewSigningPackageResult {
pub signing_package_hex: String,
}

#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
#[derive(Clone, Deserialize, PartialEq, Eq, Serialize)]
pub struct SignShareRequest {
pub signing_package_hex: String,
/// Secret one-time nonces returned by `GenerateNoncesAndCommitmentsResult`.
///
/// This stateless endpoint cannot remember consumed nonces across FFI
/// calls. The caller is cryptographically responsible for single use.
pub nonces_hex: String,
/// Wrapped in `Zeroizing` so the deserialized secret is wiped from the heap
/// on drop rather than lingering in freed memory after the share is produced.
pub nonces_hex: Zeroizing<String>,
pub key_package_identifier: String,
pub key_package_hex: String,
/// Secret private key-package material; `Zeroizing` so it is wiped on drop.
pub key_package_hex: Zeroizing<String>,
}

// Hand-written `Debug`: a derived impl would print the secret nonce and
// key-package hex verbatim, so those two fields are replaced by a placeholder.
impl std::fmt::Debug for SignShareRequest {
    /// Formats the request for diagnostics, showing `signing_package_hex` and
    /// `key_package_identifier` as-is while masking `nonces_hex` and
    /// `key_package_hex`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Single placeholder shared by every secret field.
        const REDACTED: &str = "<redacted>";

        let mut builder = f.debug_struct("SignShareRequest");
        builder.field("signing_package_hex", &self.signing_package_hex);
        builder.field("nonces_hex", &REDACTED);
        builder.field("key_package_identifier", &self.key_package_identifier);
        builder.field("key_package_hex", &REDACTED);
        builder.finish()
    }
}

#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize)]
Expand Down Expand Up @@ -799,6 +815,8 @@ pub struct InitSignerConfigRequest {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub state_corrupt_backup_limit: Option<u64>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub permit_plaintext_state_rollback: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_sessions: Option<u64>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_live_interactive_sessions: Option<u64>,
Expand Down
3 changes: 3 additions & 0 deletions pkg/tbtc/signer/src/engine/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,9 @@ pub(crate) const TBTC_SIGNER_ADMISSION_ALLOWLIST_IDENTIFIERS_ENV: &str =
pub(crate) const TBTC_SIGNER_ENFORCE_SIGNING_POLICY_FIREWALL_ENV: &str =
"TBTC_SIGNER_ENFORCE_SIGNING_POLICY_FIREWALL";

/// Setting name for the explicit opt-in that allows loading legacy plaintext
/// (unauthenticated) signer state as an emergency rollback. The value is read
/// through `signer_env_var` and can also be supplied at init time via
/// `InitSignerConfigRequest::permit_plaintext_state_rollback`. The opt-in is
/// only honored outside the production profile and only in builds compiled
/// with `debug_assertions`.
pub(crate) const TBTC_SIGNER_PERMIT_PLAINTEXT_STATE_ROLLBACK_ENV: &str =
"TBTC_SIGNER_PERMIT_PLAINTEXT_STATE_ROLLBACK";

pub(crate) const TBTC_SIGNER_POLICY_ALLOWED_SCRIPT_CLASSES_ENV: &str =
"TBTC_SIGNER_POLICY_ALLOWED_SCRIPT_CLASSES";

Expand Down
8 changes: 8 additions & 0 deletions pkg/tbtc/signer/src/engine/init_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,14 @@ pub(crate) fn config_values_from_request(
TBTC_SIGNER_ENFORCE_SIGNING_POLICY_FIREWALL_ENV,
request.enforce_signing_policy_firewall,
);
// Make the emergency plaintext-state rollback opt-in reachable for hosts that
// configure via init-time config (where signer_env_var reads the installed
// config, not the process environment), not just raw env.
insert_bool(
&mut values,
TBTC_SIGNER_PERMIT_PLAINTEXT_STATE_ROLLBACK_ENV,
request.permit_plaintext_state_rollback,
);
insert_bool(
&mut values,
TBTC_SIGNER_ENABLE_AUTO_QUARANTINE_ENV,
Expand Down
92 changes: 87 additions & 5 deletions pkg/tbtc/signer/src/engine/lifecycle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

use super::*;

/// Maximum number of records retained in a session's `refresh_history`.
///
/// When a refresh pushes the history past this length, the oldest records are
/// dropped, which bounds persisted-state size for long-lived or frequently
/// refreshed sessions. The same limit bounds the stale-fingerprint detection
/// window: a retry whose history record has already been pruned (i.e. older
/// than this many refreshes) is no longer recognized as stale.
const MAX_REFRESH_HISTORY: usize = 256;

pub(crate) fn canary_max_start_sign_round_p95_ms() -> u64 {
signer_env_var(TBTC_SIGNER_CANARY_MAX_START_SIGN_ROUND_P95_MS_ENV)
.and_then(|value| value.trim().parse::<u64>().ok())
Expand Down Expand Up @@ -103,7 +109,7 @@ pub fn refresh_cadence_status(

Ok(RefreshCadenceStatusResult {
session_id: request.session_id,
refresh_count: session.refresh_history.len() as u64,
refresh_count: session.refresh_count,
last_refresh_epoch: last_refresh_record
.map(|record| record.refresh_epoch)
.unwrap_or(0),
Expand Down Expand Up @@ -393,15 +399,27 @@ pub fn refresh_shares(request: RefreshSharesRequest) -> Result<RefreshSharesResu

if let Some(existing) = &session.refresh_request_fingerprint {
if existing == &request_fingerprint {
// Idempotent replay of the *same* (most-recent) refresh request:
// return the cached result.
return session
.refresh_result
.clone()
.ok_or_else(|| EngineError::Internal("missing refresh cache".to_string()));
}

return Err(EngineError::SessionConflict {
session_id: request.session_id,
});
// A fingerprint we have already accepted before (but which is no
// longer the most recent) is a stale / out-of-order retry, not a new
// refresh. Reject it rather than re-deriving the older share set and
// bumping the epoch forward, which would roll the session back behind
// a newer refresh. A genuinely new fingerprint falls through to
// perform the refresh (supporting repeatable periodic reshares).
if session.refresh_history.iter().any(|record| {
record.request_fingerprint.as_deref() == Some(request_fingerprint.as_str())
}) {
return Err(EngineError::SessionConflict {
session_id: request.session_id.clone(),
});
}
}
}
ensure_session_insert_capacity(&guard.sessions, &request.session_id)?;
Expand Down Expand Up @@ -446,14 +464,78 @@ pub fn refresh_shares(request: RefreshSharesRequest) -> Result<RefreshSharesResu
),
});
}
session.refresh_request_fingerprint = Some(request_fingerprint);
// Preserve the previously-accepted fingerprint before overwriting it. If the
// last accepted refresh predates RefreshHistoryRecord.request_fingerprint
// (loaded from legacy state, where history records deserialize with None), its
// fingerprint lives only in refresh_request_fingerprint; backfill it onto the
// most-recent history record so a delayed retry of it is still recognized as
// stale instead of being re-executed as a new refresh.
if let Some(previous_fingerprint) = session.refresh_request_fingerprint.clone() {
let already_tracked = session.refresh_history.iter().any(|record| {
record.request_fingerprint.as_deref() == Some(previous_fingerprint.as_str())
});
if !already_tracked {
if let Some(last) = session.refresh_history.last_mut() {
if last.request_fingerprint.is_none() {
last.request_fingerprint = Some(previous_fingerprint);
}
} else {
// Legacy/degraded state can carry a fingerprint with an EMPTY
// history (refresh_history postdates refresh_request_fingerprint),
// so there is no record to backfill onto. Synthesize one carrying
// the fingerprint so a delayed retry is still recognized for
// stale-retry rejection instead of being re-executed as a new
// refresh (which would advance the epoch). Prefer the cached
// result's epoch/share_count; when the result is absent (a
// truncated/legacy blob that kept only the fingerprint, or a
// corrupt state where refresh_result deserialized to None) fall
// back to an epoch one below the new refresh so the history stays
// strictly increasing, and a zero share_count. refresh_epoch_counter
// is persisted, so a prior accepted refresh implies refresh_epoch >= 2
// and the fallback stays non-zero.
let previous_result = session.refresh_result.clone();
let synthesized_epoch = previous_result
.as_ref()
.map(|previous| previous.refresh_epoch)
.filter(|&epoch| epoch != 0 && epoch < refresh_epoch)
.unwrap_or_else(|| refresh_epoch.saturating_sub(1).max(1));
let synthesized_share_count = previous_result
.as_ref()
.map(|previous| previous.new_shares.len().min(u16::MAX as usize) as u16)
.unwrap_or(0);
session.refresh_history.push(RefreshHistoryRecord {
refresh_epoch: synthesized_epoch,
refreshed_at_unix: now_unix(),
share_count: synthesized_share_count,
key_group: session.dkg_result.as_ref().map(|dkg| dkg.key_group.clone()),
request_fingerprint: Some(previous_fingerprint),
});
}
}
}
// Monotonic total refresh count, independent of refresh_history pruning;
// backfilled from the retained history length for sessions written before
// this field existed.
session.refresh_count = session
.refresh_count
.max(session.refresh_history.len() as u64)
.saturating_add(1);
session.refresh_request_fingerprint = Some(request_fingerprint.clone());
session.refresh_result = Some(result.clone());
session.refresh_history.push(RefreshHistoryRecord {
refresh_epoch,
refreshed_at_unix: now_unix(),
share_count: result.new_shares.len().min(u16::MAX as usize) as u16,
key_group: session.dkg_result.as_ref().map(|dkg| dkg.key_group.clone()),
request_fingerprint: Some(request_fingerprint),
});
// Bound per-session history growth (state-at-rest size + stale-detection
// window). Keep the most recent records; epochs stay strictly increasing so
// refresh_history_continuity_preserved still holds.
if session.refresh_history.len() > MAX_REFRESH_HISTORY {
let excess = session.refresh_history.len() - MAX_REFRESH_HISTORY;
session.refresh_history.drain(0..excess);
}
persist_engine_state_to_storage(&guard)?;
record_hardening_telemetry(|telemetry| {
telemetry.refresh_shares_success_total =
Expand Down
44 changes: 44 additions & 0 deletions pkg/tbtc/signer/src/engine/persistence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ pub(crate) struct PersistedSessionState {
pub(crate) refresh_result: Option<RefreshSharesResult>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub(crate) refresh_history: Vec<RefreshHistoryRecord>,
#[serde(default)]
pub(crate) refresh_count: u64,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub(crate) emergency_rekey_event: Option<EmergencyRekeyEvent>,
// Phase 7.1 interactive consumption markers - the ONLY durable
Expand Down Expand Up @@ -882,6 +884,25 @@ pub(crate) fn decode_encrypted_state_envelope(
.map_err(|e| EngineError::Internal(format!("failed to decode decrypted signer state: {e}")))
}

/// Reports whether the legacy unencrypted (plaintext) state format may be loaded.
///
/// Plaintext state is UNAUTHENTICATED: anyone able to write the state file
/// could forge it (cleared replay markers, attacker key material) without
/// holding the state-encryption key. It is therefore an emergency-rollback-only
/// path, and all three of the following must hold for it to be accepted:
///
/// 1. the build was compiled with `debug_assertions` enabled;
/// 2. the runtime signer profile is not production (the load-bearing guard: a
///    production profile never accepts plaintext, regardless of build);
/// 3. the explicit opt-in flag is set to a truthy value.
///
/// A release build compiled with `debug-assertions = on` passes the first check
/// but still needs the non-production profile and the opt-in flag.
fn legacy_plaintext_state_permitted() -> bool {
    // Compile-time gate: builds without debug assertions never allow plaintext.
    if !cfg!(debug_assertions) {
        return false;
    }

    // Runtime gate: production never accepts plaintext state.
    if signer_profile_is_production() {
        return false;
    }

    // Explicit opt-in; an unset flag counts as "not permitted".
    match signer_env_var(TBTC_SIGNER_PERMIT_PLAINTEXT_STATE_ROLLBACK_ENV) {
        Some(raw_value) => truthy_env_flag(&raw_value),
        None => false,
    }
}

pub(crate) fn decode_persisted_state_storage_format(
bytes: &[u8],
) -> Result<PersistedStateStorageFormat, EngineError> {
Expand All @@ -895,6 +916,21 @@ pub(crate) fn decode_persisted_state_storage_format(
});
}

// The bytes are not an encrypted envelope. Only fall back to the legacy
// UNAUTHENTICATED plaintext format on the gated emergency-rollback path;
// otherwise refuse, so an attacker who can write the state file cannot
// bypass the AEAD envelope (forged replay markers / key material) without
// the state-encryption key.
if !legacy_plaintext_state_permitted() {
return Err(EngineError::Internal(
"refusing to load unauthenticated plaintext signer state; an \
encrypted state envelope is required (legacy plaintext is an \
emergency-rollback-only path, disabled in production and release \
builds)"
.to_string(),
));
}

let persisted = serde_json::from_slice::<PersistedEngineState>(bytes).map_err(|e| {
EngineError::Internal(format!("failed to decode signer state file payload: {e}"))
})?;
Expand Down Expand Up @@ -1442,6 +1478,13 @@ impl TryFrom<PersistedSessionState> for SessionState {
tx_result: persisted.tx_result,
refresh_request_fingerprint: persisted.refresh_request_fingerprint,
refresh_result: persisted.refresh_result,
// Backfill from history length for state written before refresh_count
// existed (serde defaults it to 0), so refresh_cadence_status reports
// the true total immediately after upgrade rather than 0 until the next
// refresh. Evaluated before refresh_history is moved below.
refresh_count: persisted
.refresh_count
.max(persisted.refresh_history.len() as u64),
refresh_history: persisted.refresh_history,
emergency_rekey_event: persisted.emergency_rekey_event,
// Live interactive state never restores: nonces are gone by
Expand Down Expand Up @@ -1588,6 +1631,7 @@ impl TryFrom<&SessionState> for PersistedSessionState {
refresh_request_fingerprint: session_state.refresh_request_fingerprint.clone(),
refresh_result: session_state.refresh_result.clone(),
refresh_history: session_state.refresh_history.clone(),
refresh_count: session_state.refresh_count,
emergency_rekey_event: session_state.emergency_rekey_event.clone(),
consumed_interactive_attempt_markers,
aggregated_interactive_attempt_markers,
Expand Down
Loading
Loading