Skip to content

Commit

Permalink
refactor: Add ambiguous encoding and stricter rules
Browse files Browse the repository at this point in the history
  • Loading branch information
jannden committed Dec 10, 2024
1 parent d65d97e commit fa8ffcb
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 31 deletions.
86 changes: 64 additions & 22 deletions src/bin/migrate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,22 @@ fn detect_database_encoding(users: Vec<SyncUser>) -> ExternalIdEncoding {
}
}

// Return early if no valid samples
if total == 0 {
return ExternalIdEncoding::Ambiguous;

Check warning on line 95 in src/bin/migrate.rs

View check run for this annotation

Codecov / codecov/patch

src/bin/migrate.rs#L95

Added line #L95 was not covered by tests
}

// Use thresholds to determine encoding
let hex_ratio = f64::from(hex_count) / f64::from(total);
let base64_ratio = f64::from(base64_count) / f64::from(total);

if hex_ratio > 0.8 {
ExternalIdEncoding::Hex
} else if base64_ratio > 0.8 {
ExternalIdEncoding::Base64
} else {
ExternalIdEncoding::Plain
// Require a strong majority (90%) for a format to be considered dominant
// Also detect when both formats have significant presence
match (hex_ratio, base64_ratio) {
(h, _) if h > 0.9 => ExternalIdEncoding::Hex,
(_, b) if b > 0.9 => ExternalIdEncoding::Base64,
(h, b) if h > 0.2 && b > 0.2 => ExternalIdEncoding::Ambiguous, // Both formats present
_ => ExternalIdEncoding::Ambiguous, // No clear dominant format
}
}

Expand Down Expand Up @@ -185,37 +191,73 @@ mod tests {
// "cafe" might count for both hex and base64, but "cafeb" and "abc" won't count
// for either. Out of 3, maybe 1 counts as hex/base64 and 2 are plain. Ratios:
// hex = 1/3 ≈ 0.33, base64 = 1/3 ≈ 0.33, both < 0.8.
run_detection_test(user_ids, ExternalIdEncoding::Plain);
run_detection_test(user_ids, ExternalIdEncoding::Ambiguous);
}

#[tokio::test]
async fn test_invalid_characters() {
// "zzz" is not hex. It's also not base64-safe (though 'z' is alphanumeric,
// length=3 %4!=0) "+++" is not hex and length=3 not multiple of 4 for base64.
let user_ids = vec!["zzz", "+++"];
run_detection_test(user_ids, ExternalIdEncoding::Plain);
run_detection_test(user_ids, ExternalIdEncoding::Ambiguous);
}

#[tokio::test]
async fn test_both_formats_significant() {
// 10 total users:
// - 3 hex (30%)
// - 4 base64 (40%)
// - 3 plain (30%)
let user_ids = vec![
// Hex format users (30%)
"deadbeef", "cafebabe", "12345678",
// Base64 format users (40%)
"Y2FmZQ==", // "cafe"
"Zm9vYmFy", // "foobar"
"aGVsbG8=", // "hello"
"d29ybGQ=", // "world"
// Plain format users (30%)
"plain_1", "plain_2", "plain_3",
];

// Both hex (30%) and base64 (40%) > 20% threshold
// Neither > 90% threshold
// Should detect as Ambiguous
run_detection_test(user_ids, ExternalIdEncoding::Ambiguous);
}

#[tokio::test]
async fn test_near_threshold_hex() {
// We want a scenario where hex ratio just hits 0.8.
// Suppose we have 5 users total, 4 of which are hex. 4/5 = 0.8
// If 4 pass as hex, and maybe 1 is something else.
let user_ids = vec!["deadbeef", "cafebabe", "beefcafe", "0123456789abcdef", "plain_id"];
// The 4 hex IDs will count, "plain_id" won't count for either.
// hex_ratio = 4/5=0.8. The code uses `>` 0.8 not `>=`, so 0.8 is NOT greater
// than 0.8. This test checks that boundary condition. Expected = Plain since
// not strictly greater.
run_detection_test(user_ids, ExternalIdEncoding::Plain);
// Testing near 90% threshold for hex
// 9 hex users and 1 plain = 90% exactly
let user_ids = vec![
"deadbeef", "cafebabe", "beefcafe", "12345678", "87654321", "abcdef12", "34567890",
"98765432", "fedcba98", "plain_id",
];
// hex_ratio = 9/10 = 0.9
// Code requires > 0.9, not >=, so this should be Ambiguous
run_detection_test(user_ids, ExternalIdEncoding::Ambiguous);
}

#[tokio::test]
async fn test_near_threshold_base64() {
// Similar scenario for base64
// 5 users, 4 are valid base64. 4/5=0.8 exactly.
let user_ids = vec!["Y2FmZQ==", "Zm9v", "YmFy", "YQ==", "plain_id"];
// Again hits exactly 0.8, not greater, expect Plain
run_detection_test(user_ids, ExternalIdEncoding::Plain);
// Testing near 90% threshold for base64
// 9 base64 users and 1 plain = 90% exactly
let user_ids = vec![
"Y2FmZQ==", // cafe
"Zm9vYmFy", // foobar
"aGVsbG8=", // hello
"d29ybGQ=", // world
"dGVzdA==", // test
"YWJjZA==", // abcd
"eHl6Nzg=", // xyz78
"cXdlcnQ=", // qwert
"MTIzNDU=", // 12345
"plain_id",
];
// base64_ratio = 9/10 = 0.9
// Code requires > 0.9, not >=, so this should be Ambiguous
run_detection_test(user_ids, ExternalIdEncoding::Ambiguous);
}

#[tokio::test]
Expand Down
49 changes: 41 additions & 8 deletions src/user.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ pub enum ExternalIdEncoding {
Base64,
/// The external ID is stored as a plain string
Plain,
/// The encoding could not be determined
Ambiguous,
}

/// Source-agnostic representation of a user
Expand Down Expand Up @@ -118,7 +120,7 @@ impl User {
expected_encoding: ExternalIdEncoding,
) -> Result<User> {
// Double check the encoding
match &self.external_user_id {
let detected_encoding = match &self.external_user_id {
s if s.is_empty() => {
tracing::warn!(?self, "Skipping user due to empty uid");
return Ok(self.clone());
Expand All @@ -133,6 +135,7 @@ impl User {
"Encoding mismatch detected"
);
}
ExternalIdEncoding::Hex
}
s if s
.chars()
Expand All @@ -148,6 +151,7 @@ impl User {
"Encoding mismatch detected"

Check warning on line 151 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L151

Added line #L151 was not covered by tests
);
}
ExternalIdEncoding::Base64
}
_ => {
// Plain or unknown encoding
Expand All @@ -159,20 +163,38 @@ impl User {
"Encoding mismatch detected"
);
}
ExternalIdEncoding::Plain
}
};

let new_external_id = match expected_encoding {
ExternalIdEncoding::Hex => self.external_user_id.clone(),
ExternalIdEncoding::Base64 => {
if let Ok(decoded) = general_purpose::STANDARD.decode(&self.external_user_id) {
hex::encode(decoded)
} else {
tracing::warn!(?self, "Failed to decode base64 ID despite database heuristic");
hex::encode(self.external_user_id.as_bytes())
ExternalIdEncoding::Base64 => decode_base64_or_fallback(
&self.external_user_id,
"Failed to decode base64 ID despite database heuristic",
),
ExternalIdEncoding::Plain => hex::encode(self.external_user_id.as_bytes()),
ExternalIdEncoding::Ambiguous => {
tracing::warn!(
?self,
"Using case-by-case detected encoding due to ambiguous expected encoding"
);
match detected_encoding {
ExternalIdEncoding::Hex => self.external_user_id.clone(),
ExternalIdEncoding::Base64 => decode_base64_or_fallback(
&self.external_user_id,
"Failed to decode base64 ID despite case-by-case handling",
),

Check warning on line 187 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L184-L187

Added lines #L184 - L187 were not covered by tests
ExternalIdEncoding::Plain => hex::encode(self.external_user_id.as_bytes()),
ExternalIdEncoding::Ambiguous => {
tracing::error!(

Check warning on line 190 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L190

Added line #L190 was not covered by tests
?self,
"Unreachable code? Ambiguous encoding detected despite case-by-case handling."

Check warning on line 192 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L192

Added line #L192 was not covered by tests
);
unreachable!("Ambiguous encoding should not be detected here");

Check warning on line 194 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L194

Added line #L194 was not covered by tests
}
}
}
ExternalIdEncoding::Plain => hex::encode(self.external_user_id.as_bytes()),
};

Ok(Self { external_user_id: new_external_id, ..self.clone() })
Expand Down Expand Up @@ -204,3 +226,14 @@ impl std::fmt::Debug for User {
.finish()
}
}

/// Helper function for base64 decoding with fallback
fn decode_base64_or_fallback(id: &str, warning_message: &str) -> String {
match general_purpose::STANDARD.decode(id) {
Ok(decoded) => hex::encode(decoded),
Err(_) => {
tracing::warn!(?id, "{}", warning_message);
hex::encode(id.as_bytes())

Check warning on line 236 in src/user.rs

View check run for this annotation

Codecov / codecov/patch

src/user.rs#L235-L236

Added lines #L235 - L236 were not covered by tests
}
}
}
2 changes: 1 addition & 1 deletion src/zitadel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use crate::{
const FAMEDLY_USER_ROLE: &str = "User";

/// The number of users to sample for encoding detection
const USER_SAMPLE_SIZE: usize = 10;
const USER_SAMPLE_SIZE: usize = 50;

/// A very high-level Zitadel zitadel_client
#[derive(Clone, Debug)]
Expand Down

0 comments on commit fa8ffcb

Please sign in to comment.