From 82f4ca25bc8ed21bde9e3ec92862757fc559ddb7 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Thu, 1 Aug 2024 12:00:17 -0300 Subject: [PATCH 1/4] Add reconstructed ListBlobs marker as an experimental feature --- object_store/Cargo.toml | 1 + object_store/src/azure/client.rs | 42 ++++++++++++++++++++++++++++++++ object_store/src/azure/mod.rs | 9 +++++++ 3 files changed, 52 insertions(+) diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 4e845e5ca2d0..4d5c7a1085b8 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -67,6 +67,7 @@ aws = ["cloud", "md-5"] http = ["cloud"] tls-webpki-roots = ["reqwest?/rustls-tls-webpki-roots"] integration = [] +experimental-azure-list-offset = [] [dev-dependencies] # In alphabetical order futures-test = "0.3" diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index b5e82c2a8585..ed17b54f25c4 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -559,6 +559,19 @@ impl GetClient for AzureClient { } } +#[cfg(feature = "experimental-azure-list-offset")] +fn marker_for_offset(offset: &str, is_emulator: bool) -> String { + if is_emulator { + return offset.to_string(); + } else { + let encoded_part = BASE64_STANDARD.encode( + &format!("{:06}!{} !000028!9999-12-31T23:59:59.9999999Z!", offset.len() + 1, offset) + ).replace("=", "-"); + let length_string = format!("{}", encoded_part.len()); + return format!("{}!{}!{}", length_string.len(), length_string, encoded_part); + } +} + #[async_trait] impl ListClient for AzureClient { /// Make an Azure List request @@ -569,6 +582,7 @@ impl ListClient for AzureClient { token: Option<&str>, offset: Option<&str>, ) -> Result<(ListResult, Option)> { + #[cfg(not(feature = "experimental-azure-list-offset"))] assert!(offset.is_none()); // Not yet supported let credential = self.get_credential().await?; @@ -586,6 +600,22 @@ impl ListClient for AzureClient { query.push(("delimiter", DELIMITER)) } + #[cfg(feature = "experimental-azure-list-offset")] + let token_string = match (token, offset) { + (Some(token), _) => { + Some(token.to_string()) + } + (None, Some(offset)) => { + Some(marker_for_offset(offset, self.config.is_emulator)) + } + (None, None) => { + None + } + }; + + #[cfg(feature = "experimental-azure-list-offset")] + let token = token_string.as_deref(); + if let Some(token) = token { query.push(("marker", token)) } @@ -967,4 +997,16 @@ mod tests { let _delegated_key_response_internal: UserDelegationKey = quick_xml::de::from_str(S).unwrap(); } + + #[cfg(feature = "experimental-azure-list-offset")] + #[test] + fn test_marker_for_offset() { + // BlobStorage + let marker = marker_for_offset("file.txt", false); + assert_eq!(marker, "2!72!MDAwMDA5IWZpbGUudHh0ICEwMDAwMjghOTk5OS0xMi0zMVQyMzo1OTo1OS45OTk5OTk5WiE-"); + + // Azurite + let marker = marker_for_offset("file.txt", true); + assert_eq!(marker, "file.txt"); + } } diff --git a/object_store/src/azure/mod.rs b/object_store/src/azure/mod.rs index f89a184f9523..74b86cea2842 100644 --- a/object_store/src/azure/mod.rs +++ b/object_store/src/azure/mod.rs @@ -123,6 +123,15 @@ impl ObjectStore for MicrosoftAzure { self.client.list(prefix) } + #[cfg(feature = "experimental-azure-list-offset")] + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, Result> { + self.client.list_with_offset(prefix, offset) + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { self.client.list_with_delimiter(prefix).await } From 3f5a080d6ab821569eb4a5253bbce1a4add3fe1e Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Thu, 1 Aug 2024 12:15:44 -0300 Subject: [PATCH 2/4] Explanation comment --- object_store/src/azure/client.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index ed17b54f25c4..7de81c554062 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -564,6 +564,40 @@ fn marker_for_offset(offset: &str, is_emulator: bool) -> String { if is_emulator { return offset.to_string(); } else { + // Here we reconstruct an Azure marker (continuation token) from a key to be able to seek + // into an arbitrary position in the key space. + // The current format (July 2024) for the marker is as follows: + // + // +-> unpadded length of next field + // | + // | +-> unpadded length of base64 encoded field + // | | + // | | +-> base64 encoded field with padding characters (=) repaced with - + // | | | + // 2!72!MDAwMDA4IWZpbGUudHh0ITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ-- + // | | ^ + // terminators | + // | + // +------------+ + // Decoding the |base64 field| gives: + // +------------+ + // + // +-> length of key field padded to 6 digits + // | + // | +-> key to start listing at + // | | + // | | +-> length of timestamp field padded to 6 digits + // | | | + // | | | +-> constant max timestamp field + // | | | | + // 000008!file.txt!000028!9999-12-31T23:59:59.9999999Z! + // | | | | + // +----> field terminators <-------------------+ + // + // When recostructing we add a space character (ASCII 0x20) to the end of the key to change the + // `start_at` behavior into a `start_after` behavior as the space character is the first valid character + // in the lexicographical order. + let encoded_part = BASE64_STANDARD.encode( &format!("{:06}!{} !000028!9999-12-31T23:59:59.9999999Z!", offset.len() + 1, offset) ).replace("=", "-"); From 52c7d07a20e697b9c6e2ec40071e4414327b7f05 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Thu, 1 Aug 2024 12:33:09 -0300 Subject: [PATCH 3/4] fix lint --- object_store/src/azure/client.rs | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index 7de81c554062..d8ab3286b359 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -562,7 +562,7 @@ impl GetClient for AzureClient { #[cfg(feature = "experimental-azure-list-offset")] fn marker_for_offset(offset: &str, is_emulator: bool) -> String { if is_emulator { - return offset.to_string(); + offset.to_string() } else { // Here we reconstruct an Azure marker (continuation token) from a key to be able to seek // into an arbitrary position in the key space. @@ -598,11 +598,15 @@ fn marker_for_offset(offset: &str, is_emulator: bool) -> String { // `start_at` behavior into a `start_after` behavior as the space character is the first valid character // in the lexicographical order. - let encoded_part = BASE64_STANDARD.encode( - &format!("{:06}!{} !000028!9999-12-31T23:59:59.9999999Z!", offset.len() + 1, offset) - ).replace("=", "-"); + let encoded_part = BASE64_STANDARD + .encode(&format!( + "{:06}!{} !000028!9999-12-31T23:59:59.9999999Z!", + offset.len() + 1, + offset, + )) + .replace("=", "-"); let length_string = format!("{}", encoded_part.len()); - return format!("{}!{}!{}", length_string.len(), length_string, encoded_part); + format!("{}!{}!{}", length_string.len(), length_string, encoded_part) } } @@ -636,15 +640,9 @@ impl ListClient for AzureClient { #[cfg(feature = "experimental-azure-list-offset")] let token_string = match (token, offset) { - (Some(token), _) => { - Some(token.to_string()) - } - (None, Some(offset)) => { - Some(marker_for_offset(offset, self.config.is_emulator)) - } - (None, None) => { - None - } + (Some(token), _) => Some(token.to_string()), + (None, Some(offset)) => Some(marker_for_offset(offset, self.config.is_emulator)), + (None, None) => None, }; #[cfg(feature = "experimental-azure-list-offset")] @@ -1037,7 +1035,10 @@ mod tests { fn test_marker_for_offset() { // BlobStorage let marker = marker_for_offset("file.txt", false); - assert_eq!(marker, "2!72!MDAwMDA5IWZpbGUudHh0ICEwMDAwMjghOTk5OS0xMi0zMVQyMzo1OTo1OS45OTk5OTk5WiE-"); + assert_eq!( + marker, + "2!72!MDAwMDA5IWZpbGUudHh0ICEwMDAwMjghOTk5OS0xMi0zMVQyMzo1OTo1OS45OTk5OTk5WiE-" + ); // Azurite let marker = marker_for_offset("file.txt", true); From 369784b8efbe5155aa4bc3be41fb4bbaf5c809ec Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Thu, 1 Aug 2024 13:36:58 -0300 Subject: [PATCH 4/4] Add hadoop clarification --- object_store/src/azure/client.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/object_store/src/azure/client.rs b/object_store/src/azure/client.rs index d8ab3286b359..8a6550a0ee48 100644 --- a/object_store/src/azure/client.rs +++ b/object_store/src/azure/client.rs @@ -568,11 +568,11 @@ fn marker_for_offset(offset: &str, is_emulator: bool) -> String { // into an arbitrary position in the key space. // The current format (July 2024) for the marker is as follows: // - // +-> unpadded length of next field + // +-> current token version // | // | +-> unpadded length of base64 encoded field // | | - // | | +-> base64 encoded field with padding characters (=) repaced with - + // | | +-> base64 encoded field with characters (/,+,=) repaced with (_,*,-) // | | | // 2!72!MDAwMDA4IWZpbGUudHh0ITAwMDAyOCE5OTk5LTEyLTMxVDIzOjU5OjU5Ljk5OTk5OTlaIQ-- // | | ^ @@ -597,6 +597,9 @@ fn marker_for_offset(offset: &str, is_emulator: bool) -> String { // When recostructing we add a space character (ASCII 0x20) to the end of the key to change the // `start_at` behavior into a `start_after` behavior as the space character is the first valid character // in the lexicographical order. + // + // It appears that hadoop relies on this, code here: + // https://github.com/apache/hadoop/blob/059e996c02d64716707d8dfb905dc84bab317aef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L1358 let encoded_part = BASE64_STANDARD .encode(&format!( @@ -604,9 +607,10 @@ fn marker_for_offset(offset: &str, is_emulator: bool) -> String { offset.len() + 1, offset, )) + .replace("/", "_") + .replace("+", "*") .replace("=", "-"); - let length_string = format!("{}", encoded_part.len()); - format!("{}!{}!{}", length_string.len(), length_string, encoded_part) + format!("2!{}!{}", encoded_part.len(), encoded_part) } }