From 7493eada1953f815dcafa6b5e85042e60a2b8b6c Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 4 Jan 2025 11:25:02 +0300 Subject: [PATCH] Improve get_encoding_from_content --- src/response.rs | 2 +- src/utils.rs | 35 ++++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/response.rs b/src/response.rs index 1a63007..50fd6bf 100644 --- a/src/response.rs +++ b/src/response.rs @@ -40,7 +40,7 @@ impl Response { } self.encoding = get_encoding_from_headers(&self.headers) .or_else(|| get_encoding_from_content(self.content.as_bytes(py))) - .unwrap_or_else(|| "UTF-8".to_string()); + .unwrap_or_else(|| "utf-8".to_string()); Ok(&self.encoding) } diff --git a/src/utils.rs b/src/utils.rs index 8721860..7e187be 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -64,9 +64,9 @@ pub fn get_encoding_from_headers( // Check for specific conditions and return the appropriate encoding if let Some(param) = params.to_ascii_lowercase().strip_prefix("charset=") { - Some(param.trim_matches('"').to_string()) + Some(param.trim_matches('"').to_ascii_lowercase()) } else if media_type == "application/json" { - Some("UTF-8".to_string()) + Some("utf-8".to_string()) } else { None } @@ -75,24 +75,23 @@ pub fn get_encoding_from_headers( /// Get encoding from the `<meta charset="...">` tag within the first 2048 bytes of HTML content. 
pub fn get_encoding_from_content(raw_bytes: &[u8]) -> Option<String> { - const START_SEQUENCE: &[u8] = b"charset="; - const START_SEQUENCE_LEN: usize = START_SEQUENCE.len(); - const END_SEQUENCE: u8 = b'>'; + let start_sequence: &[u8] = b"charset="; let max_index = min(2048, raw_bytes.len()); if let Some(start_index) = raw_bytes[..max_index] - .windows(START_SEQUENCE_LEN) - .position(|window| window == START_SEQUENCE) + .windows(start_sequence.len()) + .position(|window| window == start_sequence) { - if let Some(end_index) = &raw_bytes[start_index..max_index] + let remaining_bytes = &raw_bytes[start_index + start_sequence.len()..max_index]; + if let Some(end_index) = remaining_bytes .iter() - .position(|&byte| byte == END_SEQUENCE) + .enumerate() + .position(|(i, &byte)| matches!(byte, b' ' | b'"' | b'>') && i > 0) { - let charset_slice = - &raw_bytes[start_index + START_SEQUENCE_LEN..start_index + end_index]; + let charset_slice = &remaining_bytes[..end_index]; let charset = String::from_utf8_lossy(charset_slice) .trim_matches('"') - .to_string(); + .to_ascii_lowercase(); return Some(charset); } } @@ -182,7 +181,7 @@ mod utils_tests { ); assert_eq!( get_encoding_from_headers(&headers), - Some("UTF-8".to_string()) + Some("utf-8".to_string()) ); } @@ -204,6 +203,16 @@ mod utils_tests { ); } + #[test] + fn test_get_encoding_from_content_present_charset3() { + let raw_html = + b""; + assert_eq!( + get_encoding_from_content(raw_html), + Some("utf-8".to_string()) + ); + } + #[test] fn test_get_encoding_from_content_missing_charset() { let raw_html = b"";