From 7493eada1953f815dcafa6b5e85042e60a2b8b6c Mon Sep 17 00:00:00 2001
From: deedy5 <65482418+deedy5@users.noreply.github.com>
Date: Sat, 4 Jan 2025 11:25:02 +0300
Subject: [PATCH] Improve get_encoding_from_content
---
src/response.rs | 2 +-
src/utils.rs | 35 ++++++++++++++++++++++-------------
2 files changed, 23 insertions(+), 14 deletions(-)
diff --git a/src/response.rs b/src/response.rs
index 1a63007..50fd6bf 100644
--- a/src/response.rs
+++ b/src/response.rs
@@ -40,7 +40,7 @@ impl Response {
}
self.encoding = get_encoding_from_headers(&self.headers)
.or_else(|| get_encoding_from_content(self.content.as_bytes(py)))
- .unwrap_or_else(|| "UTF-8".to_string());
+ .unwrap_or_else(|| "utf-8".to_string());
Ok(&self.encoding)
}
diff --git a/src/utils.rs b/src/utils.rs
index 8721860..7e187be 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -64,9 +64,9 @@ pub fn get_encoding_from_headers(
// Check for specific conditions and return the appropriate encoding
if let Some(param) = params.to_ascii_lowercase().strip_prefix("charset=") {
- Some(param.trim_matches('"').to_string())
+ Some(param.trim_matches('"').to_ascii_lowercase())
} else if media_type == "application/json" {
- Some("UTF-8".to_string())
+ Some("utf-8".to_string())
} else {
None
}
@@ -75,24 +75,23 @@ pub fn get_encoding_from_headers(
/// Get encoding from the `<meta>` tag within the first 2048 bytes of HTML content.
pub fn get_encoding_from_content(raw_bytes: &[u8]) -> Option<String> {
- const START_SEQUENCE: &[u8] = b"charset=";
- const START_SEQUENCE_LEN: usize = START_SEQUENCE.len();
- const END_SEQUENCE: u8 = b'>';
+ let start_sequence: &[u8] = b"charset=";
let max_index = min(2048, raw_bytes.len());
if let Some(start_index) = raw_bytes[..max_index]
- .windows(START_SEQUENCE_LEN)
- .position(|window| window == START_SEQUENCE)
+ .windows(start_sequence.len())
+ .position(|window| window == start_sequence)
{
- if let Some(end_index) = &raw_bytes[start_index..max_index]
+ let remaining_bytes = &raw_bytes[start_index + start_sequence.len()..max_index];
+ if let Some(end_index) = remaining_bytes
.iter()
- .position(|&byte| byte == END_SEQUENCE)
+ .enumerate()
+ .position(|(i, &byte)| matches!(byte, b' ' | b'"' | b'>') && i > 0)
{
- let charset_slice =
- &raw_bytes[start_index + START_SEQUENCE_LEN..start_index + end_index];
+ let charset_slice = &remaining_bytes[..end_index];
let charset = String::from_utf8_lossy(charset_slice)
.trim_matches('"')
- .to_string();
+ .to_ascii_lowercase();
return Some(charset);
}
}
@@ -182,7 +181,7 @@ mod utils_tests {
);
assert_eq!(
get_encoding_from_headers(&headers),
- Some("UTF-8".to_string())
+ Some("utf-8".to_string())
);
}
@@ -204,6 +203,16 @@ mod utils_tests {
);
}
+ #[test]
+ fn test_get_encoding_from_content_present_charset3() {
+ let raw_html =
+ b"<html><head><meta charset=\"UTF-8\" /></head></html>";
+ assert_eq!(
+ get_encoding_from_content(raw_html),
+ Some("utf-8".to_string())
+ );
+ }
+
#[test]
fn test_get_encoding_from_content_missing_charset() {
let raw_html = b"";