Skip to content

Commit

Permalink
move get encoding to response struct, make encoding setter, request future: bypass unnecessary response items
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed Aug 13, 2024
1 parent bd65c0a commit 81d898b
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 43 deletions.
43 changes: 12 additions & 31 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use ahash::RandomState;
use anyhow::{anyhow, Result};
use indexmap::IndexMap;
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict, PyString};
use pyo3::types::{PyBytes, PyDict};
use rquest::header::{HeaderMap, HeaderName, HeaderValue, COOKIE};
use rquest::tls::Impersonate;
use rquest::multipart;
Expand All @@ -18,7 +18,7 @@ mod response;
use response::Response;

mod utils;
use utils::{get_encoding_from_content, get_encoding_from_headers, json_dumps, url_encode};
use utils::{json_dumps, url_encode};

// Tokio global one-thread runtime
fn runtime() -> &'static Runtime {
Expand Down Expand Up @@ -359,42 +359,23 @@ impl Client {
let status_code = resp.status().as_u16();
let url = resp.url().to_string();
let buf = resp.bytes().await?;
let encoding = get_encoding_from_headers(&headers)
.or_else(|| get_encoding_from_content(&buf))
.unwrap_or_else(|| "UTF-8".to_string());
Ok((buf, cookies, encoding, headers, status_code, url))

log::info!("response: {} {} {}", url, status_code, buf.len());
Ok((buf, cookies, headers, status_code, url))
};

// Execute an async future, releasing the Python GIL for concurrency.
// Use Tokio global runtime to block on the future.
let result = py.allow_threads(|| runtime().block_on(future));
let (f_buf, f_cookies, f_encoding, f_headers, f_status_code, f_url) = result?;

// Response items
let cookies_dict = PyDict::new_bound(py);
for (key, value) in f_cookies {
cookies_dict.set_item(key, value)?;
}
let cookies = cookies_dict.unbind();
let encoding = PyString::new_bound(py, f_encoding.as_str()).unbind();
let headers_dict = PyDict::new_bound(py);
for (key, value) in f_headers {
headers_dict.set_item(key, value)?;
}
let headers = headers_dict.unbind();
let status_code = f_status_code.into_py(py);
let url = PyString::new_bound(py, &f_url).unbind();
let content = PyBytes::new_bound(py, &f_buf).unbind();

log::info!("response: {} {} {} {}", f_url, f_status_code, f_buf.len(), f_encoding);
let (f_buf, f_cookies, f_headers, f_status_code, f_url) = result?;

Ok(Response {
content,
cookies,
encoding,
headers,
status_code,
url,
content: PyBytes::new_bound(py, &f_buf).unbind(),
cookies: f_cookies,
encoding: String::new(),
headers: f_headers,
status_code: f_status_code,
url: f_url,
})
}

Expand Down
40 changes: 28 additions & 12 deletions src/response.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use crate::utils::{get_encoding_from_content, get_encoding_from_headers};
use ahash::RandomState;
use anyhow::{anyhow, Result};
use encoding_rs::Encoding;
use html2text::{from_read, from_read_with_decorator, render::text_renderer::TrivialDecorator};
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict, PyString};
use indexmap::IndexMap;
use pyo3::{prelude::*, types::PyBytes};

/// A struct representing an HTTP response.
///
Expand All @@ -13,29 +15,43 @@ pub struct Response {
#[pyo3(get)]
pub content: Py<PyBytes>,
#[pyo3(get)]
pub cookies: Py<PyDict>,
pub cookies: IndexMap<String, String, RandomState>,
#[pyo3(get, set)]
pub encoding: String,
#[pyo3(get)]
pub encoding: Py<PyString>,
pub headers: IndexMap<String, String, RandomState>,
#[pyo3(get)]
pub headers: Py<PyDict>,
pub status_code: u16,
#[pyo3(get)]
pub status_code: Py<PyAny>,
#[pyo3(get)]
pub url: Py<PyString>,
pub url: String,
}

#[pymethods]
impl Response {
/// Returns the character encoding of the response, resolving it on first access.
///
/// A non-empty `self.encoding` is returned unchanged. Otherwise the encoding is
/// looked up in the response headers first, then in the response content, and
/// falls back to "UTF-8" when neither yields a result. The resolved value is
/// stored in `self.encoding` so later calls return it directly.
#[getter]
fn get_encoding(&mut self, py: Python) -> Result<&String> {
    if self.encoding.is_empty() {
        // Lazy combinators: the content is only inspected when the headers give
        // no encoding, and the fallback string is only allocated when both fail.
        self.encoding = get_encoding_from_headers(&self.headers)
            .or_else(|| get_encoding_from_content(&self.content.bind(py).as_bytes()))
            .unwrap_or_else(|| "UTF-8".to_string());
    }
    Ok(&self.encoding)
}

#[getter]
fn text(&mut self, py: Python) -> Result<String> {
let encoding_name = &self.encoding.bind(py).to_string();
// If self.encoding is empty, call get_encoding to populate self.encoding
if self.encoding.is_empty() {
self.get_encoding(py)?;
}

// Convert Py<PyBytes> to &[u8]
let raw_bytes = &self.content.bind(py).as_bytes();

// Release the GIL here because decoding can be CPU-intensive
let (decoded_str, detected_encoding_name) = py.allow_threads(|| {
let encoding_name_bytes = &encoding_name.as_bytes().to_vec();
let encoding_name_bytes = &self.encoding.as_bytes();
let encoding = Encoding::for_label(encoding_name_bytes).ok_or({
anyhow!(
"Unsupported charset: {}",
Expand All @@ -51,8 +67,8 @@ impl Response {
})?;

// Update self.encoding based on the detected encoding
if encoding_name != &detected_encoding_name {
self.encoding = PyString::new_bound(py, &detected_encoding_name).into();
if &self.encoding != &detected_encoding_name {
self.encoding = detected_encoding_name;
}

Ok(decoded_str)
Expand Down

0 comments on commit 81d898b

Please sign in to comment.