Skip to content

Commit

Permalink
Improve srcset parsing (#1160)
Browse files Browse the repository at this point in the history
Our current `srcset` parsing is pretty basic.

We split on comma and then on whitespace and take the first part, which is the image source URL.
However, we don't handle URLs containing unencoded commas like
</cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg>, which leads to false-positives.

According to the spec, commas in strings should be encoded, but in practice, there are some websites which don't do that. To handle these cases, too, I propose to extend the `srcset` parsing to make use of a small "state machine", which detects if a comma is within the image source or outside of it while parsing.

This is part of an effort to reduce false-positives during link checking.

---------
Co-authored-by: Hugo McNally <[email protected]>
  • Loading branch information
mre authored Jul 29, 2023
1 parent 62c6a05 commit cead4ce
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use html5ever::{
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};

use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
use super::{super::plaintext::extract_plaintext, is_email_link, is_verbatim_elem, srcset};
use crate::types::uri::raw::RawUri;

#[derive(Clone, Default)]
Expand Down Expand Up @@ -157,17 +157,7 @@ impl LinkExtractor {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
Some(srcset::parse(attr_value).into_iter())
}
_ => None,
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use html5gum::{Emitter, Error, State, Tokenizer};

use super::plaintext::extract_plaintext;
use super::{is_email_link, is_verbatim_elem};
use crate::types::uri::raw::RawUri;
use super::{is_email_link, is_verbatim_elem, srcset};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};

#[derive(Clone)]
struct LinkExtractor {
Expand Down Expand Up @@ -75,17 +74,7 @@ impl LinkExtractor {
Some(vec![attr_value].into_iter())
}
(_, "srcset") => {
let mut urls = Vec::new();
for image_candidate_string in attr_value.trim().split(',') {
for part in image_candidate_string.split_ascii_whitespace() {
if part.is_empty() {
continue;
}
urls.push(part);
break;
}
}
Some(urls.into_iter())
Some(srcset::parse(attr_value).into_iter())
}
_ => None,
}
Expand Down Expand Up @@ -457,6 +446,7 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}

#[test]
fn test_exclude_email_without_mailto() {
let input = r#"<!DOCTYPE html>
Expand All @@ -470,13 +460,12 @@ mod tests {
</body>
</html>"#;

let expected = vec![];
let uris = extract_html(input, false);
assert_eq!(uris, expected);
assert!(uris.is_empty());
}

#[test]
fn test_email_false_postive() {
fn test_email_false_positive() {
let input = r#"<!DOCTYPE html>
<html lang="en-US">
<head>
Expand All @@ -488,7 +477,33 @@ mod tests {
</body>
</html>"#;

let expected = vec![];
let uris = extract_html(input, false);
assert!(uris.is_empty());
}

#[test]
fn test_extract_srcset() {
let input = r#"
<img srcset="/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w" src="/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg">
"#;

let expected = vec![RawUri {
text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
element: Some("img".to_string()),
attribute: Some("srcset".to_string()),
},
RawUri {
text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
element: Some("img".to_string()),
attribute: Some("srcset".to_string()),
},
RawUri {
text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
element: Some("img".to_string()),
attribute: Some("src".to_string()),
}

];
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}
Expand Down
71 changes: 71 additions & 0 deletions lychee-lib/src/extract/html/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
pub(crate) mod html5ever;
pub(crate) mod html5gum;
mod srcset;

use linkify::{LinkFinder, LinkKind};

/// Check if the given URL is an email link.
///
/// This operates on the raw URL strings, not the linkified version because it
/// gets used in the HTML extractors, which parse the HTML attributes directly
/// and return the raw strings.
///
/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
/// so there should be no big performance penalty for calling this function.
pub(crate) fn is_email_link(input: &str) -> bool {
let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
let email = match findings.next() {
None => return false,
Some(email) => email.as_str(),
};

// Email needs to match the full string.
// Strip the "mailto:" prefix if it exists.
input.strip_prefix("mailto:").unwrap_or(input) == email
}

/// Check if the given element is in the list of preformatted ("verbatim") tags.
///
/// These will be excluded from link checking by default.
// Including the <script> tag is debatable, but the alternative is to
// have a separate list of tags which need a separate config setting and that
// seems worse.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
matches!(
name,
"code"
| "kbd"
| "listing"
| "noscript"
| "plaintext"
| "pre"
| "samp"
| "script"
| "textarea"
| "var"
| "xmp"
)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_is_email_link() {
assert!(is_email_link("mailto:[email protected]"));
assert!(!is_email_link("mailto:[email protected] in a sentence"));

assert!(is_email_link("[email protected]"));
assert!(!is_email_link("[email protected] in sentence"));
assert!(!is_email_link("https://example.org"));
}

#[test]
fn test_verbatim_matching() {
assert!(is_verbatim_elem("pre"));
assert!(is_verbatim_elem("code"));
assert!(is_verbatim_elem("listing"));
assert!(is_verbatim_elem("script"));
}
}
Loading

0 comments on commit cead4ce

Please sign in to comment.