-
-
Notifications
You must be signed in to change notification settings - Fork 134
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Our current `srcset` parsing is pretty basic. We split on commas, then on whitespace, and take the first part, which is the image source URL. However, this does not handle URLs containing unencoded commas, such as </cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg>, which leads to false positives. According to the spec, commas in strings should be encoded, but in practice some websites don't do that. To handle these cases as well, I propose extending the `srcset` parsing with a small "state machine" that detects, while parsing, whether a comma is inside the image source or outside of it. This is part of an effort to reduce false positives during link checking. --------- Co-authored-by: Hugo McNally <[email protected]>
- Loading branch information
Showing
6 changed files
with
328 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
pub(crate) mod html5ever; | ||
pub(crate) mod html5gum; | ||
mod srcset; | ||
|
||
use linkify::{LinkFinder, LinkKind}; | ||
|
||
/// Check if the given URL is an email link. | ||
/// | ||
/// This operates on the raw URL strings, not the linkified version because it | ||
/// gets used in the HTML extractors, which parse the HTML attributes directly | ||
/// and return the raw strings. | ||
/// | ||
/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`, | ||
/// so there should be no big performance penalty for calling this function. | ||
pub(crate) fn is_email_link(input: &str) -> bool { | ||
let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input); | ||
let email = match findings.next() { | ||
None => return false, | ||
Some(email) => email.as_str(), | ||
}; | ||
|
||
// Email needs to match the full string. | ||
// Strip the "mailto:" prefix if it exists. | ||
input.strip_prefix("mailto:").unwrap_or(input) == email | ||
} | ||
|
||
/// Returns `true` if `name` is one of the preformatted ("verbatim") tags.
///
/// These will be excluded from link checking by default.
// Whether <script> belongs here is debatable, but the alternative would be a
// second list of tags with its own config setting, which seems worse.
pub(crate) fn is_verbatim_elem(name: &str) -> bool {
    // Exact, case-sensitive tag names treated as verbatim.
    const VERBATIM_ELEMS: [&str; 11] = [
        "code",
        "kbd",
        "listing",
        "noscript",
        "plaintext",
        "pre",
        "samp",
        "script",
        "textarea",
        "var",
        "xmp",
    ];

    VERBATIM_ELEMS.contains(&name)
}
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// An input only counts as an email link when the whole string, minus an
    /// optional `mailto:` prefix, is the address itself.
    #[test]
    fn test_is_email_link() {
        // With the `mailto:` prefix.
        assert!(is_email_link("mailto:foo@example.org"));
        assert!(!is_email_link("mailto:foo@example.org in a sentence"));

        // Bare address, without the prefix.
        assert!(is_email_link("foo@example.org"));
        assert!(!is_email_link("foo@example.org in sentence"));

        // A regular URL is not an email link.
        assert!(!is_email_link("https://example.org"));
    }

    /// Tags on the verbatim list are recognised; other tags are not.
    #[test]
    fn test_verbatim_matching() {
        assert!(is_verbatim_elem("pre"));
        assert!(is_verbatim_elem("code"));
        assert!(is_verbatim_elem("listing"));
        assert!(is_verbatim_elem("script"));

        assert!(!is_verbatim_elem("a"));
        assert!(!is_verbatim_elem("div"));
    }
}
Oops, something went wrong.