Improve srcset parsing (#1160)

Our current `srcset` parsing is pretty basic. We split on comma and then on whitespace and take the first part, which is the image source URL. However, we don't handle URLs containing unencoded commas like </cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg>, which leads to false-positives. According to the spec, commas in strings should be encoded, but in practice, there are some websites which don't do that. To handle these cases, too, I propose to extend the `srcset` parsing to make use of a small "state machine", which detects if a comma is within the image source or outside of it while parsing. This is part of an effort to reduce false-positives during link checking. --------- Co-authored-by: Hugo McNally <[email protected]>
lycheeverse · Jul 29, 2023 · cead4ce · cead4ce
1 parent 62c6a05
commit cead4ce
Show file tree

Hide file tree

Showing 6 changed files with 328 additions and 97 deletions.
diff --git a/lychee-lib/src/extract/html5ever.rs → lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html5ever.rs → lychee-lib/src/extract/html/html5ever.rs
@@ -4,7 +4,7 @@ use html5ever::{
     tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
 };
 
-use super::{is_email_link, is_verbatim_elem, plaintext::extract_plaintext};
+use super::{super::plaintext::extract_plaintext, is_email_link, is_verbatim_elem, srcset};
 use crate::types::uri::raw::RawUri;
 
 #[derive(Clone, Default)]
@@ -157,17 +157,7 @@ impl LinkExtractor {
                 Some(vec![attr_value].into_iter())
             }
             (_, "srcset") => {
-                let mut urls = Vec::new();
-                for image_candidate_string in attr_value.trim().split(',') {
-                    for part in image_candidate_string.split_ascii_whitespace() {
-                        if part.is_empty() {
-                            continue;
-                        }
-                        urls.push(part);
-                        break;
-                    }
-                }
-                Some(urls.into_iter())
+                Some(srcset::parse(attr_value).into_iter())
             }
             _ => None,
         }

diff --git a/lychee-lib/src/extract/html5gum.rs → lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html5gum.rs → lychee-lib/src/extract/html/html5gum.rs
@@ -1,8 +1,7 @@
 use html5gum::{Emitter, Error, State, Tokenizer};
 
-use super::plaintext::extract_plaintext;
-use super::{is_email_link, is_verbatim_elem};
-use crate::types::uri::raw::RawUri;
+use super::{is_email_link, is_verbatim_elem, srcset};
+use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};
 
 #[derive(Clone)]
 struct LinkExtractor {
@@ -75,17 +74,7 @@ impl LinkExtractor {
                 Some(vec![attr_value].into_iter())
             }
             (_, "srcset") => {
-                let mut urls = Vec::new();
-                for image_candidate_string in attr_value.trim().split(',') {
-                    for part in image_candidate_string.split_ascii_whitespace() {
-                        if part.is_empty() {
-                            continue;
-                        }
-                        urls.push(part);
-                        break;
-                    }
-                }
-                Some(urls.into_iter())
+                Some(srcset::parse(attr_value).into_iter())
             }
             _ => None,
         }
@@ -457,6 +446,7 @@ mod tests {
         let uris = extract_html(input, false);
         assert_eq!(uris, expected);
     }
+
     #[test]
     fn test_exclude_email_without_mailto() {
         let input = r#"<!DOCTYPE html>
@@ -470,13 +460,12 @@ mod tests {
           </body>
         </html>"#;
 
-        let expected = vec![];
         let uris = extract_html(input, false);
-        assert_eq!(uris, expected);
+        assert!(uris.is_empty());
     }
 
     #[test]
-    fn test_email_false_postive() {
+    fn test_email_false_positive() {
         let input = r#"<!DOCTYPE html>
         <html lang="en-US">
           <head>
@@ -488,7 +477,33 @@ mod tests {
           </body>
         </html>"#;
 
-        let expected = vec![];
+        let uris = extract_html(input, false);
+        assert!(uris.is_empty());
+    }
+
+    #[test]
+    fn test_extract_srcset() {
+        let input = r#"
+            <img srcset="/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w" src="/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg">
+        "#;
+
+        let expected = vec![RawUri {
+            text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("srcset".to_string()),
+        },
+        RawUri {
+            text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("srcset".to_string()),
+        },
+        RawUri {
+            text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
+            element: Some("img".to_string()),
+            attribute: Some("src".to_string()),
+        }
+
+        ];
         let uris = extract_html(input, false);
         assert_eq!(uris, expected);
     }

diff --git a/lychee-lib/src/extract/html/mod.rs b/lychee-lib/src/extract/html/mod.rs
@@ -0,0 +1,71 @@
+pub(crate) mod html5ever;
+pub(crate) mod html5gum;
+mod srcset;
+
+use linkify::{LinkFinder, LinkKind};
+
+/// Check if the given URL is an email link.
+///
+/// This operates on the raw URL strings, not the linkified version because it
+/// gets used in the HTML extractors, which parse the HTML attributes directly
+/// and return the raw strings.
+///
+/// Note that `LinkFinder::links()` is lazy and traverses the input in `O(n)`,
+/// so there should be no big performance penalty for calling this function.
+pub(crate) fn is_email_link(input: &str) -> bool {
+    let mut findings = LinkFinder::new().kinds(&[LinkKind::Email]).links(input);
+    let email = match findings.next() {
+        None => return false,
+        Some(email) => email.as_str(),
+    };
+
+    // Email needs to match the full string.
+    // Strip the "mailto:" prefix if it exists.
+    input.strip_prefix("mailto:").unwrap_or(input) == email
+}
+
+/// Check if the given element is in the list of preformatted ("verbatim") tags.
+///
+/// These will be excluded from link checking by default.
+// Including the <script> tag is debatable, but the alternative is to
+// have a separate list of tags which need a separate config setting and that
+// seems worse.
+pub(crate) fn is_verbatim_elem(name: &str) -> bool {
+    matches!(
+        name,
+        "code"
+            | "kbd"
+            | "listing"
+            | "noscript"
+            | "plaintext"
+            | "pre"
+            | "samp"
+            | "script"
+            | "textarea"
+            | "var"
+            | "xmp"
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_email_link() {
+        assert!(is_email_link("mailto:[email protected]"));
+        assert!(!is_email_link("mailto:[email protected] in a sentence"));
+
+        assert!(is_email_link("[email protected]"));
+        assert!(!is_email_link("[email protected] in sentence"));
+        assert!(!is_email_link("https://example.org"));
+    }
+
+    #[test]
+    fn test_verbatim_matching() {
+        assert!(is_verbatim_elem("pre"));
+        assert!(is_verbatim_elem("code"));
+        assert!(is_verbatim_elem("listing"));
+        assert!(is_verbatim_elem("script"));
+    }
+}