Merge pull request #48 from flosacca/fix-unescape-html

Fix unescapeHTML
ruby · Nov 30, 2023 · e4c6337
2 parents 1fda83f + 354a408
commit e4c6337
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 14 deletions.
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c
@@ -83,7 +83,7 @@ optimized_unescape_html(VALUE str)
     unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX :
                                strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 :
                                128);
-    long i, len, beg = 0;
+    long i, j, len, beg = 0;
     size_t clen, plen;
     int overflow;
     const char *cstr;
@@ -100,6 +100,7 @@ optimized_unescape_html(VALUE str)
         plen = i - beg;
         if (++i >= len) break;
         c = (unsigned char)cstr[i];
+        j = i;
 #define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \
                   memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \
                   (i += rb_strlen_lit(s) - 1, 1))
@@ -112,28 +113,40 @@ optimized_unescape_html(VALUE str)
             else if (MATCH("mp;")) {
                 c = '&';
             }
-            else continue;
+            else {
+                i = j;
+                continue;
+            }
             break;
           case 'q':
             ++i;
             if (MATCH("uot;")) {
                 c = '"';
             }
-            else continue;
+            else {
+                i = j;
+                continue;
+            }
             break;
           case 'g':
             ++i;
             if (MATCH("t;")) {
                 c = '>';
             }
-            else continue;
+            else {
+                i = j;
+                continue;
+            }
             break;
           case 'l':
             ++i;
             if (MATCH("t;")) {
                 c = '<';
             }
-            else continue;
+            else {
+                i = j;
+                continue;
+            }
             break;
           case '#':
             if (len - ++i >= 2 && ISDIGIT(cstr[i])) {
@@ -142,9 +155,15 @@ optimized_unescape_html(VALUE str)
             else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) {
                 cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow);
             }
-            else continue;
+            else {
+                i = j;
+                continue;
+            }
             i += clen;
-            if (overflow || cc >= charlimit || cstr[i] != ';') continue;
+            if (overflow || cc >= charlimit || cstr[i] != ';') {
+                i = j;
+                continue;
+            }
             if (!dest) {
                 dest = rb_str_buf_new(len);
             }

diff --git a/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java b/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java
@@ -140,7 +140,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
         int charlimit = (enc instanceof UTF8Encoding) ? UNICODE_MAX :
                 (enc instanceof ISO8859_1Encoding) ? 256 :
                         128;
-        int i, len, beg = 0;
+        int i, j, len, beg = 0;
         int clen = 0, plen;
         boolean overflow = false;
         byte[] cstrBytes;
@@ -160,6 +160,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
             plen = i - beg;
             if (++i >= len) break;
             c = cstrBytes[cstr + i] & 0xFF;
+            j = i;
             switch (c) {
                 case 'a':
                     ++i;
@@ -169,28 +170,40 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
                     } else if (MATCH(MPSEMI, len, i, cstrBytes, cstr)) {
                         i += MPSEMI.length - 1;
                         c = '&';
-                    } else continue;
+                    } else {
+                        i = j;
+                        continue;
+                    }
                     break;
                 case 'q':
                     ++i;
                     if (MATCH(UOTSEMI, len, i, cstrBytes, cstr)) {
                         i += UOTSEMI.length - 1;
                         c = '"';
-                    } else continue;
+                    } else {
+                        i = j;
+                        continue;
+                    }
                     break;
                 case 'g':
                     ++i;
                     if (MATCH(TSEMI, len, i, cstrBytes, cstr)) {
                         i += TSEMI.length - 1;
                         c = '>';
-                    } else continue;
+                    } else {
+                        i = j;
+                        continue;
+                    }
                     break;
                 case 'l':
                     ++i;
                     if (MATCH(TSEMI, len, i, cstrBytes, cstr)) {
                         i += TSEMI.length - 1;
                         c = '<';
-                    } else continue;
+                    } else {
+                        i = j;
+                        continue;
+                    }
                     break;
                 case '#':
                     if (len - ++i >= 2 && Character.isDigit(cstrBytes[cstr + i])) {
@@ -203,9 +216,15 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
                         cc = ruby_scan_digits(cstrBytes, cstr + i, len - i, 16, clenOverflow);
                         clen = clenOverflow[0];
                         overflow = clenOverflow[1] == 1;
-                    } else continue;
+                    } else {
+                        i = j;
+                        continue;
+                    }
                     i += clen;
-                    if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') continue;
+                    if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') {
+                        i = j;
+                        continue;
+                    }
                     if (dest == null) {
                         dest = RubyString.newStringLight(runtime, len);
                     }

diff --git a/test/cgi/test_cgi_util.rb b/test/cgi/test_cgi_util.rb
@@ -186,6 +186,22 @@ def test_cgi_unescapeHTML_invalid
     assert_equal('&<&amp>&quot&abcdefghijklmn', CGI.unescapeHTML('&&lt;&amp&gt;&quot&abcdefghijklmn'))
   end
 
+  module UnescapeHTMLTests
+    def test_cgi_unescapeHTML_following_known_first_letter
+      assert_equal('&a>&q>&l>&g>', CGI.unescapeHTML('&a&gt;&q&gt;&l&gt;&g&gt;'))
+    end
+
+    def test_cgi_unescapeHTML_following_number_sign
+      assert_equal('&#>&#x>', CGI.unescapeHTML('&#&gt;&#x&gt;'))
+    end
+
+    def test_cgi_unescapeHTML_following_invalid_numeric
+      assert_equal('&#1114112>&#x110000>', CGI.unescapeHTML('&#1114112&gt;&#x110000&gt;'))
+    end
+  end
+
+  include UnescapeHTMLTests
+
   Encoding.list.each do |enc|
     begin
       escaped = "&#39;&amp;&quot;&gt;&lt;".encode(enc)
@@ -283,6 +299,8 @@ def teardown
     end if defined?(CGI::Escape)
   end
 
+  include CGIUtilTest::UnescapeHTMLTests
+
   def test_cgi_escapeHTML_with_invalid_byte_sequence
     assert_equal("&lt;\xA4??&gt;", CGI.escapeHTML(%[<\xA4??>]))
   end