Skip to content

Commit

Permalink
Merge pull request #48 from flosacca/fix-unescape-html
Browse files Browse the repository at this point in the history
Fix unescapeHTML
  • Loading branch information
nobu authored Nov 30, 2023
2 parents 1fda83f + 354a408 commit e4c6337
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 14 deletions.
33 changes: 26 additions & 7 deletions ext/cgi/escape/escape.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ optimized_unescape_html(VALUE str)
unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX :
strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 :
128);
long i, len, beg = 0;
long i, j, len, beg = 0;
size_t clen, plen;
int overflow;
const char *cstr;
Expand All @@ -100,6 +100,7 @@ optimized_unescape_html(VALUE str)
plen = i - beg;
if (++i >= len) break;
c = (unsigned char)cstr[i];
j = i;
#define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \
memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \
(i += rb_strlen_lit(s) - 1, 1))
Expand All @@ -112,28 +113,40 @@ optimized_unescape_html(VALUE str)
else if (MATCH("mp;")) {
c = '&';
}
else continue;
else {
i = j;
continue;
}
break;
case 'q':
++i;
if (MATCH("uot;")) {
c = '"';
}
else continue;
else {
i = j;
continue;
}
break;
case 'g':
++i;
if (MATCH("t;")) {
c = '>';
}
else continue;
else {
i = j;
continue;
}
break;
case 'l':
++i;
if (MATCH("t;")) {
c = '<';
}
else continue;
else {
i = j;
continue;
}
break;
case '#':
if (len - ++i >= 2 && ISDIGIT(cstr[i])) {
Expand All @@ -142,9 +155,15 @@ optimized_unescape_html(VALUE str)
else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) {
cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow);
}
else continue;
else {
i = j;
continue;
}
i += clen;
if (overflow || cc >= charlimit || cstr[i] != ';') continue;
if (overflow || cc >= charlimit || cstr[i] != ';') {
i = j;
continue;
}
if (!dest) {
dest = rb_str_buf_new(len);
}
Expand Down
33 changes: 26 additions & 7 deletions ext/java/org/jruby/ext/cgi/escape/CGIEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
int charlimit = (enc instanceof UTF8Encoding) ? UNICODE_MAX :
(enc instanceof ISO8859_1Encoding) ? 256 :
128;
int i, len, beg = 0;
int i, j, len, beg = 0;
int clen = 0, plen;
boolean overflow = false;
byte[] cstrBytes;
Expand All @@ -160,6 +160,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
plen = i - beg;
if (++i >= len) break;
c = cstrBytes[cstr + i] & 0xFF;
j = i;
switch (c) {
case 'a':
++i;
Expand All @@ -169,28 +170,40 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
} else if (MATCH(MPSEMI, len, i, cstrBytes, cstr)) {
i += MPSEMI.length - 1;
c = '&';
} else continue;
} else {
i = j;
continue;
}
break;
case 'q':
++i;
if (MATCH(UOTSEMI, len, i, cstrBytes, cstr)) {
i += UOTSEMI.length - 1;
c = '"';
} else continue;
} else {
i = j;
continue;
}
break;
case 'g':
++i;
if (MATCH(TSEMI, len, i, cstrBytes, cstr)) {
i += TSEMI.length - 1;
c = '>';
} else continue;
} else {
i = j;
continue;
}
break;
case 'l':
++i;
if (MATCH(TSEMI, len, i, cstrBytes, cstr)) {
i += TSEMI.length - 1;
c = '<';
} else continue;
} else {
i = j;
continue;
}
break;
case '#':
if (len - ++i >= 2 && Character.isDigit(cstrBytes[cstr + i])) {
Expand All @@ -203,9 +216,15 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) {
cc = ruby_scan_digits(cstrBytes, cstr + i, len - i, 16, clenOverflow);
clen = clenOverflow[0];
overflow = clenOverflow[1] == 1;
} else continue;
} else {
i = j;
continue;
}
i += clen;
if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') continue;
if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') {
i = j;
continue;
}
if (dest == null) {
dest = RubyString.newStringLight(runtime, len);
}
Expand Down
18 changes: 18 additions & 0 deletions test/cgi/test_cgi_util.rb
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,22 @@ def test_cgi_unescapeHTML_invalid
assert_equal('&<&amp>&quot&abcdefghijklmn', CGI.unescapeHTML('&&lt;&amp&gt;&quot&abcdefghijklmn'))
end

module UnescapeHTMLTests
  # Shared CGI.unescapeHTML cases: each input contains an incomplete or
  # invalid character reference immediately followed by a valid "&gt;".
  # The expected output keeps the bad reference verbatim and still decodes
  # the "&gt;" that follows it.

  # "&" plus only the first letter of a known entity name (a, q, l, g).
  def test_cgi_unescapeHTML_following_known_first_letter
    escaped = '&a&gt;&q&gt;&l&gt;&g&gt;'
    expected = '&a>&q>&l>&g>'
    assert_equal(expected, CGI.unescapeHTML(escaped))
  end

  # "&#" and "&#x" with no digits after them.
  def test_cgi_unescapeHTML_following_number_sign
    escaped = '&#&gt;&#x&gt;'
    expected = '&#>&#x>'
    assert_equal(expected, CGI.unescapeHTML(escaped))
  end

  # Numeric references 1114112 / 0x110000, i.e. one past U+10FFFF, with no
  # terminating semicolon.
  def test_cgi_unescapeHTML_following_invalid_numeric
    escaped = '&#1114112&gt;&#x110000&gt;'
    expected = '&#1114112>&#x110000>'
    assert_equal(expected, CGI.unescapeHTML(escaped))
  end
end

include UnescapeHTMLTests

Encoding.list.each do |enc|
begin
escaped = "&#39;&amp;&quot;&gt;&lt;".encode(enc)
Expand Down Expand Up @@ -283,6 +299,8 @@ def teardown
end if defined?(CGI::Escape)
end

include CGIUtilTest::UnescapeHTMLTests

# Escaping a string that contains the stray byte 0xA4 must still escape the
# angle brackets and pass the byte through unchanged.
def test_cgi_escapeHTML_with_invalid_byte_sequence
  raw = %[<\xA4??>]
  escaped = CGI.escapeHTML(raw)
  assert_equal("&lt;\xA4??&gt;", escaped)
end
Expand Down

0 comments on commit e4c6337

Please sign in to comment.