diff --git a/CHANGES.md b/CHANGES.md
index 73d01e173c..5c2438b023 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -33,6 +33,8 @@
`Connection.Response#cookies()` will provide the last one set. Generally it is better to use
the [Jsoup.newSession](https://jsoup.org/cookbook/web/request-session) method to maintain a cookie jar, as that
applies appropriate path selection on cookies when making requests. [1831](https://github.com/jhy/jsoup/issues/1831)
+* When parsing named HTML entities outside of attribute values, a base entity now resolves if its name is a prefix of
+ the input token (e.g. `&notit;` is read as `&not;` followed by `it;`). [2207](https://github.com/jhy/jsoup/issues/2207)
## 1.18.1 (2024-Jul-10)
diff --git a/pom.xml b/pom.xml
index aa01a6f380..042477e7fe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -98,6 +98,7 @@
java.io.UncheckedIOException
java.util.Comparator
java.util.List
+ java.util.ArrayList
java.util.LinkedHashMap
java.util.Map
java.util.Objects
diff --git a/src/main/java/org/jsoup/nodes/Entities.java b/src/main/java/org/jsoup/nodes/Entities.java
index c4503dd02e..bff4d35fe5 100644
--- a/src/main/java/org/jsoup/nodes/Entities.java
+++ b/src/main/java/org/jsoup/nodes/Entities.java
@@ -11,7 +11,9 @@
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashMap;
import static org.jsoup.nodes.Document.OutputSettings.*;
@@ -36,6 +38,9 @@ public class Entities {
private static final char[] codeDelims = {',', ';'};
private static final HashMap multipoints = new HashMap<>(); // name -> multiple character references
+ private static final int BaseCount = 106; // initial capacity used for baseSorted
+ private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // base names, sorted longest first, for prefix matching
+
public enum EscapeMode {
/**
* Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
@@ -50,6 +55,12 @@ public enum EscapeMode {
*/
extended(EntitiesData.fullPoints, 2125);
+ static {
+     // order the base names longest first, so that a prefix scan meets the longest candidate first
+     baseSorted.addAll(Arrays.asList(base.nameKeys));
+     Collections.sort(baseSorted, (left, right) -> Integer.compare(right.length(), left.length()));
+ }
+
// table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
private String[] nameKeys;
private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
@@ -134,6 +145,19 @@ public static int codepointsForName(final String name, final int[] codepoints) {
return 0;
}
+ /**
+ Looks up the longest base entity name that the input starts with; e.g. input "notit" gives "not".
+ @return the longest base entity name that prefixes the input, or "" when none does
+ */
+ public static String findPrefix(String input) {
+     // names are held longest first, so the first hit is the longest prefix (a Trie could replace this scan if perf critical)
+     for (int i = 0; i < baseSorted.size(); i++) {
+         final String candidate = baseSorted.get(i);
+         if (input.startsWith(candidate)) return candidate;
+     }
+     return emptyName;
+ }
+
/**
HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use
both in attributes and in text data.
diff --git a/src/main/java/org/jsoup/parser/Tokeniser.java b/src/main/java/org/jsoup/parser/Tokeniser.java
index ff3bfa4f18..750ee0701f 100644
--- a/src/main/java/org/jsoup/parser/Tokeniser.java
+++ b/src/main/java/org/jsoup/parser/Tokeniser.java
@@ -228,7 +228,12 @@ void advanceTransition(TokeniserState newState) {
reader.rewindToMark();
if (looksLegit) // named with semicolon
characterReferenceError("invalid named reference [%s]", nameRef);
- return null;
+ if (inAttribute) return null;
+ // check if there's a base prefix match; consume and use that if so
+ String prefix = Entities.findPrefix(nameRef);
+ if (prefix.isEmpty()) return null;
+ reader.matchConsume(prefix);
+ nameRef = prefix;
}
if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
// don't want that to match
diff --git a/src/test/java/org/jsoup/nodes/EntitiesTest.java b/src/test/java/org/jsoup/nodes/EntitiesTest.java
index 59708cdf99..fb9d278cfc 100644
--- a/src/test/java/org/jsoup/nodes/EntitiesTest.java
+++ b/src/test/java/org/jsoup/nodes/EntitiesTest.java
@@ -112,6 +112,13 @@ public class EntitiesTest {
assertEquals("Hello &= &", Entities.unescape(text, false));
}
+ @Test public void prefixMatch() {
+     // https://github.com/jhy/jsoup/issues/2207 - base entity "not" resolves as a prefix of "notit" in text, but not in attributes
+     // example from https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
+     String text = "I'm &notit; I tell you. I'm &notin; I tell you.";
+     assertEquals("I'm ¬it; I tell you. I'm ∉ I tell you.", Entities.unescape(text, false));
+     assertEquals("I'm &notit; I tell you. I'm ∉ I tell you.", Entities.unescape(text, true)); // not for attributes
+ }
@Test public void caseSensitive() {
String unescaped = "Ü ü & &";
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
index a67003a839..d87318bda0 100644
--- a/src/test/java/org/jsoup/parser/HtmlParserTest.java
+++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -925,9 +925,8 @@ private static Stream dupeAttributeData() {
assertEquals(" - One
Two
", StringUtil.normaliseWhitespace(nodes.get(0).outerHtml()));
}
- @Test public void doesNotFindShortestMatchingEntity() {
- // previous behaviour was to identify a possible entity, then chomp down the string until a match was found.
- // (as defined in html5.) However in practise that lead to spurious matches against the author's intent.
+ @Test public void doesNotFindExtendedPrefixMatchingEntity() {
+ // only base entities, not extended entities, should allow prefix match (i.e., those in the spec named list that don't include a trailing ; - https://html.spec.whatwg.org/multipage/named-characters.html)
String html = "One &clubsuite; ♣";
Document doc = Jsoup.parse(html);
assertEquals(StringUtil.normaliseWhitespace("One &clubsuite; ♣"), doc.body().html());
@@ -941,6 +940,23 @@ private static Stream dupeAttributeData() {
assertEquals("& \" ® &icy &hopf и 𝕙", doc.body().html());
}
+ @Test public void findsBasePrefixEntity() {
+     // https://github.com/jhy/jsoup/issues/2207 - in text, base entities (nbsp, shy, not) match as a prefix of the name
+     String html = "a&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.";
+     Document doc = Jsoup.parse(html);
+     doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
+     assertEquals("a&nbsp;c&shy;c I'm &not;it; I tell you. I'm &notin; I tell you.", doc.body().html());
+     assertEquals("a cc I'm ¬it; I tell you. I'm ∉ I tell you.", doc.body().text());
+
+     // and in an attribute: no prefix matching, so the unterminated / unknown references stay literal
+     html = "<a title=\"&nbspc&shyc I'm &notit; I tell you. I'm &notin; I tell you.\">One</a>";
+     doc = Jsoup.parse(html);
+     doc.outputSettings().escapeMode(Entities.EscapeMode.extended).charset("ascii");
+     Element el = doc.expectFirst("a");
+     assertEquals("<a title=\"&amp;nbspc&amp;shyc I'm &amp;notit; I tell you. I'm &notin; I tell you.\">One</a>", el.outerHtml());
+     assertEquals("&nbspc&shyc I'm &notit; I tell you. I'm ∉ I tell you.", el.attr("title"));
+ }
+
@Test public void handlesXmlDeclarationAsBogusComment() {
String html = "One";
Document doc = Jsoup.parse(html);