From 1b679df68a7c52253a40fb1d70b1c5bb1eea6c92 Mon Sep 17 00:00:00 2001 From: InanisV Date: Mon, 20 Apr 2020 22:36:35 +0800 Subject: [PATCH] fixes #1230 --- src/main/java/org/jsoup/Jsoup.java | 13 +++++ .../jsoup/parser/HtmlTreeBuilderState.java | 47 +++++++++++++++++++ src/main/java/org/jsoup/parser/Parser.java | 13 +++++ .../java/org/jsoup/parser/TreeBuilder.java | 15 ++++++ .../java/org/jsoup/parser/HtmlParserTest.java | 7 +++ 5 files changed, 95 insertions(+) diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java index 84a5e34efd..a454561807 100644 --- a/src/main/java/org/jsoup/Jsoup.java +++ b/src/main/java/org/jsoup/Jsoup.java @@ -58,6 +58,19 @@ public static Document parse(String html) { return Parser.parse(html, ""); } + /** + Parse HTML into a Document with only legal tags by encoding illegal ones. + As no base URI is specified, absolute URL detection relies on the HTML including a {@code <base href>} tag. + + @param html HTML to parse + @return sane HTML + + @see #parse(String, String) + */ + public static Document parseLegalTag(String html) { + return Parser.parseLegalTag(html, ""); + } + /** * Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page. *

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java index 21c8f64c23..d24fb88dce 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java @@ -8,6 +8,7 @@ import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; +import java.lang.reflect.Field; import java.util.ArrayList; import static org.jsoup.internal.StringUtil.inSorted; @@ -253,6 +254,20 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) { }, InBody { boolean process(Token t, HtmlTreeBuilder tb) { + if (tb.legalize) { + if (t.type == Token.TokenType.StartTag && !islegal(t.asStartTag().normalName)) { + Token.StartTag st = t.asStartTag(); + Token.Character newT = new Token.Character(); + newT.data("<" + st.tagName + ">"); + t = newT; + } else if (t.type == Token.TokenType.EndTag && !islegal(t.asEndTag().normalName)) { + Token.EndTag et = t.asEndTag(); + Token.Character newT = new Token.Character(); + newT.data("</" + et.tagName + ">"); + t = newT; + } + } + switch (t.type) { case Character: { Token.Character c = t.asCharacter(); @@ -868,6 +883,38 @@ else if (!tb.onStack(formatEl)) { } return true; } + + private boolean islegal(String tagName) { + try { + // check tags in Constants + Field[] fields = Constants.class.getDeclaredFields(); + Constants _const = new Constants(); + for (Field item: fields) { + String[] tags = (String[]) item.get(_const); + if (inSorted(tagName, tags)) + return true; + } + + // check tags in Tag.java + Tag _tag = Tag.valueOf("noMeaning"); + String[] checkList = {"blockTags", "inlineTags", "emptyTags", "formatAsInlineTags", + "preserveWhitespaceTags", "formListedTags", "formSubmitTags"}; + for (String name: checkList) { + Field field = Tag.class.getDeclaredField(name); + field.setAccessible(true); + if (inSorted(tagName, (String[]) field.get(_tag))) { + field.setAccessible(false); + return true; + } + field.setAccessible(false); + } + } catch 
(IllegalAccessException e) { + System.err.println("IllegalAccessException"); + } catch (NoSuchFieldException e) { + System.err.println("NoSuchFieldException"); + } + return false; + } }, Text { // in script, style etc. normally treated as data tags diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index ae64918dc9..5409198ea5 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -107,6 +107,19 @@ public static Document parse(String html, String baseUri) { return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); } + /** + * Parse HTML into a Document with encoding illegal tags. + * + * @param html HTML to parse + * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * + * @return parsed Document + */ + public static Document parseLegalTag(String html, String baseUri) { + TreeBuilder treeBuilder = new HtmlTreeBuilder(); + return treeBuilder.parseLegalTag(new StringReader(html), baseUri, new Parser(treeBuilder)); + } + /** * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. * diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java index 5292705c17..9bbe053e05 100644 --- a/src/main/java/org/jsoup/parser/TreeBuilder.java +++ b/src/main/java/org/jsoup/parser/TreeBuilder.java @@ -22,6 +22,7 @@ abstract class TreeBuilder { protected String baseUri; // current base uri, for creating new elements protected Token currentToken; // currentToken is used only for error tracking. 
protected ParseSettings settings; + protected boolean legalize; // if encoding illegal tags private Token.StartTag start = new Token.StartTag(); // start tag to process private Token.EndTag end = new Token.EndTag(); @@ -40,6 +41,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) { tokeniser = new Tokeniser(reader, parser.getErrors()); stack = new ArrayList<>(32); this.baseUri = baseUri; + legalize = false; } Document parse(Reader input, String baseUri, Parser parser) { @@ -55,6 +57,19 @@ Document parse(Reader input, String baseUri, Parser parser) { return doc; } + Document parseLegalTag(Reader input, String baseUri, Parser parser) { + initialiseParse(input, baseUri, parser); + this.legalize = true; + runParser(); + + reader.close(); + reader = null; + tokeniser = null; + stack = null; + + return doc; + } + abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser); protected void runParser() { diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java index 000ff97888..d281e5a12d 100644 --- a/src/test/java/org/jsoup/parser/HtmlParserTest.java +++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java @@ -1362,4 +1362,11 @@ public void testUNewlines() { doc.outputSettings().prettyPrint(false); assertEquals("One

Hello!

There

", doc.outerHtml()); } + + @Test public void testParseLegalTag() { + String html = "

Try jsoup

"; + Document doc = Jsoup.parseLegalTag(html); + doc.outputSettings().prettyPrint(false); + assertEquals("

Try <aaa>jsoup</aaa>

<hello>
", doc.outerHtml()); + } }