Skip to content

Commit

Permalink
fixes jhy#1230
Browse files Browse the repository at this point in the history
  • Loading branch information
InanisV committed Apr 20, 2020
1 parent 89580cc commit 1b679df
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 0 deletions.
13 changes: 13 additions & 0 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ public static Document parse(String html) {
return Parser.parse(html, "");
}

/**
 Parse HTML into a Document in "legal tag" mode: tags the parser does not recognise are kept as text
 (and so are escaped on output) rather than becoming elements.
 As no base URI is specified, absolute URL detection relies on the HTML including a {@code <base href>} tag.
 @param html HTML to parse
 @return the parsed Document
 @see #parse(String, String)
 */
public static Document parseLegalTag(String html) {
    // no base URI is known for a bare HTML string
    final String noBaseUri = "";
    return Parser.parseLegalTag(html, noBaseUri);
}

/**
* Creates a new {@link Connection} to a URL. Use to fetch and parse a HTML page.
* <p>
Expand Down
47 changes: 47 additions & 0 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.lang.reflect.Field;
import java.util.ArrayList;

import static org.jsoup.internal.StringUtil.inSorted;
Expand Down Expand Up @@ -253,6 +254,20 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
},
InBody {
boolean process(Token t, HtmlTreeBuilder tb) {
if (tb.legalize) {
if (t.type == Token.TokenType.StartTag && !islegal(t.asStartTag().normalName)) {
Token.StartTag st = t.asStartTag();
Token.Character newT = new Token.Character();
newT.data("<" + st.tagName + ">");
t = newT;
} else if (t.type == Token.TokenType.EndTag && !islegal(t.asEndTag().normalName)) {
Token.EndTag et = t.asEndTag();
Token.Character newT = new Token.Character();
newT.data("</" + et.tagName + ">");
t = newT;
}
}

switch (t.type) {
case Character: {
Token.Character c = t.asCharacter();
Expand Down Expand Up @@ -868,6 +883,38 @@ else if (!tb.onStack(formatEl)) {
}
return true;
}

/**
 * Reports whether {@code tagName} appears in one of the parser's tag-name tables: every
 * {@code String[]} field declared on {@code Constants}, plus a fixed list of {@code String[]}
 * fields declared on {@code Tag}. The tables are read reflectively on each call.
 *
 * @param tagName the (normalised) tag name to look up
 * @return {@code true} if any table contains {@code tagName}, otherwise {@code false}
 * @throws IllegalStateException if one of the tables cannot be read reflectively; answering
 *         {@code false} in that case would make every tag look illegal
 */
private boolean islegal(String tagName) {
    try {
        // Tables declared on Constants. Field.get ignores the receiver for static fields, so
        // passing an instance works whether or not the tables are static.
        Constants constantsReceiver = new Constants();
        for (Field table : Constants.class.getDeclaredFields()) {
            // Only String[] tables are of interest; anything else (including fields injected by
            // the compiler or tooling) cannot be cast and is skipped.
            if (table.isSynthetic() || table.getType() != String[].class)
                continue;
            if (containsTag((String[]) table.get(constantsReceiver), tagName))
                return true;
        }

        // Tables declared on Tag; these need to be made accessible before they can be read.
        Tag tagReceiver = Tag.valueOf("noMeaning");
        String[] tagTables = {"blockTags", "inlineTags", "emptyTags", "formatAsInlineTags",
                "preserveWhitespaceTags", "formListedTags", "formSubmitTags"};
        for (String tableName : tagTables) {
            Field table = Tag.class.getDeclaredField(tableName);
            table.setAccessible(true);
            try {
                if (containsTag((String[]) table.get(tagReceiver), tagName))
                    return true;
            } finally {
                // restore the flag on every path, including when the read throws
                table.setAccessible(false);
            }
        }
        return false;
    } catch (IllegalAccessException | NoSuchFieldException e) {
        throw new IllegalStateException(
                "Unable to read the tag tables while checking tag name '" + tagName + "'", e);
    }
}

/**
 * Linear membership test. A sorted-array lookup is deliberately not used here: it gives wrong
 * answers on an unsorted table, and nothing at this call site guarantees the reflected tables
 * are sorted. A linear scan is correct either way.
 */
private boolean containsTag(String[] table, String tagName) {
    if (table == null)
        return false;
    for (String candidate : table) {
        if (candidate != null && candidate.equals(tagName))
            return true;
    }
    return false;
}
},
Text {
// in script, style etc. normally treated as data tags
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,19 @@ public static Document parse(String html, String baseUri) {
return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
}

/**
 * Parse HTML into a Document in "legal tag" mode, where tags the parser does not recognise are
 * kept as text rather than becoming elements.
 *
 * @param html HTML to parse
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return parsed Document
 */
public static Document parseLegalTag(String html, String baseUri) {
    final TreeBuilder builder = new HtmlTreeBuilder();
    final StringReader input = new StringReader(html);
    final Parser parser = new Parser(builder);
    return builder.parseLegalTag(input, baseUri, parser);
}

/**
* Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
*
Expand Down
15 changes: 15 additions & 0 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ abstract class TreeBuilder {
protected String baseUri; // current base uri, for creating new elements
protected Token currentToken; // currentToken is used only for error tracking.
protected ParseSettings settings;
protected boolean legalize; // when true, start/end tags the parser does not recognise are turned into character data instead of elements

private Token.StartTag start = new Token.StartTag(); // start tag to process
private Token.EndTag end = new Token.EndTag();
Expand All @@ -40,6 +41,7 @@ protected void initialiseParse(Reader input, String baseUri, Parser parser) {
tokeniser = new Tokeniser(reader, parser.getErrors());
stack = new ArrayList<>(32);
this.baseUri = baseUri;
legalize = false;
}

Document parse(Reader input, String baseUri, Parser parser) {
Expand All @@ -55,6 +57,19 @@ Document parse(Reader input, String baseUri, Parser parser) {
return doc;
}

/**
 * Runs a complete parse of {@code input} with the {@code legalize} flag switched on, then
 * releases the per-parse state and returns the resulting document.
 */
Document parseLegalTag(Reader input, String baseUri, Parser parser) {
    initialiseParse(input, baseUri, parser);
    // initialiseParse resets the flag to false, so it can only be raised afterwards
    legalize = true;
    runParser();

    // parsing is finished: close the input and drop references to the per-parse state
    reader.close();
    stack = null;
    tokeniser = null;
    reader = null;

    return doc;
}

abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);

protected void runParser() {
Expand Down
7 changes: 7 additions & 0 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1362,4 +1362,11 @@ public void testUNewlines() {
doc.outputSettings().prettyPrint(false);
assertEquals("<html><head></head><body>One <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
}

@Test public void testParseLegalTag() {
    // <p> and <div> stay elements; the unrecognised <aaa>, </aaa> and <hello> come back as escaped text
    final String expected = "<html><head></head><body><p>Try &lt;aaa&gt;jsoup&lt;/aaa&gt;</p><div>&lt;hello&gt;</div></body></html>";

    final Document parsed = Jsoup.parseLegalTag("<p>Try <aaa>jsoup</aaa></p><div><hello></div>");
    parsed.outputSettings().prettyPrint(false);

    assertEquals(expected, parsed.outerHtml());
}
}

0 comments on commit 1b679df

Please sign in to comment.