diff --git a/CHANGES.md b/CHANGES.md index d7559c2ef6..8f0d8a4998 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,35 +1,50 @@ # jsoup Changelog -## 1.17.2 (Pending) +## 1.18.1 (Pending) ### Improvements -* Added `Element.attribute(String)` and `Attributes.attribute(String)` to more simply obtain an `Attribute` object. - [2069](https://github.com/jhy/jsoup/issues/2069) -* If source tracking is on, and an Attribute's key is changed (via `Attribute.setKey(String)`), the source range is - now still tracked in `Attribute.sourceRange()`. [2070](https://github.com/jhy/jsoup/issues/2070) -* Added support for the `[*]` element with any attribute selector. And also restored support for selecting by an empty - attribute name prefix (`[^]`). [2079](https://github.com/jhy/jsoup/issues/2079) +* Added `Path` accepting parse methods: `Jsoup.parse(Path)`, `Jsoup.parse(path, charsetName, baseUri, parser)`, + etc. [2055](https://github.com/jhy/jsoup/pull/2055) + +### Changes + +* Removed previously deprecated internal classes and methods. [2094](https://github.com/jhy/jsoup/pull/2094) + +--- + +## 1.17.2 (2023-Dec-29) + +### Improvements + +* **Attribute object accessors**: Added `Element.attribute(String)` and `Attributes.attribute(String)` to more simply + obtain an `Attribute` object. [2069](https://github.com/jhy/jsoup/issues/2069) +* **Attribute source tracking**: If source tracking is on, and an Attribute's key is changed ( + via `Attribute.setKey(String)`), the source range is now still tracked + in `Attribute.sourceRange()`. [2070](https://github.com/jhy/jsoup/issues/2070) +* **Wildcard attribute selector**: Added support for the `[*]` element with any attribute selector. And also restored + support for selecting by an empty attribute name prefix (`[^]`). 
[2079](https://github.com/jhy/jsoup/issues/2079) ### Bug Fixes -* When tracking the source position of attributes, if source attribute name was mix-cased but the parser was - lower-case normalizing attribute names, the source position for that attribute was not tracked - correctly. [2067](https://github.com/jhy/jsoup/issues/2067) -* When tracking the source position of a body fragment parse, a null pointer exception was - thrown. [2068](https://github.com/jhy/jsoup/issues/2068) -* A multi-point encoded emoji entity may be incorrectly decoded to the replacement +* **Mixed-cased source position**: When tracking the source position of attributes, if the source attribute name was + mix-cased but the parser was lower-case normalizing attribute names, the source position for that attribute was not + tracked correctly. [2067](https://github.com/jhy/jsoup/issues/2067) +* **Source position NPE**: When tracking the source position of a body fragment parse, a null pointer + exception was thrown. [2068](https://github.com/jhy/jsoup/issues/2068) +* **Multi-point emoji entity**: A multi-point encoded emoji entity may be incorrectly decoded to the replacement character. [2074](https://github.com/jhy/jsoup/issues/2074) -* (Regression) in a selector like `parent [attr=va], other`, the `, OR` was binding to `[attr=va]` instead of - `parent [attr=va]`, causing incorrect selections. The fix includes a EvaluatorDebug class that generates a sexpr - to represent the query, allowing simpler and more thorough query parse +* **Selector sub-expressions**: (Regression) in a selector like `parent [attr=va], other`, the `, OR` was binding + to `[attr=va]` instead of `parent [attr=va]`, causing incorrect selections. The fix includes a EvaluatorDebug class + that generates a sexpr to represent the query, allowing simpler and more thorough query parse tests. 
[2073](https://github.com/jhy/jsoup/issues/2073) -* When generating XML-syntax output from parsed HTML, script nodes containing (pseudo) CData sections would have an - extraneous CData section added, causing script execution errors. Now, the data content is emitted in a HTML/XML/XHTML - polyglot format, if the data is not already within a CData section. [2078](https://github.com/jhy/jsoup/issues/2078) -* The `:has` evaluator held a non-thread-safe Iterator, and so if an Evaluator object was shared across multiple - concurrent threads, a NoSuchElement exception may be thrown, and the selected results may be incorrect. Now, the - iterator object is a thread-local. [2088](https://github.com/jhy/jsoup/issues/2088) +* **XML CData output**: When generating XML-syntax output from parsed HTML, script nodes containing (pseudo) CData + sections would have an extraneous CData section added, causing script execution errors. Now, the data content is + emitted in a HTML/XML/XHTML polyglot format, if the data is not already within a CData + section. [2078](https://github.com/jhy/jsoup/issues/2078) +* **Thread safety**: The `:has` evaluator held a non-thread-safe Iterator, and so if an Evaluator object was + shared across multiple concurrent threads, a NoSuchElement exception may be thrown, and the selected results may be + incorrect. Now, the iterator object is a thread-local. [2088](https://github.com/jhy/jsoup/issues/2088) --- Older changes for versions 0.1.1 (2010-Jan-31) through 1.17.1 (2023-Nov-27) may be found in diff --git a/pom.xml b/pom.xml index cffd1a99f7..e25abaec3e 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.jsoup jsoup - 1.17.2-SNAPSHOT + 1.18.1-SNAPSHOT https://jsoup.org/ jsoup is a Java library that simplifies working with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. 
jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers. 2009 @@ -42,7 +42,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.12.0 + 3.12.1 UTF-8 @@ -88,13 +88,21 @@ 2.3.3_r2 + java.io.File + java.nio.file.* + java.nio.channels.SeekableByteChannel java.util.function.* java.util.stream.* + java.lang.Throwable java.lang.ThreadLocal java.io.UncheckedIOException + java.util.Comparator java.util.List + java.util.LinkedHashMap + java.util.Map java.util.Objects java.util.Optional + java.util.Set java.util.Spliterator java.util.Spliterators @@ -227,7 +235,7 @@ org.jsoup jsoup - 1.16.2 + 1.17.1 jar @@ -237,7 +245,8 @@ true true - + @java.lang.Deprecated + org.jsoup.UncheckedIOException diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java index 29acbafbc4..e20311bcd8 100644 --- a/src/main/java/org/jsoup/Jsoup.java +++ b/src/main/java/org/jsoup/Jsoup.java @@ -13,6 +13,7 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; +import java.nio.file.Path; /** The core public access point to the jsoup functionality. @@ -183,6 +184,72 @@ public static Document parse(File file, @Nullable String charsetName, String bas return DataUtil.load(file, charsetName, baseUri, parser); } + /** + Parse the contents of a file as HTML. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 
+ @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName, String baseUri) throws IOException { + return DataUtil.load(path, charsetName, baseUri); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @see #parse(Path, String, String) parse(path, charset, baseUri) + @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName) throws IOException { + return DataUtil.load(path, charsetName, path.toAbsolutePath().toString()); + } + + /** + Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + The charset used to read the file will be determined by the byte-order-mark (BOM), or a {@code <meta charset>} tag, + or if neither is present, will be {@code UTF-8}. + +

 This is the equivalent of calling {@link #parse(Path, String) parse(path, null)}

+ + @param path the file to load HTML from. Supports gzipped files (ending in .z or .gz). + @return sane HTML + @throws IOException if the file could not be found or read. + @see #parse(Path, String, String) parse(file, charset, baseUri) + @since 1.18.1 + */ + public static Document parse(Path path) throws IOException { + return DataUtil.load(path, null, path.toAbsolutePath().toString()); + } + + /** + Parse the contents of a file as HTML. + + @param path file to load HTML from. Supports gzipped files (ending in .z or .gz). + @param charsetName (optional) character set of file contents. Set to {@code null} to determine from {@code http-equiv} meta tag, if + present, or fall back to {@code UTF-8} (which is often safe to do). + @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + @param parser alternate {@link Parser#xmlParser() parser} to use. + @return sane HTML + + @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + @since 1.18.1 + */ + public static Document parse(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + return DataUtil.load(path, charsetName, baseUri, parser); + } + /** Read an input stream, and parse it to a Document. diff --git a/src/main/java/org/jsoup/UncheckedIOException.java b/src/main/java/org/jsoup/UncheckedIOException.java index dd6a76b5a6..a3b4fa31b6 100644 --- a/src/main/java/org/jsoup/UncheckedIOException.java +++ b/src/main/java/org/jsoup/UncheckedIOException.java @@ -6,7 +6,7 @@ * @deprecated Use {@link java.io.UncheckedIOException} instead. This class acted as a compatibility shim for Java * versions prior to 1.8. 
*/ -// todo annotate @Deprecated in next release (after previous @Deprecations clear) +@Deprecated public class UncheckedIOException extends java.io.UncheckedIOException { public UncheckedIOException(IOException cause) { super(cause); diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index c1c791053c..58f44fb7c0 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -2,7 +2,6 @@ import org.jsoup.internal.ControllableInputStream; import org.jsoup.internal.Normalizer; -import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Comment; import org.jsoup.nodes.Document; @@ -16,7 +15,6 @@ import java.io.BufferedReader; import java.io.CharArrayReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -25,8 +23,12 @@ import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.nio.channels.Channels; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Locale; import java.util.Random; import java.util.regex.Matcher; @@ -63,7 +65,7 @@ private DataUtil() {} * @throws IOException on IO error */ public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { - return load(file, charsetName, baseUri, Parser.htmlParser()); + return load(file.toPath(), charsetName, baseUri); } /** @@ -81,18 +83,48 @@ public static Document load(File file, @Nullable String charsetName, String base * @since 1.14.2 */ public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { - InputStream stream = new FileInputStream(file); - String name = 
Normalizer.lowerCase(file.getName()); - if (name.endsWith(".gz") || name.endsWith(".z")) { - // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read - boolean zipped; - try { - zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes - } finally { - stream.close(); + return load(file.toPath(), charsetName, baseUri, parser); + } + + /** + * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in + * the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException { + return load(path, charsetName, baseUri, Parser.htmlParser()); + } + /** + * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in + * the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate {@link Parser#xmlParser() parser} to use. 
+ + * @return Document + * @throws IOException on IO error + * @since 1.18.1 + */ + public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { + final SeekableByteChannel byteChannel = Files.newByteChannel(path); + InputStream stream = Channels.newInputStream(byteChannel); + String name = Normalizer.lowerCase(path.getFileName().toString()); + if (name.endsWith(".gz") || name.endsWith(".z")) { + final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes + byteChannel.position(0); // reset to start of file + if (zipped) { + stream = new GZIPInputStream(stream); } - stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); } return parseInputStream(stream, charsetName, baseUri, parser); } @@ -139,16 +171,15 @@ static void crossStreams(final InputStream in, final OutputStream out) throws IO static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { if (input == null) // empty body return new Document(baseUri); - input = ControllableInputStream.wrap(input, DefaultBufferSize, 0); @Nullable Document doc = null; // read the start of the stream and look for a BOM or meta charset - try { - input.mark(DefaultBufferSize); - ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 
+ boolean fullyRead = (wrappedInputStream.read() == -1); + wrappedInputStream.reset(); // look for BOM - overrides any other header or input BomCharset bomCharset = detectCharsetFromBom(firstBytes); @@ -189,9 +220,8 @@ else if (first instanceof Comment) { if (comment.isXmlDeclaration()) decl = comment.asXmlDeclaration(); } - if (decl != null) { - if (decl.name().equalsIgnoreCase("xml")) - foundCharset = decl.attr("encoding"); + if (decl != null && decl.name().equalsIgnoreCase("xml")) { + foundCharset = decl.attr("encoding"); } } foundCharset = validateCharset(foundCharset); @@ -208,8 +238,7 @@ else if (first instanceof Comment) { if (doc == null) { if (charsetName == null) charsetName = defaultCharsetName; - BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources - try { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(wrappedInputStream, Charset.forName(charsetName)), DefaultBufferSize)) { if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here long skipped = reader.skip(1); Validate.isTrue(skipped == 1); // WTF if this fails. 
@@ -227,14 +256,8 @@ else if (first instanceof Comment) { doc.charset(UTF_8); } } - finally { - reader.close(); - } } } - finally { - input.close(); - } return doc; } diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index fc9467aeba..ef3d2024d1 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -5,6 +5,7 @@ import org.jsoup.UncheckedIOException; import org.jsoup.UnsupportedMimeTypeException; import org.jsoup.internal.ControllableInputStream; +import org.jsoup.internal.Functions; import org.jsoup.internal.SharedConstants; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Document; @@ -39,6 +40,7 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.function.Function; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.util.zip.Inflater; @@ -1109,13 +1111,8 @@ private static LinkedHashMap> createHeaderMap(HttpURLConnec if (key == null || val == null) continue; // skip http1.1 line - if (headers.containsKey(key)) - headers.get(key).add(val); - else { - final ArrayList vals = new ArrayList<>(); - vals.add(val); - headers.put(key, vals); - } + final List vals = headers.computeIfAbsent(key, Functions.listFunction()); + vals.add(val); } return headers; } diff --git a/src/main/java/org/jsoup/internal/ConstrainableInputStream.java b/src/main/java/org/jsoup/internal/ConstrainableInputStream.java deleted file mode 100644 index 8f382ea042..0000000000 --- a/src/main/java/org/jsoup/internal/ConstrainableInputStream.java +++ /dev/null @@ -1,100 +0,0 @@ -package org.jsoup.internal; - -import org.jsoup.helper.DataUtil; -import org.jsoup.helper.Validate; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.SocketTimeoutException; -import java.nio.ByteBuffer; - -/** - * A jsoup internal class (so don't 
use it as there is no contract API) that enables constraints on an Input Stream, - * namely a maximum read size, and the ability to Thread.interrupt() the read. - * @deprecated use {@link ControllableInputStream} instead (but don't use that either, because this is jsoup internal!) - */ -@Deprecated -public final class ConstrainableInputStream extends BufferedInputStream { - private final boolean capped; - private final int maxSize; - private long startTime; - private long timeout = 0; // optional max time of request - private int remaining; - private boolean interrupted; - - private ConstrainableInputStream(InputStream in, int bufferSize, int maxSize) { - super(in, bufferSize); - Validate.isTrue(maxSize >= 0); - this.maxSize = maxSize; - remaining = maxSize; - capped = maxSize != 0; - startTime = System.nanoTime(); - } - - /** - * If this InputStream is not already a ConstrainableInputStream, let it be one. - * @param in the input stream to (maybe) wrap - * @param bufferSize the buffer size to use when reading - * @param maxSize the maximum size to allow to be read. 0 == infinite. - * @return a constrainable input stream - */ - public static ConstrainableInputStream wrap(InputStream in, int bufferSize, int maxSize) { - return in instanceof ConstrainableInputStream - ? 
(ConstrainableInputStream) in - : new ConstrainableInputStream(in, bufferSize, maxSize); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - if (interrupted || capped && remaining <= 0) - return -1; - if (Thread.currentThread().isInterrupted()) { - // interrupted latches, because parse() may call twice - interrupted = true; - return -1; - } - if (expired()) - throw new SocketTimeoutException("Read timeout"); - - if (capped && len > remaining) - len = remaining; // don't read more than desired, even if available - - try { - final int read = super.read(b, off, len); - remaining -= read; - return read; - } catch (SocketTimeoutException e) { - return 0; - } - } - - /** - * Reads this inputstream to a ByteBuffer. The supplied max may be less than the inputstream's max, to support - * reading just the first bytes. - */ - public ByteBuffer readToByteBuffer(int max) throws IOException { - return DataUtil.readToByteBuffer(this, max); - } - - @Override - public void reset() throws IOException { - super.reset(); - remaining = maxSize - markpos; - } - - public ConstrainableInputStream timeout(long startTimeNanos, long timeoutMillis) { - this.startTime = startTimeNanos; - this.timeout = timeoutMillis * 1000000; - return this; - } - - private boolean expired() { - if (timeout == 0) - return false; - - final long now = System.nanoTime(); - final long dur = now - startTime; - return (dur > timeout); - } -} diff --git a/src/main/java/org/jsoup/internal/FieldsAreNonnullByDefault.java b/src/main/java/org/jsoup/internal/FieldsAreNonnullByDefault.java deleted file mode 100644 index 9e099b4ee5..0000000000 --- a/src/main/java/org/jsoup/internal/FieldsAreNonnullByDefault.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.jsoup.internal; - -import org.jspecify.annotations.NullMarked; - -import java.lang.annotation.Documented; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; - -/** - @deprecated Previously indicated that 
fields types are not nullable, unless otherwise specified by @Nullable. - */ -@Deprecated -@Documented -@NullMarked -@Retention(value = RetentionPolicy.CLASS) -public @interface FieldsAreNonnullByDefault { -} diff --git a/src/main/java/org/jsoup/internal/Functions.java b/src/main/java/org/jsoup/internal/Functions.java new file mode 100644 index 0000000000..40227d8417 --- /dev/null +++ b/src/main/java/org/jsoup/internal/Functions.java @@ -0,0 +1,40 @@ +package org.jsoup.internal; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; + +/** + * An internal class containing functions for use with {@link Map#computeIfAbsent(Object, Function)}. + */ +@SuppressWarnings({"rawtypes", "unchecked"}) +public final class Functions { + private static final Function ListFunction = key -> new ArrayList<>(); + private static final Function SetFunction = key -> new HashSet<>(); + private static final Function MapFunction = key -> new HashMap<>(); + private static final Function IdentityMapFunction = key -> new IdentityHashMap<>(); + + private Functions() { + } + + public static Function> listFunction() { + return (Function>) ListFunction; + } + + public static Function> setFunction() { + return (Function>) SetFunction; + } + + public static Function> mapFunction() { + return (Function>) MapFunction; + } + + public static Function> identityMapFunction() { + return (Function>) IdentityMapFunction; + } +} diff --git a/src/main/java/org/jsoup/internal/NonnullByDefault.java b/src/main/java/org/jsoup/internal/NonnullByDefault.java deleted file mode 100644 index cda055d4f2..0000000000 --- a/src/main/java/org/jsoup/internal/NonnullByDefault.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.jsoup.internal; - -import org.jspecify.annotations.NullMarked; - -import java.lang.annotation.Documented; -import 
java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; - -/** - @deprecated Previously indicated that all components (methods, returns, fields) are not nullable, unless otherwise specified by @Nullable. - */ -@Deprecated -@Documented -@NullMarked -@Retention(value = RetentionPolicy.CLASS) -public @interface NonnullByDefault { -} diff --git a/src/main/java/org/jsoup/internal/ReturnsAreNonnullByDefault.java b/src/main/java/org/jsoup/internal/ReturnsAreNonnullByDefault.java deleted file mode 100644 index d218d6cb7a..0000000000 --- a/src/main/java/org/jsoup/internal/ReturnsAreNonnullByDefault.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.jsoup.internal; - -import org.jspecify.annotations.NullMarked; - -import java.lang.annotation.Documented; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; - -/** - @deprecated Previously indicated that return types are not nullable, unless otherwise specified by @Nullable. - */ -@Deprecated -@Documented -@NullMarked -@Retention(value = RetentionPolicy.RUNTIME) -public @interface ReturnsAreNonnullByDefault { -} diff --git a/src/main/java/org/jsoup/internal/StringUtil.java b/src/main/java/org/jsoup/internal/StringUtil.java index 85e104ef16..3be5f7d809 100644 --- a/src/main/java/org/jsoup/internal/StringUtil.java +++ b/src/main/java/org/jsoup/internal/StringUtil.java @@ -10,6 +10,8 @@ import java.util.Iterator; import java.util.Stack; import java.util.regex.Pattern; +import java.util.stream.Collector; +import java.util.stream.Collectors; /** A minimal String utility class. Designed for internal jsoup use only - the API and outcome may change without @@ -375,6 +377,23 @@ public static String releaseBuilder(StringBuilder sb) { return string; } + /** + * Return a {@link Collector} similar to the one returned by {@link Collectors#joining(CharSequence)}, + * but backed by jsoup's {@link StringJoiner}, which allows for more efficient garbage collection. 
+ * + * @param delimiter The delimiter for separating the strings. + * @return A {@code Collector} which concatenates CharSequence elements, separated by the specified delimiter + */ + public static Collector joining(String delimiter) { + return Collector.of(() -> new StringJoiner(delimiter), + StringJoiner::add, + (j1, j2) -> { + j1.append(j2.complete()); + return j1; + }, + StringJoiner::complete); + } + private static final int MaxCachedBuilderSize = 8 * 1024; private static final int MaxIdleBuilders = 8; } diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 41ba2e9482..9d35cc18e7 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -6,6 +6,7 @@ import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; import org.jsoup.parser.Tag; +import org.jsoup.parser.TokenQueue; import org.jsoup.select.Collector; import org.jsoup.select.Elements; import org.jsoup.select.Evaluator; @@ -970,12 +971,9 @@ private String cssSelectorComponent() { // Escape tagname, and translate HTML namespace ns:tag to CSS namespace syntax ns|tag String tagName = escapeCssIdentifier(tagName()).replace("\\:", "|"); StringBuilder selector = StringUtil.borrowBuilder().append(tagName); - // String classes = StringUtil.join(classNames().stream().map(TokenQueue::escapeCssIdentifier).iterator(), "."); - // todo - replace with ^^ in 1.16.1 when we enable Android support for stream etc - StringUtil.StringJoiner escapedClasses = new StringUtil.StringJoiner("."); - for (String name : classNames()) escapedClasses.add(escapeCssIdentifier(name)); - String classes = escapedClasses.complete(); - if (classes.length() > 0) + String classes = classNames().stream().map(TokenQueue::escapeCssIdentifier) + .collect(StringUtil.joining(".")); + if (!classes.isEmpty()) selector.append('.').append(classes); if (parent() == null || parent() instanceof Document) // don't add Document to selector, as 
will always have a html node diff --git a/src/main/java/org/jsoup/nodes/Range.java b/src/main/java/org/jsoup/nodes/Range.java index 955c043a8c..ccc1db5f51 100644 --- a/src/main/java/org/jsoup/nodes/Range.java +++ b/src/main/java/org/jsoup/nodes/Range.java @@ -95,12 +95,6 @@ static Range of(Node node, boolean start) { return range != null ? (Range) range : Untracked; } - /** - @deprecated no-op; internal method moved out of visibility - */ - @Deprecated - public void track(Node node, boolean start) {} - @Override public boolean equals(Object o) { if (this == o) return true; diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java index 8ef653eee3..bc4b612d49 100644 --- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java @@ -131,20 +131,6 @@ void insertDoctypeFor(Token.Doctype token) { insertLeafNode(doctypeNode); } - /** @deprecated unused and will be removed. */ - @Deprecated - protected void insertNode(Node node) { - currentElement().appendChild(node); - onNodeInserted(node); - } - - /** @deprecated unused and will be removed. */ - @Deprecated - protected void insertNode(Node node, Token token) { - currentElement().appendChild(node); - onNodeInserted(node); - } - /** * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not * found, skips. 
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index eb1281ba6d..629522fc32 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -6,6 +6,7 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ */ import org.jsoup.helper.Validate; +import org.jsoup.internal.Functions; import org.jsoup.internal.Normalizer; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; @@ -304,12 +305,8 @@ public Safelist addAttributes(String tag, String... attributes) { Validate.notEmpty(key); attributeSet.add(AttributeKey.valueOf(key)); } - if (this.attributes.containsKey(tagName)) { - Set currentSet = this.attributes.get(tagName); - currentSet.addAll(attributeSet); - } else { - this.attributes.put(tagName, attributeSet); - } + Set currentSet = this.attributes.computeIfAbsent(tagName, Functions.setFunction()); + currentSet.addAll(attributeSet); return this; } @@ -382,13 +379,8 @@ public Safelist addEnforcedAttribute(String tag, String attribute, String value) AttributeKey attrKey = AttributeKey.valueOf(attribute); AttributeValue attrVal = AttributeValue.valueOf(value); - if (enforcedAttributes.containsKey(tagName)) { - enforcedAttributes.get(tagName).put(attrKey, attrVal); - } else { - Map attrMap = new HashMap<>(); - attrMap.put(attrKey, attrVal); - enforcedAttributes.put(tagName, attrMap); - } + Map attrMap = enforcedAttributes.computeIfAbsent(tagName, Functions.mapFunction()); + attrMap.put(attrKey, attrVal); return this; } @@ -458,21 +450,9 @@ public Safelist addProtocols(String tag, String attribute, String... 
protocols) TagName tagName = TagName.valueOf(tag); AttributeKey attrKey = AttributeKey.valueOf(attribute); - Map> attrMap; - Set protSet; - - if (this.protocols.containsKey(tagName)) { - attrMap = this.protocols.get(tagName); - } else { - attrMap = new HashMap<>(); - this.protocols.put(tagName, attrMap); - } - if (attrMap.containsKey(attrKey)) { - protSet = attrMap.get(attrKey); - } else { - protSet = new HashSet<>(); - attrMap.put(attrKey, protSet); - } + Map> attrMap = this.protocols.computeIfAbsent(tagName, Functions.mapFunction()); + Set protSet = attrMap.computeIfAbsent(attrKey, Functions.setFunction()); + for (String protocol : protocols) { Validate.notEmpty(protocol); Protocol prot = Protocol.valueOf(protocol); diff --git a/src/main/java/org/jsoup/select/CombiningEvaluator.java b/src/main/java/org/jsoup/select/CombiningEvaluator.java index 25e5eac2ee..ea442f0dc3 100644 --- a/src/main/java/org/jsoup/select/CombiningEvaluator.java +++ b/src/main/java/org/jsoup/select/CombiningEvaluator.java @@ -7,15 +7,15 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.Comparator; +import java.util.List; /** * Base combining (and, or) evaluator. 
*/ public abstract class CombiningEvaluator extends Evaluator { final ArrayList evaluators; // maintain original order so that #toString() is sensible - final ArrayList sortedEvaluators; // cost ascending order + final List sortedEvaluators; // cost ascending order int num = 0; int cost = 0; @@ -62,12 +62,9 @@ void updateEvaluators() { } sortedEvaluators.clear(); sortedEvaluators.addAll(evaluators); - Collections.sort(sortedEvaluators, costComparator); + sortedEvaluators.sort(Comparator.comparingInt(Evaluator::cost)); } - private static final Comparator costComparator = (o1, o2) -> o1.cost() - o2.cost(); - // ^ comparingInt, sortedEvaluators.sort not available in targeted version - public static final class And extends CombiningEvaluator { And(Collection evaluators) { super(evaluators); diff --git a/src/main/java/org/jsoup/select/StructuralEvaluator.java b/src/main/java/org/jsoup/select/StructuralEvaluator.java index ce5051fb85..ca7beacb15 100644 --- a/src/main/java/org/jsoup/select/StructuralEvaluator.java +++ b/src/main/java/org/jsoup/select/StructuralEvaluator.java @@ -1,11 +1,13 @@ package org.jsoup.select; +import org.jsoup.internal.Functions; import org.jsoup.internal.StringUtil; import org.jsoup.nodes.Element; import org.jsoup.nodes.NodeIterator; import java.util.ArrayList; import java.util.IdentityHashMap; +import java.util.Map; /** * Base structural evaluator. 
@@ -23,19 +25,9 @@ public StructuralEvaluator(Evaluator evaluator) { threadMemo = ThreadLocal.withInitial(IdentityHashMap::new); boolean memoMatches(final Element root, final Element element) { - // not using computeIfAbsent, as the lambda impl requires a new Supplier closure object on every hit: tons of GC - IdentityHashMap> rootMemo = threadMemo.get(); - IdentityHashMap memo = rootMemo.get(root); - if (memo == null) { - memo = new IdentityHashMap<>(); - rootMemo.put(root, memo); - } - Boolean matches = memo.get(element); - if (matches == null) { - matches = evaluator.matches(root, element); - memo.put(element, matches); - } - return matches; + Map> rootMemo = threadMemo.get(); + Map memo = rootMemo.computeIfAbsent(root, Functions.identityMapFunction()); + return memo.computeIfAbsent(element, key -> evaluator.matches(root, key)); } @Override protected void reset() { @@ -163,34 +155,6 @@ public String toString() { } } - /** - @deprecated replaced by {@link ImmediateParentRun} - */ - @Deprecated - static class ImmediateParent extends StructuralEvaluator { - public ImmediateParent(Evaluator evaluator) { - super(evaluator); - } - - @Override - public boolean matches(Element root, Element element) { - if (root == element) - return false; - - Element parent = element.parent(); - return parent != null && memoMatches(root, parent); - } - - @Override protected int cost() { - return 1 + evaluator.cost(); - } - - @Override - public String toString() { - return String.format("%s > ", evaluator); - } - } - /** Holds a list of evaluators for one > two > three immediate parent matches, and the final direct evaluator under test. To match, these are effectively ANDed together, starting from the last, matching up to the first. 
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java index 10074d4ca9..61627aac20 100644 --- a/src/test/java/org/jsoup/helper/DataUtilTest.java +++ b/src/test/java/org/jsoup/helper/DataUtilTest.java @@ -11,8 +11,10 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; import static org.jsoup.integration.ParseTest.getFile; +import static org.jsoup.integration.ParseTest.getPath; import static org.junit.jupiter.api.Assertions.*; public class DataUtilTest { @@ -207,13 +209,21 @@ public void supportsXmlCharsetDeclaration() throws IOException { @Test - public void lLoadsGzipFile() throws IOException { + public void loadsGzipFile() throws IOException { File in = getFile("/htmltests/gzip.html.gz"); Document doc = Jsoup.parse(in, null); assertEquals("Gzip test", doc.title()); assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } + @Test + public void loadsGzipPath() throws IOException { + Path in = getPath("/htmltests/gzip.html.gz"); + Document doc = Jsoup.parse(in, null); + assertEquals("Gzip test", doc.title()); + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); + } + @Test public void loadsZGzipFile() throws IOException { // compressed on win, with z suffix @@ -223,6 +233,15 @@ public void loadsZGzipFile() throws IOException { assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); } + @Test + public void loadsZGzipPath() throws IOException { + // compressed on win, with z suffix + Path in = getPath("/htmltests/gzip.html.z"); + Document doc = Jsoup.parse(in, null); + assertEquals("Gzip test", doc.title()); + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); + } + @Test public void handlesFakeGzipFile() throws IOException { File in = getFile("/htmltests/fake-gzip.html.gz"); @@ -231,6 +250,14 @@ public void handlesFakeGzipFile() throws 
IOException { assertEquals("And should still be readable.", doc.selectFirst("p").text()); } + @Test + public void handlesFakeGzipPath() throws IOException { + Path in = getPath("/htmltests/fake-gzip.html.gz"); + Document doc = Jsoup.parse(in, null); + assertEquals("This is not gzipped", doc.title()); + assertEquals("And should still be readable.", doc.selectFirst("p").text()); + } + // an input stream to give a range of output sizes, that changes on each read static class VaryingReadInputStream extends InputStream { final InputStream in; diff --git a/src/test/java/org/jsoup/integration/ParseTest.java b/src/test/java/org/jsoup/integration/ParseTest.java index 0c5cb2b15c..d84c103497 100644 --- a/src/test/java/org/jsoup/integration/ParseTest.java +++ b/src/test/java/org/jsoup/integration/ParseTest.java @@ -15,6 +15,8 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.*; @@ -133,6 +135,15 @@ public static File getFile(String resourceName) { } } + public static Path getPath(String resourceName) { + try { + URL resource = ParseTest.class.getResource(resourceName); + return resource != null ? 
Paths.get(resource.toURI()) : Paths.get("/404"); + } catch (URISyntaxException e) { + throw new IllegalStateException(e); + } + } + public static InputStream inputStreamFrom(String s) { return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); } diff --git a/src/test/java/org/jsoup/nodes/BuildEntities.java b/src/test/java/org/jsoup/nodes/BuildEntities.java index 423c26805a..c5eb554ae4 100644 --- a/src/test/java/org/jsoup/nodes/BuildEntities.java +++ b/src/test/java/org/jsoup/nodes/BuildEntities.java @@ -105,12 +105,6 @@ private static String d(int d) { return Integer.toString(d, Entities.codepointRadix); } - private static class ByName implements Comparator { - public int compare(CharacterRef o1, CharacterRef o2) { - return o1.name.compareTo(o2.name); - } - } - private static class ByCode implements Comparator { public int compare(CharacterRef o1, CharacterRef o2) { int[] c1 = o1.codepoints; @@ -131,6 +125,6 @@ public int compare(CharacterRef o1, CharacterRef o2) { } } - private static ByName byName = new ByName(); - private static ByCode byCode = new ByCode(); + private static final Comparator byName = Comparator.comparing(ref -> ref.name); + private static final ByCode byCode = new ByCode(); }