Skip to content

Commit

Permalink
Avoid using targetURI() when possible
Browse files Browse the repository at this point in the history
It can change the String value of the URI.
  • Loading branch information
ato committed Nov 20, 2024
1 parent 14b80be commit 830a6e7
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 33 deletions.
12 changes: 12 additions & 0 deletions src/org/netpreserve/jwarc/URIs.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,18 @@ public class URIs {
private final static Pattern AUTHORITY_REGEX = Pattern.compile("([^@]*@)?(.*?)(?::([0-9]+))?", DOTALL);
private final static Pattern IPV4_REGEX = Pattern.compile("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}");

/**
 * Returns true if the given string begins with an {@code http:} or {@code https:} URI scheme, compared
 * case-insensitively. Only the prefix is examined; the rest of the string is not checked for being a valid URI.
 *
 * @param uri the string to test; may be null
 * @return true if {@code uri} starts with "http:" or "https:" in any letter case; false otherwise,
 *         including when {@code uri} is null
 */
public static boolean hasHttpOrHttpsScheme(String uri) {
    // A null string has no scheme; report false rather than throwing NullPointerException.
    if (uri == null) {
        return false;
    }
    return startsWithIgnoreCase(uri, "http:") || startsWithIgnoreCase(uri, "https:");
}

/**
 * Tests whether {@code string} starts with {@code prefix}, ignoring case differences.
 *
 * @param string the string to inspect; must not be null
 * @param prefix the prefix to look for; must not be null
 * @return true if the first {@code prefix.length()} characters of {@code string} match {@code prefix}
 */
private static boolean startsWithIgnoreCase(String string, String prefix) {
    // regionMatches with ignoreCase=true compares in place (no lower-cased copies) and simply
    // returns false when string is shorter than prefix.
    return string.regionMatches(true, 0, prefix, 0, prefix.length());
}

public static URI parseLeniently(String uri) {
Matcher m = URL_REGEX.matcher(uri);
if (!m.matches()) {
Expand Down
3 changes: 3 additions & 0 deletions src/org/netpreserve/jwarc/WarcTargetRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ public String target() {

/**
* The URI of the original target resource this record holds information about.
* <p>
* The {@link #target()} method should be preferred unless you actually need an instance of URI, as some WARC files
* may contain a WARC-Target-URI value that cannot be represented as a Java URI instance without changing it.
*/
public URI targetURI() {
return URIs.parseLeniently(target());
Expand Down
13 changes: 10 additions & 3 deletions src/org/netpreserve/jwarc/net/Browser.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand Down Expand Up @@ -38,13 +37,21 @@ public void browse(URI uri) throws IOException {
}

public void screenshot(URI uri, Path outfile) throws IOException {
run("--screenshot=" + outfile, uri.toString());
screenshot(uri.toString(), outfile);
}

/**
 * Requests a screenshot of the given URL, saved to the given file.
 * <p>
 * Delegates to {@code run} with a {@code --screenshot=} option naming {@code outfile}, followed by the URL
 * string exactly as supplied (it is not parsed or normalised here).
 *
 * @param url     the URL to capture, passed through verbatim
 * @param outfile the path appended to the {@code --screenshot=} option
 * @throws IOException if {@code run} fails
 */
public void screenshot(String url, Path outfile) throws IOException {
    String screenshotOption = "--screenshot=" + outfile;
    run(screenshotOption, url);
}

/**
 * Convenience overload that converts the URI to its string form and delegates to the String-based
 * {@code screenshot} overload.
 *
 * @param uri the URI to capture
 * @return the channel returned by the String-based overload
 * @throws IOException if the delegated call fails
 */
public FileChannel screenshot(URI uri) throws IOException {
    String url = uri.toString();
    return screenshot(url);
}

public FileChannel screenshot(String uri) throws IOException {
Path outfile = Files.createTempFile("jwarc-screenshot", ".png");
try {
run("--screenshot=" + outfile, uri.toString());
run("--screenshot=" + outfile, uri);
return FileChannel.open(outfile, DELETE_ON_CLOSE);
} catch (Exception e) {
Files.deleteIfExists(outfile);
Expand Down
14 changes: 4 additions & 10 deletions src/org/netpreserve/jwarc/net/Capture.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,33 +10,27 @@
* Holds the location of a particular captured version of a resource.
*/
class Capture {
private final String urikey;
private final URI uri;
private final String uri;
private final Instant date;
private final Path file;
private final long position;

Capture(URI uri, Instant date) {
Capture(String uri, Instant date) {
this(uri, date, null, -1);
}

Capture(URI uri, Instant date, Path file, long position) {
urikey = uri.toString();
Capture(String uri, Instant date, Path file, long position) {
this.uri = uri;
this.date = date;
this.file = file;
this.position = position;
}

String uriKey() {
return urikey;
}

/**
 * Returns the date this capture was constructed with.
 */
Instant date() {
    return this.date;
}

URI uri() {
String uri() {
return uri;
}

Expand Down
9 changes: 4 additions & 5 deletions src/org/netpreserve/jwarc/net/CaptureIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import static java.util.Comparator.comparing;

public class CaptureIndex {
private final NavigableSet<Capture> entries = new TreeSet<>(comparing(Capture::uriKey).thenComparing(Capture::date));
private final NavigableSet<Capture> entries = new TreeSet<>(comparing(Capture::uri).thenComparing(Capture::date));
private Capture entrypoint;

public CaptureIndex(List<Path> warcs) throws IOException {
Expand All @@ -22,9 +22,8 @@ public CaptureIndex(List<Path> warcs) throws IOException {
for (WarcRecord record : reader) {
if ((record instanceof WarcResponse || record instanceof WarcResource)) {
WarcCaptureRecord capture = (WarcCaptureRecord) record;
String scheme = capture.targetURI().getScheme();
if ("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme)) {
Capture entry = new Capture(capture.targetURI(), capture.date(), warc, reader.position());
if (URIs.hasHttpOrHttpsScheme(capture.target())) {
Capture entry = new Capture(capture.target(), capture.date(), warc, reader.position());
add(entry);
if (entrypoint == null && MediaType.HTML.equals(capture.payloadType().base())) {
entrypoint = entry;
Expand All @@ -40,7 +39,7 @@ void add(Capture capture) {
entries.add(capture);
}

NavigableSet<Capture> query(URI uri) {
NavigableSet<Capture> query(String uri) {
return entries.subSet(new Capture(uri, Instant.MIN), true, new Capture(uri, Instant.MAX), true);
}

Expand Down
8 changes: 6 additions & 2 deletions src/org/netpreserve/jwarc/net/WarcRenderer.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,19 @@ public WarcRenderer(CaptureIndex index, String browserExecutable) throws IOExcep
}

/**
 * Convenience overload that converts the URI to its string form and delegates to the String-based
 * {@code screenshot} overload.
 *
 * @param uri        the URI to capture
 * @param date       passed through unchanged to the String-based overload
 * @param warcWriter passed through unchanged to the String-based overload
 * @throws IOException if the delegated call fails
 */
public void screenshot(URI uri, Instant date, WarcWriter warcWriter) throws IOException {
    String url = uri.toString();
    screenshot(url, date, warcWriter);
}

public void screenshot(String url, Instant date, WarcWriter warcWriter) throws IOException {
Path screenshot = Files.createTempFile("jwarc-screenshot", ".png");
try {
Browser browser = new Browser(browserExecutable, (InetSocketAddress) proxySocket.getLocalSocketAddress(),
"WarcRenderer (arctime/" + ARC_TIME.format(date) + ")");
browser.screenshot(uri, screenshot);
browser.screenshot(url, screenshot);
try (FileChannel channel = FileChannel.open(screenshot)) {
long size = channel.size();
if (size == 0) return;
warcWriter.write(new WarcResource.Builder(URI.create("screenshot:" + uri))
warcWriter.write(new WarcResource.Builder(URI.create("screenshot:" + url))
.date(date)
.body(MediaType.parse("image/png"), channel, size)
.build());
Expand Down
12 changes: 5 additions & 7 deletions src/org/netpreserve/jwarc/net/WarcServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ private Instant parseUserAgentDate(HttpExchange exchange) {
}

private void timemap(HttpExchange exchange) throws IOException {
URI uri = URI.create(exchange.param(1));
NavigableSet<Capture> versions = index.query(uri);
NavigableSet<Capture> versions = index.query(exchange.param(1));
if (versions.isEmpty()) {
exchange.send(404, "Not found in archive");
return;
Expand All @@ -114,7 +113,7 @@ private void replay(HttpExchange exchange) throws IOException {

private void render(HttpExchange exchange) throws IOException {
Instant date = Instant.from(ARC_DATE.parse(exchange.param(1)));
URI uri = URI.create(exchange.param(2));
String uri = exchange.param(2);
NavigableSet<Capture> versions = index.query(uri);
if (versions.isEmpty()) {
exchange.send(404, "Not found in archive");
Expand All @@ -134,13 +133,12 @@ private void render(HttpExchange exchange) throws IOException {
}

private void replay(HttpExchange exchange, String target, Instant date, boolean proxy) throws IOException {
URI uri = URI.create(target);
NavigableSet<Capture> versions = index.query(uri);
NavigableSet<Capture> versions = index.query(target);
if (versions.isEmpty()) {
exchange.send(404, "Not found in archive");
return;
}
Capture capture = closest(versions, uri, date);
Capture capture = closest(versions, target, date);
try (FileChannel channel = FileChannel.open(capture.file(), READ)) {
channel.position(capture.position());
WarcReader reader = new WarcReader(channel);
Expand Down Expand Up @@ -186,7 +184,7 @@ private void mementoLink(StringBuilder sb, String rel, Capture current, Capture
.append(rel).append("memento\";datetime=\"").append(RFC_1123_UTC.format(capture.date())).append("\"");
}

private Capture closest(NavigableSet<Capture> versions, URI uri, Instant date) {
private Capture closest(NavigableSet<Capture> versions, String uri, Instant date) {
Capture key = new Capture(uri, date);
Capture a = versions.floor(key);
Capture b = versions.higher(key);
Expand Down
5 changes: 2 additions & 3 deletions src/org/netpreserve/jwarc/tools/DedupeTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import org.netpreserve.jwarc.cdx.CdxRecord;

import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.channels.FileChannel;
Expand Down Expand Up @@ -74,9 +73,9 @@ private WarcRevisit deduplicate(WarcRecord record) throws IOException {
if (payloadDigest == null) return null;
CdxRecord match = findMatchingRecord(response, payloadDigest.base32());
if (match == null) return null;
return new WarcRevisit.Builder(response.targetURI(), WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_0)
return new WarcRevisit.Builder(response.target(), WarcRevisit.IDENTICAL_PAYLOAD_DIGEST_1_0)
.date(response.date())
.refersTo((URI) null, match.targetURI(), match.date())
.refersTo(null, match.target(), match.date())
.body(response.contentType(), response.http().serializeHeader())
.payloadDigest(payloadDigest)
.build();
Expand Down
5 changes: 2 additions & 3 deletions src/org/netpreserve/jwarc/tools/ScreenshotTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public static void main(String[] args) throws Exception {
for (WarcRecord record : reader) {
if (!isNormalPage(record)) continue;
WarcCaptureRecord capture = (WarcCaptureRecord) record;
renderer.screenshot(capture.targetURI(), capture.date(), warcWriter);
renderer.screenshot(capture.target(), capture.date(), warcWriter);
}
}
}
Expand All @@ -33,8 +33,7 @@ private static boolean isNormalPage(WarcRecord record) throws IOException {
return false;
}
WarcCaptureRecord capture = (WarcCaptureRecord) record;
String scheme = capture.targetURI().getScheme();
if (!("http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme))) {
if (!(URIs.hasHttpOrHttpsScheme(capture.target()))) {
return false;
}
try {
Expand Down

0 comments on commit 830a6e7

Please sign in to comment.