Skip to content

Commit

Permalink
ExtractTool: Add --concurrent option
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Dec 19, 2024
1 parent 55353a7 commit 7828aa0
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 17 deletions.
42 changes: 42 additions & 0 deletions src/org/netpreserve/jwarc/ConcurrentRecordSet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.netpreserve.jwarc;

import java.net.URI;
import java.util.HashSet;
import java.util.Set;

/**
 * Tracks record identifiers so that later records can be tested for concurrency with the records
 * already seen (i.e. whether they are part of the same capture event).
 */
public class ConcurrentRecordSet {
    /** IDs of added records plus every ID those records declared themselves concurrent to. */
    private final Set<URI> knownIds = new HashSet<>();

    /**
     * Registers a record with the set.
     * <p>
     * The record's own ID is always stored. When the record is a {@code WarcCaptureRecord}, all IDs
     * returned by its {@code concurrentTo()} are stored as well.
     *
     * @param record the record to register
     */
    public void add(WarcRecord record) {
        knownIds.add(record.id());
        if (!(record instanceof WarcCaptureRecord)) {
            return;
        }
        WarcCaptureRecord capture = (WarcCaptureRecord) record;
        knownIds.addAll(capture.concurrentTo());
    }

    /**
     * Checks whether the given record is concurrent to any record added earlier.
     * <p>
     * A record matches when its own ID is already known, or when it is a {@code WarcCaptureRecord}
     * and at least one of the IDs from its {@code concurrentTo()} is already known.
     *
     * @param record the record to test
     * @return {@code true} if the record matches a known ID, {@code false} otherwise
     */
    public boolean contains(WarcRecord record) {
        if (knownIds.contains(record.id())) {
            return true;
        }
        if (!(record instanceof WarcCaptureRecord)) {
            return false;
        }
        WarcCaptureRecord capture = (WarcCaptureRecord) record;
        return capture.concurrentTo().stream().anyMatch(knownIds::contains);
    }

    /**
     * Forgets every ID collected so far, leaving the set empty.
     */
    public void clear() {
        knownIds.clear();
    }
}
54 changes: 37 additions & 17 deletions src/org/netpreserve/jwarc/tools/ExtractTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@
import java.nio.channels.WritableByteChannel;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.*;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;
Expand Down Expand Up @@ -85,6 +83,7 @@ private static void usage(int exitValue) {
System.err.println();
System.err.println("Options:");
System.err.println();
System.err.println(" --concurrent\talso outputs any immediately following concurrent records");
System.err.println(" --headers\toutput only record (and HTTP) headers");
System.err.println(" --payload\toutput only record payload, if necessary");
System.err.println(" \tdecode transfer and/or content encoding");
Expand All @@ -95,11 +94,16 @@ public static void main(String[] args) throws IOException {
ExtractAction action = ExtractAction.RECORD;
Path warcFile = null;
List<Long> offsets = new ArrayList<>();
boolean extractConcurrent = false;
for (String arg : args) {
switch (arg) {
case "-h":
case "--help":
usage(0);
break;
case "--concurrent":
extractConcurrent = true;
break;
case "--headers":
action = ExtractAction.HEADERS;
break;
Expand Down Expand Up @@ -128,7 +132,9 @@ public static void main(String[] args) throws IOException {
}
if (warcFile == null || offsets.isEmpty()) {
usage(1);
return;
}
WritableByteChannel out = Channels.newChannel(System.out);
for (long offset : offsets) {
try (FileChannel channel = FileChannel.open(warcFile);
WarcReader reader = new WarcReader(channel.position(offset))) {
Expand All @@ -137,22 +143,36 @@ public static void main(String[] args) throws IOException {
System.err.println("No record found at position " + offset);
System.exit(1);
}
WritableByteChannel out = Channels.newChannel(System.out);
switch (action) {
case RECORD:
writeWarcHeaders(out, record.get());
writeBody(out, record.get().body());
out.write(ByteBuffer.wrap("\r\n\r\n".getBytes(US_ASCII)));
break;
case HEADERS:
writeWarcHeaders(out, record.get());
writeHttpHeaders(out, record.get());
break;
case PAYLOAD:
writePayload(out, record.get());
break;

writeRecord(record.get(), out, action);

if (extractConcurrent) {
ConcurrentRecordSet concurrentSet = new ConcurrentRecordSet();
while (true) {
concurrentSet.add(record.get());
record = reader.next();
if (!record.isPresent() || !concurrentSet.contains(record.get())) break;
writeRecord(record.get(), out, action);
}
}
}
}
}

/**
 * Writes one record to the output channel in the form selected by {@code action}.
 * <ul>
 * <li>{@code RECORD}: WARC headers, then the record body, then a {@code "\r\n\r\n"} trailer.</li>
 * <li>{@code HEADERS}: WARC headers followed by the output of {@code writeHttpHeaders}.</li>
 * <li>{@code PAYLOAD}: whatever {@code writePayload} emits for the record.</li>
 * </ul>
 * Any other action value results in nothing being written.
 *
 * @param record the record to output
 * @param out    the channel receiving the output
 * @param action which part of the record to output
 * @throws IOException if writing to the channel fails
 */
private static void writeRecord(WarcRecord record, WritableByteChannel out, ExtractAction action) throws IOException {
    switch (action) {
        case PAYLOAD:
            writePayload(out, record);
            return;
        case HEADERS:
            writeWarcHeaders(out, record);
            writeHttpHeaders(out, record);
            return;
        case RECORD: {
            writeWarcHeaders(out, record);
            writeBody(out, record.body());
            // Terminate the record with two CRLF pairs.
            byte[] trailer = "\r\n\r\n".getBytes(US_ASCII);
            out.write(ByteBuffer.wrap(trailer));
            return;
        }
    }
}
}

0 comments on commit 7828aa0

Please sign in to comment.