Skip to content

Commit

Permalink
Add HeaderValidator with a ruleset based on the WARC 1.1 standard
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Nov 26, 2024
1 parent b3dd69f commit 55353a7
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 4 deletions.
217 changes: 217 additions & 0 deletions src/org/netpreserve/jwarc/HeaderValidator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
package org.netpreserve.jwarc;

import java.util.*;
import java.util.regex.Pattern;

/**
 * Validates {@link MessageHeaders} against a set of per-field rules.
 * <p>
 * A rule can state on which record types a header field is forbidden or mandatory, whether the field may be
 * repeated, and a regular expression that every value of the field must match in full.
 */
public class HeaderValidator {
    // Header field names are compared case-insensitively in every collection below.
    private final Map<String, FieldRule> fields = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    // Sorted set so that "Missing mandatory field" violations are reported in a stable order.
    private final Set<String> mandatoryFields = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
    // Keyed by the exact WARC-Type value; each value is the set of field names required on that type.
    private final Map<String, Set<String>> mandatoryFieldsByRecordType = new HashMap<>();
    private boolean forbidUnknownFields = false;

    /**
     * The rule for a single header field. The mutators return {@code this} so they can be chained.
     * <p>
     * This must stay a non-static inner class: {@link #mandatory()} and {@link #requireOn(String...)} register
     * the field in collections owned by the enclosing validator.
     */
    class FieldRule {
        final String name;
        /** Pattern every value must fully match, or null when values are not checked. */
        Pattern pattern;
        /** Whether the field may occur more than once in the same header block. */
        boolean repeatable;
        /** Record types (WARC-Type values) on which the field must not appear. */
        final Set<String> forbiddenOn = new HashSet<>();

        public FieldRule(String name) {
            this.name = name;
        }

        /** Marks the field as required on every record. */
        public FieldRule mandatory() {
            mandatoryFields.add(name);
            return this;
        }

        /** Sets the pattern values must match in full; null disables value checking. */
        public FieldRule pattern(Pattern pattern) {
            this.pattern = pattern;
            return this;
        }

        /** Allows the field to occur more than once. */
        public FieldRule repeatable() {
            this.repeatable = true;
            return this;
        }

        /** Forbids the field on the given record types. */
        public FieldRule forbidOn(String... recordTypes) {
            Collections.addAll(forbiddenOn, recordTypes);
            return this;
        }

        /** Requires the field on the given record types. */
        public FieldRule requireOn(String... recordTypes) {
            for (String recordType : recordTypes) {
                mandatoryFieldsByRecordType
                        .computeIfAbsent(recordType, k -> new TreeSet<>(String.CASE_INSENSITIVE_ORDER))
                        .add(name);
            }
            return this;
        }
    }

    private HeaderValidator() {
    }

    /**
     * Creates and configures a HeaderValidator object for the WARC 1.1 standard. Extension headers and values will
     * be ignored.
     */
    public static HeaderValidator warc_1_1() {
        return warc_1_1(false);
    }

    /**
     * Creates and configures a HeaderValidator object for the WARC 1.1 standard.
     * <p>
     * The validation of field values is slightly relaxed from the grammar in the WARC 1.1 standard for
     * backwards compatibility with WARC 1.0 and in other cases recommended by the
     * <a href="https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1-annotated/">
     * community annotations</a>.
     *
     * @param forbidExtensions if true unknown headers are forbidden and only standard values are accepted for
     *                         WARC-Type, WARC-Truncated and WARC-Profile.
     * @return a newly configured validator
     */
    public static HeaderValidator warc_1_1(boolean forbidExtensions) {
        // TODO: more complete URI validation
        String uriRegex = "(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?.*";
        // we allow '<' URI '>' in some fields for backwards compatibility with WARC 1.0
        Pattern backwardsCompatibleUri = Pattern.compile("<" + uriRegex + ">|" + uriRegex);
        Pattern uri = Pattern.compile(uriRegex);

        Pattern recordId = Pattern.compile("<" + uriRegex + ">");
        Pattern nonNegativeInteger = Pattern.compile("[0-9]+");
        Pattern date = Pattern.compile("\\d{4}(-\\d{2}(-\\d{2}(T\\d{2}:\\d{2}(:\\d{2}(\\.\\d+)?(Z|[+-]\\d{2}:\\d{2})?)?)?)?)?");

        String ows = "[ \\t]*";
        String token = "[-!#$%&'*+.^_`|~0-9A-Za-z]+";
        // The backslash is excluded from the plain-character class so that it can only be consumed as the
        // start of an escape pair. Letting it match both alternatives makes the expression ambiguous, which
        // causes exponential backtracking on values containing long runs of backslashes.
        String quotedString = "\"(?:[^\"\\\\\\x00-\\x1F\\x7F]|\\\\.)*\"";
        String value = token + "|" + quotedString;
        String parameter = token + "=" + "(" + value + ")";
        // WARC 1.1 errata: allow OWS in media type parameters
        Pattern mediaType = Pattern.compile(token + "/" + token + ows + "(?:;" + ows + parameter + ")*");
        // community recommendation #48: allow / and @ for compatibility with Base32 and Base64
        String digestValue = "[-!#$%&'*+.^_`|~0-9A-Za-z/@]+";
        Pattern labelledDigest = Pattern.compile(token + ":" + digestValue);

        HeaderValidator v = new HeaderValidator();
        v.forbidUnknownFields = forbidExtensions;
        v.field("WARC-Record-ID").mandatory().pattern(recordId);
        v.field("Content-Length").mandatory().pattern(nonNegativeInteger);
        v.field("WARC-Date").mandatory().pattern(date);
        v.field("WARC-Type").mandatory().pattern(forbidExtensions ? Pattern.compile(
                "warcinfo|response|resource|request|metadata|revisit|conversion|continuation") : null);
        v.field("Content-Type").pattern(mediaType);
        v.field("WARC-Concurrent-To").pattern(recordId).repeatable()
                .forbidOn("warcinfo", "conversion", "continuation");
        v.field("WARC-Block-Digest").pattern(labelledDigest);
        v.field("WARC-Payload-Digest").pattern(labelledDigest);
        // TODO: ip address pattern
        v.field("WARC-IP-Address")
                .forbidOn("warcinfo", "conversion", "continuation");
        v.field("WARC-Refers-To")
                .pattern(recordId)
                .forbidOn("warcinfo", "response", "resource", "request", "continuation");
        v.field("WARC-Refers-To-Target-URI")
                .pattern(uri)
                .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation");
        v.field("WARC-Refers-To-Date")
                .pattern(date)
                .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation");
        v.field("WARC-Target-URI")
                .pattern(backwardsCompatibleUri)
                .forbidOn("warcinfo")
                .requireOn("response", "resource", "request", "revisit", "conversion", "continuation");
        v.field("WARC-Truncated")
                .pattern(forbidExtensions ? Pattern.compile("length|time|disconnect|unspecified") : null);
        v.field("WARC-Warcinfo-ID").pattern(recordId).forbidOn("warcinfo");
        v.field("WARC-Filename")
                .forbidOn("revisit", "response", "metadata", "conversion", "resource", "request", "continuation");
        FieldRule profileField = v.field("WARC-Profile")
                .requireOn("revisit");
        if (forbidExtensions) {
            profileField.pattern(Pattern.compile(
                    "\\Qhttp://netpreserve.org/warc/1.1/revisit/identical-payload-digest\\E"
                            + "|\\Qhttp://netpreserve.org/warc/1.1/revisit/server-not-modified\\E"));
            profileField.forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation");
        } else {
            profileField.pattern(backwardsCompatibleUri);
        }
        v.field("WARC-Identified-Payload-Type").pattern(mediaType);
        v.field("WARC-Segment-Number").pattern(nonNegativeInteger);
        // WARC-Segment-Origin-ID carries the record ID of the first segment ("<" uri ">"), not a number.
        v.field("WARC-Segment-Origin-ID").pattern(recordId)
                .requireOn("continuation")
                .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "revisit");
        v.field("WARC-Segment-Total-Length").pattern(nonNegativeInteger)
                .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "revisit");
        return v;
    }

    /** Returns the rule registered for the named field, creating an empty rule on first use. */
    private FieldRule field(String name) {
        return fields.computeIfAbsent(name, FieldRule::new);
    }

    /**
     * Validates the given set of message headers.
     * <p>
     * Per-field violations (unknown field, repetition, forbidden on the record type, invalid value) are reported
     * first in the iteration order of the header map, followed by missing mandatory fields.
     *
     * @param headers the MessageHeaders object containing headers to be validated
     * @return a list of strings describing any validation violations found; empty if there are none
     */
    public List<String> validate(MessageHeaders headers) {
        List<String> violations = new ArrayList<>();

        // When WARC-Type is absent the record-type specific checks are skipped; the missing
        // WARC-Type itself is reported by the mandatory-field check if the ruleset requires it.
        String recordType = headers.first("WARC-Type").orElse(null);
        Map<String, List<String>> headerMap = headers.map();

        for (Map.Entry<String, List<String>> entry : headerMap.entrySet()) {
            String name = entry.getKey();
            List<String> values = entry.getValue();

            FieldRule fieldRule = fields.get(name);
            if (fieldRule == null) {
                if (forbidUnknownFields) {
                    violations.add("Unknown field: " + name);
                }
                continue;
            }

            if (!fieldRule.repeatable && values.size() > 1) {
                violations.add("Field must not be repeated: " + name);
            }

            if (recordType != null && fieldRule.forbiddenOn.contains(recordType)) {
                violations.add("Field not allowed on " + recordType + " record: " + name);
            }

            if (fieldRule.pattern != null) {
                for (String value : values) {
                    if (!fieldRule.pattern.matcher(value).matches()) {
                        violations.add("Field has invalid value: " + value);
                    }
                }
            }
        }

        // Copy the names into a case-insensitive set so the mandatory-field checks below do not
        // depend on how the header map's own key set compares names.
        Set<String> names = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
        names.addAll(headerMap.keySet());

        // Check for fields mandatory on all records
        for (String field : mandatoryFields) {
            if (!names.contains(field)) {
                violations.add("Missing mandatory field: " + field);
            }
        }

        // Check for fields mandatory on specific record types
        if (recordType != null) {
            for (String name : mandatoryFieldsByRecordType.getOrDefault(recordType, Collections.emptySet())) {
                if (!names.contains(name)) {
                    violations.add("Missing mandatory field for " + recordType + " record: " + name);
                }
            }
        }

        return violations;
    }

}
16 changes: 12 additions & 4 deletions src/org/netpreserve/jwarc/MessageHeaders.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.*;
import java.util.regex.Pattern;

import static java.util.Collections.emptyList;
Expand All @@ -21,6 +18,17 @@ public class MessageHeaders {
private static Pattern COMMA_SEPARATOR = Pattern.compile("[ \t]*,[ \t]*");
private Map<String,List<String>> map;

/**
 * Builds a MessageHeaders from alternating header names and values.
 * <p>
 * Names are grouped case-insensitively; when a name occurs more than once its values are collected
 * in the order given, under the spelling of the first occurrence.
 *
 * @param keysAndValues name, value, name, value, ...
 * @throws IllegalArgumentException if an odd number of strings is supplied
 */
public static MessageHeaders of(String... keysAndValues) {
    int count = keysAndValues.length;
    if ((count & 1) == 1) {
        throw new IllegalArgumentException("an even number keysAndValues must be provided");
    }
    Map<String, List<String>> headerMap = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
    int index = 0;
    while (index < count) {
        String name = keysAndValues[index++];
        String value = keysAndValues[index++];
        List<String> valuesForName = headerMap.get(name);
        if (valuesForName == null) {
            valuesForName = new ArrayList<>();
            headerMap.put(name, valuesForName);
        }
        valuesForName.add(value);
    }
    return new MessageHeaders(headerMap);
}

MessageHeaders(Map<String, List<String>> map) {
map.replaceAll((name, values) -> Collections.unmodifiableList(values));
this.map = Collections.unmodifiableMap(map);
Expand Down
21 changes: 21 additions & 0 deletions src/org/netpreserve/jwarc/tools/ValidateTool.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import java.security.DigestException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
Expand Down Expand Up @@ -66,6 +67,7 @@ public NonVerboseLogger() {

private Logger logger;
private boolean verbose;
private HeaderValidator headerValidator;

public ValidateTool(boolean verbose) {
this.verbose = verbose;
Expand Down Expand Up @@ -178,6 +180,12 @@ private boolean validate(WarcReader reader) throws IOException {
while (record != null) {
boolean valid = true;

if (headerValidator != null) {
List<String> headerViolations = headerValidator.validate(record.headers());
headerViolations.forEach(logger::error);
valid &= headerViolations.isEmpty();
}

if (record instanceof WarcCaptureRecord) {
try {
valid = validateCapture(record);
Expand Down Expand Up @@ -251,6 +259,8 @@ private static void usage(int exitValue) {
System.err.println("");
System.err.println("Options:");
System.err.println("");
System.err.println(" --no-header-validation\tskips checking headers against WARC standard rules");
System.err.println(" --forbid-extensions\tdisallows non-standard WARC header fields and values");
System.err.println(" -h / --help\tshow usage message and exit");
System.err.println(" -v / --verbose\tlog information about every WARC record to stdout");
System.err.println("");
Expand All @@ -263,10 +273,18 @@ private static void usage(int exitValue) {
public static void main(String[] args) throws IOException {
int res = 0;
boolean verbose = false;
boolean headerValidation = true;
boolean forbidExtensions = false;
if (args.length == 0)
usage(0);
for (String arg : args) {
switch (arg) {
case "--no-header-validation":
headerValidation = false;
break;
case "--forbid-extensions":
forbidExtensions = true;
break;
case "-h":
case "--help":
usage(0);
Expand All @@ -277,6 +295,9 @@ public static void main(String[] args) throws IOException {
break;
default:
ValidateTool validator = new ValidateTool(verbose);
if (headerValidation) {
validator.headerValidator = HeaderValidator.warc_1_1(forbidExtensions);
}
try (WarcReader reader = new WarcReader(Paths.get(arg))) {
reader.calculateBlockDigest();
if (verbose)
Expand Down
81 changes: 81 additions & 0 deletions test/org/netpreserve/jwarc/HeaderValidatorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package org.netpreserve.jwarc;

import org.junit.Test;

import java.util.*;

import static org.junit.Assert.*;

/**
 * Unit tests for the validator returned by {@code HeaderValidator.warc_1_1()}.
 */
public class HeaderValidatorTest {
    private HeaderValidator headerValidator = HeaderValidator.warc_1_1();

    /** Validates headers built from alternating names and values and returns the violations. */
    private List<String> violationsFor(String... keysAndValues) {
        MessageHeaders headers = MessageHeaders.of(keysAndValues);
        return headerValidator.validate(headers);
    }

    /** Asserts that the violation list is non-empty and contains the expected message. */
    private static void assertViolation(String expected, List<String> violations) {
        assertFalse(violations.isEmpty());
        assertTrue(violations.contains(expected));
    }

    /** A well-formed response record, including a repeated WARC-Concurrent-To, yields no violations. */
    @Test
    public void testValid() {
        List<String> violations = violationsFor(
                "WARC-Record-ID", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f544d>",
                "Content-Length", "123456",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Type", "response",
                "WARC-Target-URI", "http://example.com/",
                "Content-Type", "application/http; msgtype=response",
                "WARC-Concurrent-To", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f5441>",
                "WARC-Concurrent-To", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f5442>");
        assertEquals(Collections.emptyList(), violations);
    }

    /** Omitting WARC-Record-ID is reported as a missing mandatory field. */
    @Test
    public void testMissingMandatoryFields() {
        List<String> violations = violationsFor(
                "Content-Length", "123456",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Type", "response");
        assertViolation("Missing mandatory field: WARC-Record-ID", violations);
    }

    /** A Content-Type value that is not a media type is reported as invalid. */
    @Test
    public void testInvalidPatternValidation() {
        List<String> violations = violationsFor(
                "WARC-Record-ID", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f544d>",
                "Content-Length", "123456",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Type", "response",
                "Content-Type", "invalid_content_type");
        assertViolation("Field has invalid value: invalid_content_type", violations);
    }

    /** Supplying WARC-Date more than once is reported as an illegal repetition. */
    @Test
    public void testNonRepeatableField() {
        List<String> violations = violationsFor(
                "WARC-Record-ID", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f544d>",
                "Content-Length", "123456",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Type", "response",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Date", "2020-01-02T00:00:00Z");
        assertViolation("Field must not be repeated: WARC-Date", violations);
    }

    /** WARC-Filename on a response record is reported as not allowed. */
    @Test
    public void testForbiddenFieldsOnRecordType() {
        List<String> violations = violationsFor(
                "WARC-Record-ID", "<urn:uuid:6c73a5d3-cab3-46b3-b0b6-3b4b617f544d>",
                "Content-Length", "123456",
                "WARC-Date", "2020-01-01T00:00:00Z",
                "WARC-Type", "response",
                "WARC-Filename", "test.warc.gz");
        assertViolation("Field not allowed on response record: WARC-Filename", violations);
    }
}

0 comments on commit 55353a7

Please sign in to comment.