From 55353a7678bfd4df792caf377c90b95bac9ef101 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Tue, 26 Nov 2024 11:35:52 +0900 Subject: [PATCH] Add HeaderValidator with a ruleset based on the WARC 1.1 standard --- .../netpreserve/jwarc/HeaderValidator.java | 217 ++++++++++++++++++ src/org/netpreserve/jwarc/MessageHeaders.java | 16 +- .../netpreserve/jwarc/tools/ValidateTool.java | 21 ++ .../jwarc/HeaderValidatorTest.java | 81 +++++++ 4 files changed, 331 insertions(+), 4 deletions(-) create mode 100644 src/org/netpreserve/jwarc/HeaderValidator.java create mode 100644 test/org/netpreserve/jwarc/HeaderValidatorTest.java diff --git a/src/org/netpreserve/jwarc/HeaderValidator.java b/src/org/netpreserve/jwarc/HeaderValidator.java new file mode 100644 index 0000000..0dc6700 --- /dev/null +++ b/src/org/netpreserve/jwarc/HeaderValidator.java @@ -0,0 +1,217 @@ +package org.netpreserve.jwarc; + +import java.util.*; +import java.util.regex.Pattern; + +/** + * The `HeaderValidator` class validates MessageHeaders based on predefined rules. Rules can include which records types + * a header field is allowed, forbidden or mandatory on and whether the value matches a regular expression. + */ +public class HeaderValidator { + private final Map fields = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + private final Set mandatoryFields = new HashSet<>(); + private final Map> mandatoryFieldsByRecordType = new HashMap<>(); + private boolean forbidUnknownFields = false; + + class FieldRule { + final String name; + Pattern pattern; + boolean repeatable; + final Set forbiddenOn = new HashSet<>(); + + public FieldRule(String name) { + this.name = name; + } + + public FieldRule mandatory() { + mandatoryFields.add(name); + return this; + } + + public FieldRule pattern(Pattern pattern) { + this.pattern = pattern; + return this; + } + + public FieldRule repeatable() { + this.repeatable = true; + return this; + } + + public FieldRule forbidOn(String... recordTypes) { + Collections.addAll(forbiddenOn, recordTypes); + return this; + } + + public FieldRule requireOn(String... recordTypes) { + for (String recordType : recordTypes) { + mandatoryFieldsByRecordType + .computeIfAbsent(recordType, k -> new TreeSet<>(String.CASE_INSENSITIVE_ORDER)) + .add(name); + } + return this; + } + } + + private HeaderValidator() { + } + + /** + * Creates and configures a HeaderValidator object for the WARC 1.1 standard. Extension headers and values will + * be ignored. + */ + public static HeaderValidator warc_1_1() { + return warc_1_1(false); + } + + /** + * Creates and configures a HeaderValidator object for the WARC 1.1 standard. + *

+ * The validation of field values is slightly relaxed from the grammar in the WARC 1.1 standard for + * backwards compatibility with WARC 1.0 and in other cases recommended by the + * + * community annotations. + * + * @param forbidExtensions if true unknown headers are forbidden and only standard values are accepted for + * WARC-Type, WARC-Truncated and WARC-Profile. + */ + public static HeaderValidator warc_1_1(boolean forbidExtensions) { + // TODO: more complete URI validation + String uriRegex = "(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?.*"; + // we allow '<' URI '>' in some fields for backwards compatibility with WARC 1.0 + Pattern backwardsCompatibleUri = Pattern.compile("<" + uriRegex + ">|" + uriRegex); + Pattern uri = Pattern.compile(uriRegex); + + Pattern recordId = Pattern.compile("<" + uriRegex + ">"); + Pattern nonNegativeInteger = Pattern.compile("[0-9]+"); + Pattern date = Pattern.compile("\\d{4}(-\\d{2}(-\\d{2}(T\\d{2}:\\d{2}(:\\d{2}(\\.\\d+)?(Z|[+-]\\d{2}:\\d{2})?)?)?)?)?"); + + String ows = "[ \\t]*"; + String token = "[-!#$%&'*+.^_`|~0-9A-Za-z]+"; + String quotedString = "\"(?:[^\"\\x00-\\x1F\\x7F]|\\\\.)*\""; + String value = token + "|" + quotedString; + String parameter = token + "=" + "(" + value + ")"; + // WARC 1.1 errata: allow OWS in media type parameters + Pattern mediaType = Pattern.compile(token + "/" + token + ows + "(?:;" + ows + parameter + ")*"); + // community recommendation #48: allow / and @ for compatibility with Base32 and Base64 + String digestValue = "[-!#$%&'*+.^_`|~0-9A-Za-z/@]+"; + Pattern labelledDigest = Pattern.compile(token + ":" + digestValue); + + HeaderValidator v = new HeaderValidator(); + v.forbidUnknownFields = forbidExtensions; + v.field("WARC-Record-ID").mandatory().pattern(recordId); + v.field("Content-Length").mandatory().pattern(nonNegativeInteger); + v.field("WARC-Date").mandatory().pattern(date); + v.field("WARC-Type").mandatory().pattern(forbidExtensions ? Pattern.compile( + "warcinfo|response|resource|request|metadata|revisit|conversion|continuation") : null); + v.field("Content-Type").pattern(mediaType); + v.field("WARC-Concurrent-To").pattern(recordId).repeatable() + .forbidOn("warcinfo", "conversion", "continuation"); + v.field("WARC-Block-Digest").pattern(labelledDigest); + v.field("WARC-Payload-Digest").pattern(labelledDigest); + // TODO: ip address pattern + v.field("WARC-IP-Address") + .forbidOn("warcinfo", "conversion", "continuation"); + v.field("WARC-Refers-To") + .pattern(recordId) + .forbidOn("warcinfo", "response", "resource", "request", "continuation"); + v.field("WARC-Refers-To-Target-URI") + .pattern(uri) + .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation"); + v.field("WARC-Refers-To-Date") + .pattern(date) + .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation"); + v.field("WARC-Target-URI") + .pattern(backwardsCompatibleUri) + .forbidOn("warcinfo") + .requireOn("response", "resource", "request", "revisit", "conversion", "continuation"); + v.field("WARC-Truncated") + .pattern(forbidExtensions ? Pattern.compile("length|time|disconnect|unspecified") : null); + v.field("WARC-Warcinfo-ID").pattern(recordId).forbidOn("warcinfo"); + v.field("WARC-Filename") + .forbidOn("revisit", "response", "metadata", "conversion", "resource", "request", "continuation"); + FieldRule profileField = v.field("WARC-Profile") + .requireOn("revisit"); + if (forbidExtensions) { + profileField.pattern(Pattern.compile( + "\\Qhttp://netpreserve.org/warc/1.1/revisit/identical-payload-digest\\E" + + "|\\Qhttp://netpreserve.org/warc/1.1/revisit/server-not-modified\\E")); + profileField.forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "continuation"); + } else { + profileField.pattern(backwardsCompatibleUri); + } + v.field("WARC-Identified-Payload-Type").pattern(mediaType); + v.field("WARC-Segment-Number").pattern(nonNegativeInteger); + v.field("WARC-Segment-Origin-ID").pattern(nonNegativeInteger) + .requireOn("continuation") + .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "revisit"); + v.field("WARC-Segment-Total-Length").pattern(nonNegativeInteger) + .forbidOn("warcinfo", "response", "metadata", "conversion", "resource", "request", "revisit"); + return v; + } + + private FieldRule field(String name) { + return fields.computeIfAbsent(name, FieldRule::new); + } + + /** + * Validates the given set of message headers. + * + * @param headers the MessageHeaders object containing headers to be validated + * @return a list of strings describing any validation violations found + */ + public List validate(MessageHeaders headers) { + List violations = new ArrayList<>(); + + String recordType = headers.first("WARC-Type").orElse(null); + + for (Map.Entry> entry : headers.map().entrySet()) { + String name = entry.getKey(); + List values = entry.getValue(); + + FieldRule fieldRule = fields.get(name); + if (fieldRule == null) { + if (forbidUnknownFields) { + violations.add("Unknown field: " + name); + } + continue; + } + + if (!fieldRule.repeatable && values.size() > 1) { + violations.add("Field must not be repeated: " + name); + } + + if (recordType != null && fieldRule.forbiddenOn.contains(recordType)) { + violations.add("Field not allowed on " + recordType + " record: " + name); + } + + if (fieldRule.pattern != null) { + for (String value : values) { + if (!fieldRule.pattern.matcher(value).matches()) { + violations.add("Field has invalid value: " + value); + } + } + } + } + + // Check for fields mandatory on all records + Set names = headers.map().keySet(); + for (String field : mandatoryFields) { + if (!names.contains(field)) { + violations.add("Missing mandatory field: " + field); + } + } + + // Check for fields mandatory on specific record types + if (recordType != null) { + for (String name : mandatoryFieldsByRecordType.getOrDefault(recordType, Collections.emptySet())) { + if (!names.contains(name)) { + violations.add("Missing mandatory field for " + recordType + " record: " + name); + } + } + } + + return violations; + } + +} diff --git a/src/org/netpreserve/jwarc/MessageHeaders.java b/src/org/netpreserve/jwarc/MessageHeaders.java index 9c2dc52..fc29d0e 100644 --- a/src/org/netpreserve/jwarc/MessageHeaders.java +++ b/src/org/netpreserve/jwarc/MessageHeaders.java @@ -9,10 +9,7 @@ import java.nio.ByteBuffer; import java.nio.channels.ReadableByteChannel; import java.nio.charset.StandardCharsets; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; +import java.util.*; import java.util.regex.Pattern; import static java.util.Collections.emptyList; @@ -21,6 +18,17 @@ public class MessageHeaders { private static Pattern COMMA_SEPARATOR = Pattern.compile("[ \t]*,[ \t]*"); private Map> map; + public static MessageHeaders of(String... keysAndValues) { + if (keysAndValues.length % 2 != 0) { + throw new IllegalArgumentException("an even number keysAndValues must be provided"); + } + Map> map = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + for (int i = 0; i < keysAndValues.length; i += 2) { + map.computeIfAbsent(keysAndValues[i], k -> new ArrayList<>()).add(keysAndValues[i + 1]); + } + return new MessageHeaders(map); + } + MessageHeaders(Map> map) { map.replaceAll((name, values) -> Collections.unmodifiableList(values)); this.map = Collections.unmodifiableMap(map); diff --git a/src/org/netpreserve/jwarc/tools/ValidateTool.java b/src/org/netpreserve/jwarc/tools/ValidateTool.java index a569948..873c55f 100644 --- a/src/org/netpreserve/jwarc/tools/ValidateTool.java +++ b/src/org/netpreserve/jwarc/tools/ValidateTool.java @@ -13,6 +13,7 @@ import java.security.DigestException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; @@ -66,6 +67,7 @@ public NonVerboseLogger() { private Logger logger; private boolean verbose; + private HeaderValidator headerValidator; public ValidateTool(boolean verbose) { this.verbose = verbose; @@ -178,6 +180,12 @@ private boolean validate(WarcReader reader) throws IOException { while (record != null) { boolean valid = true; + if (headerValidator != null) { + List headerViolations = headerValidator.validate(record.headers()); + headerViolations.forEach(logger::error); + valid &= headerViolations.isEmpty(); + } + if (record instanceof WarcCaptureRecord) { try { valid = validateCapture(record); @@ -251,6 +259,8 @@ private static void usage(int exitValue) { System.err.println(""); System.err.println("Options:"); System.err.println(""); + System.err.println(" --no-header-validation\tskips checking headers against WARC standard rules"); + System.err.println(" --forbid-extensions\tdisallows non-standard WARC header fields and values"); System.err.println(" -h / --help\tshow usage message and exit"); System.err.println(" -v / --verbose\tlog information about every WARC record to stdout"); System.err.println(""); @@ -263,10 +273,18 @@ private static void usage(int exitValue) { public static void main(String[] args) throws IOException { int res = 0; boolean verbose = false; + boolean headerValidation = true; + boolean forbidExtensions = false; if (args.length == 0) usage(0); for (String arg : args) { switch (arg) { + case "--no-header-validation": + headerValidation = false; + break; + case "--forbid-extensions": + forbidExtensions = true; + break; case "-h": case "--help": usage(0); @@ -277,6 +295,9 @@ public static void main(String[] args) throws IOException { break; default: ValidateTool validator = new ValidateTool(verbose); + if (headerValidation) { + validator.headerValidator = HeaderValidator.warc_1_1(forbidExtensions); + } try (WarcReader reader = new WarcReader(Paths.get(arg))) { reader.calculateBlockDigest(); if (verbose) diff --git a/test/org/netpreserve/jwarc/HeaderValidatorTest.java b/test/org/netpreserve/jwarc/HeaderValidatorTest.java new file mode 100644 index 0000000..d3a083f --- /dev/null +++ b/test/org/netpreserve/jwarc/HeaderValidatorTest.java @@ -0,0 +1,81 @@ +package org.netpreserve.jwarc; + +import org.junit.Test; + +import java.util.*; + +import static org.junit.Assert.*; + +public class HeaderValidatorTest { + private HeaderValidator headerValidator = HeaderValidator.warc_1_1(); + + @Test + public void testValid() { + MessageHeaders headers = MessageHeaders.of( + "WARC-Record-ID", "", + "Content-Length", "123456", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Type", "response", + "WARC-Target-URI", "http://example.com/", + "Content-Type", "application/http; msgtype=response", + "WARC-Concurrent-To", "", + "WARC-Concurrent-To", "" + ); + assertEquals(Collections.emptyList(), headerValidator.validate(headers)); + } + + @Test + public void testMissingMandatoryFields() { + MessageHeaders headers = MessageHeaders.of( + "Content-Length", "123456", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Type", "response" + ); + List validationErrors = headerValidator.validate(headers); + assertFalse(validationErrors.isEmpty()); + assertTrue(validationErrors.contains("Missing mandatory field: WARC-Record-ID")); + } + + @Test + public void testInvalidPatternValidation() { + MessageHeaders headers = MessageHeaders.of( + "WARC-Record-ID", "", + "Content-Length", "123456", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Type", "response", + "Content-Type", "invalid_content_type" + ); + List validationErrors = headerValidator.validate(headers); + assertFalse(validationErrors.isEmpty()); + assertTrue(validationErrors.contains("Field has invalid value: invalid_content_type")); + } + + @Test + public void testNonRepeatableField() { + MessageHeaders headers = MessageHeaders.of( + "WARC-Record-ID", "", + "Content-Length", "123456", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Type", "response", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Date", "2020-01-02T00:00:00Z" + ); + List validationErrors = headerValidator.validate(headers); + assertFalse(validationErrors.isEmpty()); + assertTrue(validationErrors.contains("Field must not be repeated: WARC-Date")); + } + + @Test + public void testForbiddenFieldsOnRecordType() { + MessageHeaders headers = MessageHeaders.of( + "WARC-Record-ID", "", + "Content-Length", "123456", + "WARC-Date", "2020-01-01T00:00:00Z", + "WARC-Type", "response", + "WARC-Filename", "test.warc.gz" + ); + List validationErrors = headerValidator.validate(headers); + assertFalse(validationErrors.isEmpty()); + assertTrue(validationErrors.contains("Field not allowed on response record: WARC-Filename")); + } +} \ No newline at end of file