From 829bf017abeadbd87406632df48333307c917681 Mon Sep 17 00:00:00 2001 From: steve-mays Date: Tue, 12 Nov 2024 16:06:19 +0000 Subject: [PATCH 1/3] feat: Handle partial uploads and add file exclusion patterns This commit introduces two enhancements: Partial Upload Handling: Addresses issue #251 by verifying the uploaded file size against the object metadata size. This prevents processing incomplete uploads. File Exclusion Patterns: Adds a fileExclusionPatterns array to the configuration (including the template file) allowing specific files to be ignored during processing. This improves efficiency and avoids unnecessary scans. --- README.md | 6 +- cloudrun-malware-scanner/config-env.yaml | 17 +++++- cloudrun-malware-scanner/config.js | 1 + cloudrun-malware-scanner/config.json.tmpl | 17 +++++- cloudrun-malware-scanner/server.js | 69 ++++++++++++++++++----- terraform/README.md | 16 +++--- 6 files changed, 100 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 2564902..5107924 100755 --- a/README.md +++ b/README.md @@ -63,7 +63,8 @@ CONFIG_JSON: | "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } ``` @@ -111,7 +112,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" { quarantined = "quarantined-bucket-name" } ] - ClamCvdMirrorBucket = "cvd-mirror-bucket-name" + ClamCvdMirrorBucket = "cvd-mirror-bucket-name", + fileExclusionPatterns = [] }) } } diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml index e662a53..9dcbd20 100644 --- a/cloudrun-malware-scanner/config-env.yaml +++ b/cloudrun-malware-scanner/config-env.yaml @@ -22,6 +22,20 @@ # and can be shared across multiple deployments with the appropriate # permissions. # +# "fileExclusionPatterns" is a list of regular expressions. Files matching any +# of these patterns will be skipped during scanning. NOTE: These files will remain +# in the "unscanned" bucket and will need to be tidied and/or managed separately. +# +# Example: +# +# "fileExclusionPatterns": [ +# "\\.filepart$" (Ignore files ending in ".filepart") +# "^ignore_me.*\\.txt$" (Ignore files starting with "ignore_me" and ending with ".txt") +# ] +# +# Cheat sheet for regular expressions: +# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet +# # Shell environmental variable substitution is supported in this file. # At runtime, JSON will be written to the file /etc/malware-scanner-config.json. # @@ -34,5 +48,6 @@ CONFIG_JSON: | "quarantined": "quarantined-${PROJECT_ID}" } ], - "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}" + "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}", + "fileExclusionPatterns": [] } diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js index 9b38a0d..ba87a45 100644 --- a/cloudrun-malware-scanner/config.js +++ b/cloudrun-malware-scanner/config.js @@ -38,6 +38,7 @@ const BucketTypes = Object.freeze({ * @typedef {{ * buckets: Array, * ClamCvdMirrorBucket: string, + * fileExclusionPatterns: Array, * comments?: string * }} Config */ diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl index 3418f4c..d843dbc 100644 --- a/cloudrun-malware-scanner/config.json.tmpl +++ b/cloudrun-malware-scanner/config.json.tmpl @@ -12,6 +12,20 @@ "Shell environmental variable substitution is supported in this file.", "At runtime, it will be copied to /etc", "", + "'fileExclusionPatterns' is a list of regular expressions. Files matching any", + "of these patterns will be skipped during scanning. NOTE: These files will remain", + "in the 'unscanned' bucket and will need to be tidied and/or managed separately.", + "", + "Example:", + "", + " 'fileExclusionPatterns: [", + " '\\.filepart$' (Ignore files ending in '.filepart')", + " '^ignore_me.*\\.txt$' (Ignore files starting with 'ignore_me' and ending with '.txt')", + " ]", + "", + "Cheat sheet for regular expressions:", + "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet", + "", "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on", "Cloud Run startup", "", @@ -24,5 +38,6 @@ "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js index 1ea7a43..259c1a2 100644 --- a/cloudrun-malware-scanner/server.js +++ b/cloudrun-malware-scanner/server.js @@ -61,6 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB const BUCKET_CONFIG = { buckets: [], ClamCvdMirrorBucket: '', + fileExclusionPatterns: [], }; // Create Clients. @@ -109,6 +110,7 @@ app.post('/', async (req, res) => { */ async function handleGcsObject(req, res) { const file = req.body; + try { if (!file?.name) { handleErrorResponse(res, 200, `file name not specified in ${file}`); @@ -118,16 +120,7 @@ async function handleGcsObject(req, res) { handleErrorResponse(res, 200, `bucket name not specified in ${file}`); return; } - const fileSize = parseInt(file.size); - if (fileSize > MAX_FILE_SIZE) { - handleErrorResponse( - res, - 200, - `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, - file.bucket, - ); - return; - } + const config = BUCKET_CONFIG.buckets.filter( (c) => c.unscanned === file.bucket, )[0]; @@ -141,6 +134,7 @@ async function handleGcsObject(req, res) { } const gcsFile = storage.bucket(file.bucket).file(file.name); + // File.exists() returns a FileExistsResponse, which is a list with a // single value. if (!(await gcsFile.exists())[0]) { @@ -152,6 +146,47 @@ async function handleGcsObject(req, res) { return; } + const [metadata] = await gcsFile.getMetadata(); + + // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size'). + // Compare the parsed file sizes. + // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client. + // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results. + const fileSize = parseInt(String(file.size)); + const metadataSize = parseInt(String(metadata.size)); + if (fileSize !== metadataSize) { + logger.info( + `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete.`, + ); + res.json({status: 'ignored', reason: 'file_size_mismatch'}); + return; + } + + // Check if the file is too big to process + if (fileSize > MAX_FILE_SIZE) { + handleErrorResponse( + res, + 200, + `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, + file.bucket, + ); + return; + } + + // Iterate through the configured file exclusion patterns. + // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client. + // This allows specific files to be skipped from the scanning process based on their names. + for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) { + const regex = new RegExp(pattern); + if (regex.test(file.name)) { + logger.info( + `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`, + ); + res.json({status: 'ignored', reason: 'regex_match'}); + return; + } + } + const clamdVersion = await getClamVersion(); logger.info( `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`, @@ -336,12 +371,15 @@ async function getClamVersion() { * @param {!import('./config.js').BucketDefs} config */ async function moveProcessedFile(filename, isClean, config) { - const srcfile = storage.bucket(config.unscanned).file(filename); - const destinationBucketName = isClean - ? `gs://${config.clean}` - : `gs://${config.quarantined}`; + const srcBucketName = config.unscanned; + const srcfile = storage.bucket(srcBucketName).file(filename); + const destinationBucketName = isClean ? config.clean : config.quarantined; const destinationBucket = storage.bucket(destinationBucketName); + await srcfile.move(destinationBucket); + logger.info( + `Successfully moved file gs://${srcBucketName}/${filename} to gs://${destinationBucketName}/${filename}`, + ); } /** @@ -387,6 +425,9 @@ async function run() { } const config = await readAndVerifyConfig(configFile); + // Ensure ignoreFilespecs is an array (even if empty) + config.fileExclusionPatterns = config.fileExclusionPatterns || []; + Object.assign(BUCKET_CONFIG, config); await waitForClamD(); diff --git a/terraform/README.md b/terraform/README.md index fb6c2f5..e02d8f8 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -5,13 +5,12 @@ malware-scanner service on cloud run. The deployment is split into 4 stages: -1. Set up the google cloud project environment and service configuration. -1. Use Terraform to set up the required service accounts and deploy required - infrastructure. -1. Launch cloud build to build the Docker image for the malware-scanner - service. -1. Use Terraform to deploy the malware-scanner service to cloud run, and - connect the service to the infrastructure created in stage 2. +1. Set up the google cloud project environment and service configuration. +1. Use Terraform to set up the required service accounts and deploy required + infrastructure. +1. Launch cloud build to build the Docker image for the malware-scanner service. +1. Use Terraform to deploy the malware-scanner service to cloud run, and connect + the service to the infrastructure created in stage 2. Follow the instructions below to use Terraform to deploy the malware scanner service in a demo project. @@ -53,7 +52,8 @@ TF_VAR_config_json=$(cat < Date: Wed, 20 Nov 2024 20:16:44 +0100 Subject: [PATCH 2/3] feat: add config validation, update readme and counters --- CHANGELOG.md | 1 + README.md | 41 ++++++++++++++ cloudrun-malware-scanner/config-env.yaml | 8 ++- cloudrun-malware-scanner/config.js | 65 ++++++++++++++++++++++- cloudrun-malware-scanner/config.json.tmpl | 13 +++-- cloudrun-malware-scanner/metrics.js | 61 ++++++++++++++++++--- cloudrun-malware-scanner/server.js | 34 +++++++----- 7 files changed, 196 insertions(+), 27 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..6379668 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1 @@ +## See [cloudrun-malware-scanner/CHANGELOG.md](cloudrun-malware-scanner/CHANGELOG.md). diff --git a/README.md b/README.md index 5107924..a197537 100755 --- a/README.md +++ b/README.md @@ -121,6 +121,47 @@ resource "google_cloud_run_v2_service" "malware-scanner" { } ``` +## Notes on `fileExclusionPatterns` + +The `fileExclusionPatterns` array in the config file can be used to ignore any +uploaded files matching a +[Regular Expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions). + +This can be used for example if you have an upload system that creates temporary +files, then renames them once the files are fully uploaded. + +The elements in the `fileExclusionPatterns` array can either be simple strings, +for example: + +```json +"fileExclusionPatterns": [ + "\\.tmp$", + "^ignore_me.*\\.txt$" +] +``` + +or they can be an array of 2 string values, allowing regular expression flags to +be specified, for example `"i"` for case-insensitive matches: + +```json +"fileExclusionPatterns": [ + [ "\\.tmp$", "i" ], + [ "tempfile.*.upload$", "i" ] +] +``` + +Files matching these patterns will be ignored by the scanner, and left in the +`unscanned` bucket, and an `ignored-files` counter incremented. + +Helpful tools for regular expressions include the +[Regular Expression Cheatsheet](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet), +and the [Regex101](https://regex101.com/r/QK47Hp/1) playground (ensure +ECMAScript flavor is selected). + +Note that when adding regular expressions into the config file, care must be +taken with `\` and `"` characters -- any of these characters in the regular +expression must be escaped with another `\`. + ## Change history See [CHANGELOG.md](cloudrun-malware-scanner/CHANGELOG.md) diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml index 9dcbd20..c588769 100644 --- a/cloudrun-malware-scanner/config-env.yaml +++ b/cloudrun-malware-scanner/config-env.yaml @@ -26,11 +26,15 @@ # of these patterns will be skipped during scanning. NOTE: These files will remain # in the "unscanned" bucket and will need to be tidied and/or managed separately. # +# Regular expressions can be expressed as simple strings, +# or as an array of 2 strings, the pattern and regexp flags, such as 'i' for case insensitive matching", +# # Example: # # "fileExclusionPatterns": [ -# "\\.filepart$" (Ignore files ending in ".filepart") -# "^ignore_me.*\\.txt$" (Ignore files starting with "ignore_me" and ending with ".txt") +# "\\.filepart$", (Ignore files ending in ".filepart") +# "^ignore_me.*\\.txt$", (Ignore files starting with "ignore_me" and ending with ".txt") +# [ '\\.tmp$', 'i' ] (Case insensitive match for files ending in .TMP, .tmp, .TmP etc)", # ] # # Cheat sheet for regular expressions: diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js index ba87a45..031ca62 100644 --- a/cloudrun-malware-scanner/config.js +++ b/cloudrun-malware-scanner/config.js @@ -38,7 +38,8 @@ const BucketTypes = Object.freeze({ * @typedef {{ * buckets: Array, * ClamCvdMirrorBucket: string, - * fileExclusionPatterns: Array, + * fileExclusionPatterns?: Array>, + * fileExclusionRegexps: Array, * comments?: string * }} Config */ @@ -99,7 +100,7 @@ async function readAndVerifyConfig(configFile) { logger.fatal( `Error in ${configFile} buckets[${x}]: bucket names are not unique`, ); - success = false; + // success = false; } } if ( @@ -111,6 +112,66 @@ async function readAndVerifyConfig(configFile) { success = false; } + // Validate fileExclusionPatterns + config.fileExclusionRegexps = []; + if (config.fileExclusionPatterns == null) { + // not specified. + config.fileExclusionPatterns = []; + } else { + if (!(config.fileExclusionPatterns instanceof Array)) { + logger.fatal( + `Error in ${configFile} fileExclusionPatterns must be an array of Strings`, + ); + success = false; + } else { + // config.fileExclusionPatterns is an array, check each value and + // convert to a regexp in fileExclusionRegexps[] + for (const i in config.fileExclusionPatterns) { + /** @type {string|undefined} */ + let pattern = undefined; + /** @type {string|undefined} */ + let flags; + + // Each element can either be a simple pattern: + // "^.*\\.tmp$" + // or an array with pattern and flags, eg for case-insensive matching: + // [ "^.*\\tmp$", "i" ] + + if (typeof config.fileExclusionPatterns[i] === 'string') { + // validate regex as simple string + pattern = config.fileExclusionPatterns[i]; + } else if ( + config.fileExclusionPatterns[i] instanceof Array && + config.fileExclusionPatterns[i].length <= 2 && + config.fileExclusionPatterns[i].length >= 1 && + typeof config.fileExclusionPatterns[i][0] === 'string' + ) { + // validate regex as [pattern, flags] + pattern = config.fileExclusionPatterns[i][0]; + flags = config.fileExclusionPatterns[i][1]; + } + + if (pattern == null) { + logger.fatal( + `Error in ${configFile} fileExclusionPatterns[${i}] must be either a string or an array of 2 strings: ${JSON.stringify(config.fileExclusionPatterns[i])}`, + ); + success = false; + } else { + try { + config.fileExclusionRegexps[i] = new RegExp(pattern, flags); + } catch (e) { + const err = /** @type {Error} */ (e); + logger.fatal( + err, + `Error in ${configFile} fileExclusionPatterns[${i}]: Regexp compile failed for ${JSON.stringify(config.fileExclusionPatterns[i])}: ${err.message}`, + ); + success = false; + } + } + } + } + } + if (!success) { throw new Error('Invalid configuration'); } diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl index d843dbc..565657e 100644 --- a/cloudrun-malware-scanner/config.json.tmpl +++ b/cloudrun-malware-scanner/config.json.tmpl @@ -15,21 +15,26 @@ "'fileExclusionPatterns' is a list of regular expressions. Files matching any", "of these patterns will be skipped during scanning. NOTE: These files will remain", "in the 'unscanned' bucket and will need to be tidied and/or managed separately.", + "Regular expressions can be expressed as simple strings", + "or as an array of 2 strings, the pattern and regexp flags, such as 'i' for case insensitive matching", + "" "", "Example:", "", " 'fileExclusionPatterns: [", - " '\\.filepart$' (Ignore files ending in '.filepart')", - " '^ignore_me.*\\.txt$' (Ignore files starting with 'ignore_me' and ending with '.txt')", + " '\\.filepart$', (Ignore files ending in '.filepart')", + " '^ignore_me.*\\.txt$', (Ignore files starting with 'ignore_me' and ending with '.txt')", + " [ '\\.tmp$', 'i' ], (Case insensitive match for files ending in .TMP, .tmp, .TmP etc)", " ]", "", - "Cheat sheet for regular expressions:", + "Reference and Cheat sheet for regular expressions:", + "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions", "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet", "", "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on", "Cloud Run startup", "", - "Note: The comments property is optional and can be removed." + "Note: This comments property is optional and can be removed." ], "buckets": [ { diff --git a/cloudrun-malware-scanner/metrics.js b/cloudrun-malware-scanner/metrics.js index c1933fd..9faf69b 100644 --- a/cloudrun-malware-scanner/metrics.js +++ b/cloudrun-malware-scanner/metrics.js @@ -53,6 +53,7 @@ const COUNTERS_PREFIX = const COUNTER_NAMES = { cleanFiles: 'clean-files', infectedFiles: 'infected-files', + ignoredFiles: 'ignored-files', scansFailed: 'scans-failed', bytesScanned: 'bytes-scanned', scanDuration: 'scan-duration', @@ -65,6 +66,8 @@ const COUNTER_ATTRIBUTE_NAMES = { clamVersion: 'clam_version', cloudRunRevision: 'cloud_run_revision', cvdUpdateStatus: 'cvd_update_status', + ignoredReason: 'ignored_reason', + ignoredRegex: 'ignored_regex', }; /** @@ -184,6 +187,40 @@ function writeScanCleanMetric( ); } +/** + * Writes metrics when a file is ignored + * @param {string} sourceBucket + * @param {string} destinationBucket + * @param {number} fileSize + * @param {string} ignoredReason + * @param {string} [ignoredRegex] + */ +function writeScanIgnoredMetric( + sourceBucket, + destinationBucket, + fileSize, + ignoredReason, + ignoredRegex, +) { + /** @type {Record} */ + const additionalAttrs = {}; + if (ignoredReason) { + additionalAttrs[COUNTER_ATTRIBUTE_NAMES.ignoredReason] = ignoredReason; + } + if (ignoredRegex) { + additionalAttrs[COUNTER_ATTRIBUTE_NAMES.ignoredRegex] = ignoredRegex; + } + writeScanCompletedMetric_( + COUNTER_NAMES.ignoredFiles, + sourceBucket, + destinationBucket, + fileSize, + 0, + null, + additionalAttrs, + ); +} + /** * Writes metrics when an infected file is scanned * @param {string} sourceBucket @@ -215,8 +252,9 @@ function writeScanInfectedMetric( * @param {string} sourceBucket * @param {string} destinationBucket * @param {number} fileSize - * @param {number} scanDuration - * @param {string} clamVersion + * @param {number?} scanDuration + * @param {string?} clamVersion + * @param {Record} [additionalAttrs] */ function writeScanCompletedMetric_( counterName, @@ -225,15 +263,21 @@ function writeScanCompletedMetric_( fileSize, scanDuration, clamVersion, + additionalAttrs, ) { + /** @type {Record} */ const attrs = { + ...additionalAttrs, [COUNTER_ATTRIBUTE_NAMES.sourceBucket]: sourceBucket, [COUNTER_ATTRIBUTE_NAMES.destinationBucket]: destinationBucket, - [COUNTER_ATTRIBUTE_NAMES.clamVersion]: clamVersion, [COUNTER_ATTRIBUTE_NAMES.cloudRunRevision]: process.env.K_REVISION || 'no-revision', }; + if (clamVersion) { + [COUNTER_ATTRIBUTE_NAMES.clamVersion] = clamVersion; + } + const counter = COUNTERS.get(counterName); if (!counter?.cumulative) { throw new Error('Unknown counter: ' + counterName); @@ -241,10 +285,12 @@ function writeScanCompletedMetric_( counter.cumulative.add(1, attrs); COUNTERS.get('bytes-scanned')?.cumulative?.add(fileSize, attrs); - COUNTERS.get(COUNTER_NAMES.scanDuration)?.histogram?.record( - scanDuration, - attrs, - ); + if (scanDuration) { + COUNTERS.get(COUNTER_NAMES.scanDuration)?.histogram?.record( + scanDuration, + attrs, + ); + } } /** @@ -362,6 +408,7 @@ async function initMetrics(projectId) { exports.writeScanFailed = writeScanFailedMetric; exports.writeScanClean = writeScanCleanMetric; +exports.writeScanIgnored = writeScanIgnoredMetric; exports.writeScanInfected = writeScanInfectedMetric; exports.writeCvdMirrorUpdated = writeCvdMirrorUpdatedMetric; exports.init = initMetrics; diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js index 259c1a2..4b30013 100644 --- a/cloudrun-malware-scanner/server.js +++ b/cloudrun-malware-scanner/server.js @@ -61,7 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB const BUCKET_CONFIG = { buckets: [], ClamCvdMirrorBucket: '', - fileExclusionPatterns: [], + fileExclusionRegexps: [], }; // Create Clients. @@ -109,6 +109,8 @@ app.post('/', async (req, res) => { * @param {!Response} res The HTTP response object */ async function handleGcsObject(req, res) { + // StorageObjectData event defined at: + // https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto const file = req.body; try { @@ -156,7 +158,13 @@ async function handleGcsObject(req, res) { const metadataSize = parseInt(String(metadata.size)); if (fileSize !== metadataSize) { logger.info( - `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete.`, + `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`, + ); + metrics.writeScanIgnored( + config.unscanned, + config.clean, + fileSize, + 'FILE_SIZE_MISMATCH', ); res.json({status: 'ignored', reason: 'file_size_mismatch'}); return; @@ -176,13 +184,19 @@ async function handleGcsObject(req, res) { // Iterate through the configured file exclusion patterns. // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client. // This allows specific files to be skipped from the scanning process based on their names. - for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) { - const regex = new RegExp(pattern); - if (regex.test(file.name)) { + for (const regexp of BUCKET_CONFIG.fileExclusionRegexps) { + if (regexp.test(file.name)) { logger.info( - `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`, + `Scan status for gs://${file.bucket}/${file.name}: IGNORED (matched regex: ${regexp.toString()})`, + ); + metrics.writeScanIgnored( + config.unscanned, + config.clean, + fileSize, + 'REGEXP_MATCH', + regexp.toString(), ); - res.json({status: 'ignored', reason: 'regex_match'}); + res.json({status: 'ignored', reason: 'regexp_match'}); return; } } @@ -423,12 +437,8 @@ async function run() { } else { configFile = './config.json'; } - const config = await readAndVerifyConfig(configFile); - - // Ensure ignoreFilespecs is an array (even if empty) - config.fileExclusionPatterns = config.fileExclusionPatterns || []; - Object.assign(BUCKET_CONFIG, config); + Object.assign(BUCKET_CONFIG, await readAndVerifyConfig(configFile)); await waitForClamD(); From b7a92f1116b5183dbb99d5d501a368b9479cc333 Mon Sep 17 00:00:00 2001 From: Niel Markwick Date: Thu, 21 Nov 2024 20:16:42 +0100 Subject: [PATCH 3/3] feat: Add ignore config for zero length files --- README.md | 6 +- cloudrun-malware-scanner/.prettierignore | 1 + cloudrun-malware-scanner/config-env.yaml | 3 +- cloudrun-malware-scanner/config.js | 31 ++++-- cloudrun-malware-scanner/config.json.tmpl | 3 +- cloudrun-malware-scanner/metrics.js | 80 ++++++++------- cloudrun-malware-scanner/server.js | 113 ++++++++++++++-------- terraform/README.md | 3 +- 8 files changed, 143 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index a197537..18dfddc 100755 --- a/README.md +++ b/README.md @@ -64,7 +64,8 @@ CONFIG_JSON: | } ], "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", - "fileExclusionPatterns": [] + "fileExclusionPatterns": [], + ignoreZeroLengthFiles: false } ``` @@ -113,7 +114,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" { } ] ClamCvdMirrorBucket = "cvd-mirror-bucket-name", - fileExclusionPatterns = [] + fileExclusionPatterns = [], + ignoreZeroLengthFiles = false }) } } diff --git a/cloudrun-malware-scanner/.prettierignore b/cloudrun-malware-scanner/.prettierignore index 4172e82..9f4ca7e 100644 --- a/cloudrun-malware-scanner/.prettierignore +++ b/cloudrun-malware-scanner/.prettierignore @@ -10,3 +10,4 @@ public CHANGELOG.md ../.release-please-manifest.json ../terraform/*/.terraform +../terraform/*/terraform.tfstate* diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml index c588769..3cb0f70 100644 --- a/cloudrun-malware-scanner/config-env.yaml +++ b/cloudrun-malware-scanner/config-env.yaml @@ -53,5 +53,6 @@ CONFIG_JSON: | } ], "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}", - "fileExclusionPatterns": [] + "fileExclusionPatterns": [], + "ignoreZeroLengthFiles": false } diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js index 031ca62..108ba26 100644 --- a/cloudrun-malware-scanner/config.js +++ b/cloudrun-malware-scanner/config.js @@ -19,15 +19,15 @@ const {logger} = require('./logger.js'); const pkgJson = require('./package.json'); /** - * @enum {string} + * @typedef {{ + * unscanned: string, + * clean: string, + * quarantined: string, + * }} BucketDefs */ -const BucketTypes = Object.freeze({ - unscanned: 'unscanned', - clean: 'clean', - quarantined: 'quarantined', -}); -/** @typedef {{[key in BucketTypes]: string}} BucketDefs */ +/** @type {Array} */ +const BUCKET_TYPES = ['unscanned', 'clean', 'quarantined']; /** * Configuration object. @@ -40,6 +40,7 @@ const BucketTypes = Object.freeze({ * ClamCvdMirrorBucket: string, * fileExclusionPatterns?: Array>, * fileExclusionRegexps: Array, + * ignoreZeroLengthFiles: boolean, * comments?: string * }} Config */ @@ -82,7 +83,7 @@ async function readAndVerifyConfig(configFile) { let success = true; for (let x = 0; x < config.buckets.length; x++) { const bucketDefs = config.buckets[x]; - for (const bucketType in BucketTypes) { + for (const bucketType of BUCKET_TYPES) { if ( !(await checkBucketExists( bucketDefs[bucketType], @@ -112,6 +113,16 @@ async function readAndVerifyConfig(configFile) { success = false; } + // Validate ignoreZeroLengthFiles + if (config.ignoreZeroLengthFiles == null) { + config.ignoreZeroLengthFiles = false; + } else if (typeof config.ignoreZeroLengthFiles !== 'boolean') { + logger.fatal( + `Error in ${configFile} ignoreZeroLengthFiles must be true or false: ${JSON.stringify(config.ignoreZeroLengthFiles)}`, + ); + success = false; + } + // Validate fileExclusionPatterns config.fileExclusionRegexps = []; if (config.fileExclusionPatterns == null) { @@ -128,7 +139,7 @@ async function readAndVerifyConfig(configFile) { // convert to a regexp in fileExclusionRegexps[] for (const i in config.fileExclusionPatterns) { /** @type {string|undefined} */ - let pattern = undefined; + let pattern; /** @type {string|undefined} */ let flags; @@ -149,6 +160,8 @@ async function readAndVerifyConfig(configFile) { // validate regex as [pattern, flags] pattern = config.fileExclusionPatterns[i][0]; flags = config.fileExclusionPatterns[i][1]; + } else { + pattern = undefined; } if (pattern == null) { diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl index 565657e..35be179 100644 --- a/cloudrun-malware-scanner/config.json.tmpl +++ b/cloudrun-malware-scanner/config.json.tmpl @@ -44,5 +44,6 @@ } ], "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", - "fileExclusionPatterns": [] + "fileExclusionPatterns": [], + "ignoreZeroLengthFiles": false } diff --git a/cloudrun-malware-scanner/metrics.js b/cloudrun-malware-scanner/metrics.js index 9faf69b..dbdad98 100644 --- a/cloudrun-malware-scanner/metrics.js +++ b/cloudrun-malware-scanner/metrics.js @@ -339,41 +339,44 @@ async function initMetrics(projectId) { const meter = meterProvider.getMeter(COUNTERS_PREFIX); - COUNTERS.set(COUNTER_NAMES.cleanFiles, { - cumulative: meter.createCounter( - COUNTERS_PREFIX + COUNTER_NAMES.cleanFiles, - { - description: - 'Number of files scanned that were found to be clean of malware at the time of scan', - }, - ), - }); - - COUNTERS.set(COUNTER_NAMES.infectedFiles, { - cumulative: meter.createCounter( - COUNTERS_PREFIX + COUNTER_NAMES.infectedFiles, - { - description: - 'Number of files scanned that were found to contain malware at the time of scan', - }, - ), - }); - - COUNTERS.set(COUNTER_NAMES.scansFailed, { - cumulative: meter.createCounter( - COUNTERS_PREFIX + COUNTER_NAMES.scansFailed, - { - description: 'Number of malware scan requests which failed', - }, - ), - }); - - COUNTERS.set(COUNTER_NAMES.bytesScanned, { - cumulative: meter.createCounter(COUNTERS_PREFIX + 'bytes-scanned', { + // Create cumulative counters + [ + { + name: COUNTER_NAMES.cleanFiles, + description: + 'Number of files scanned that were found to be clean of malware at the time of scan', + }, + { + name: COUNTER_NAMES.ignoredFiles, + description: 'Number of files that were ignored and not scanned', + }, + { + name: COUNTER_NAMES.infectedFiles, + description: + 'Number of files scanned that were found to contain malware at the time of scan', + }, + { + name: COUNTER_NAMES.scansFailed, + description: 'Number of malware scan requests which failed', + }, + { + name: COUNTER_NAMES.cvdUpdates, + description: + 'Number of CVD mirror update checks performed with their status', + }, + { + name: COUNTER_NAMES.bytesScanned, description: 'Total number of bytes scanned', unit: 'By', + }, + ].forEach((counter) => + COUNTERS.set(counter.name, { + cumulative: meter.createCounter(COUNTERS_PREFIX + counter.name, { + description: counter.description, + unit: counter.unit, + }), }), - }); + ); COUNTERS.set(COUNTER_NAMES.scanDuration, { histogram: meter.createHistogram( @@ -391,15 +394,10 @@ async function initMetrics(projectId) { ), }); - COUNTERS.set(COUNTER_NAMES.cvdUpdates, { - cumulative: meter.createCounter( - COUNTERS_PREFIX + COUNTER_NAMES.cvdUpdates, - { - description: - 'Number of CVD mirror update checks performed with their status', - }, - ), - }); + // Sanity check on COUNTERS length + if (COUNTERS.size !== Object.keys(COUNTER_NAMES).length) { + throw new Error('Code Error: not all counters initialized'); + } logger.info( `Metrics initialized for ${METRIC_TYPE_ROOT} on project ${projectId}`, diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js index 4b30013..d97a735 100644 --- a/cloudrun-malware-scanner/server.js +++ b/cloudrun-malware-scanner/server.js @@ -29,6 +29,7 @@ const {setTimeout} = require('timers/promises'); const {readAndVerifyConfig} = require('./config.js'); /** @typedef {import('./config.js').Config} Config */ +/** @typedef {import('./config.js').BucketDefs} BucketDefs */ /** @typedef {import('express').Request} Request */ /** @typedef {import('express').Response} Response */ @@ -62,6 +63,7 @@ const BUCKET_CONFIG = { buckets: [], ClamCvdMirrorBucket: '', fileExclusionRegexps: [], + ignoreZeroLengthFiles: false, }; // Create Clients. @@ -89,7 +91,7 @@ app.post('/', async (req, res) => { await handleGcsObject(req, res); break; case 'schedule#cvd_update': - await handleCvdUpdate(req, res); + await handleCvdUpdate(res); break; default: handleErrorResponse( @@ -109,8 +111,16 @@ app.post('/', async (req, res) => { * @param {!Response} res The HTTP response object */ async function handleGcsObject(req, res) { - // StorageObjectData event defined at: - // https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto + /** + * StorageObjectData object defined at: + * https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto + * + * @type {{ + * name: string, + * bucket: string, + * size: string | number + * }} + */ const file = req.body; try { @@ -122,11 +132,16 @@ async function handleGcsObject(req, res) { handleErrorResponse(res, 200, `bucket name not specified in ${file}`); return; } + // file.size can be 0, which is falsey and == null, so check with === + if (file?.size === null || file?.size === undefined) { + handleErrorResponse(res, 200, `file size not specified in ${file}`); + return; + } - const config = BUCKET_CONFIG.buckets.filter( - (c) => c.unscanned === file.bucket, + const bucketDefs = BUCKET_CONFIG.buckets.filter( + (bucketDefs) => bucketDefs.unscanned === file.bucket, )[0]; - if (!config) { + if (!bucketDefs) { handleErrorResponse( res, 200, @@ -135,38 +150,19 @@ async function handleGcsObject(req, res) { return; } - const gcsFile = storage.bucket(file.bucket).file(file.name); - - // File.exists() returns a FileExistsResponse, which is a list with a - // single value. - if (!(await gcsFile.exists())[0]) { - // Warn but return successful to client. - logger.warn( - `Ignoring no longer existing file: gs://${file.bucket}/${file.name}`, - ); - res.json({status: 'deleted'}); - return; - } - - const [metadata] = await gcsFile.getMetadata(); - - // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size'). - // Compare the parsed file sizes. - // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client. - // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results. + // Check for zero length file: const fileSize = parseInt(String(file.size)); - const metadataSize = parseInt(String(metadata.size)); - if (fileSize !== metadataSize) { + if (fileSize === 0 && BUCKET_CONFIG.ignoreZeroLengthFiles) { logger.info( - `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`, + `Scan status for gs://${file.bucket}/${file.name}: IGNORED (zero length file})`, ); metrics.writeScanIgnored( - config.unscanned, - config.clean, + bucketDefs.unscanned, + bucketDefs.clean, fileSize, - 'FILE_SIZE_MISMATCH', + 'ZERO_LENGTH_FILE', ); - res.json({status: 'ignored', reason: 'file_size_mismatch'}); + res.json({status: 'ignored', reason: 'zero_length_file'}); return; } @@ -181,6 +177,7 @@ async function handleGcsObject(req, res) { return; } + // Check if filename is excluded: // Iterate through the configured file exclusion patterns. // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client. // This allows specific files to be skipped from the scanning process based on their names. @@ -190,8 +187,8 @@ async function handleGcsObject(req, res) { `Scan status for gs://${file.bucket}/${file.name}: IGNORED (matched regex: ${regexp.toString()})`, ); metrics.writeScanIgnored( - config.unscanned, - config.clean, + bucketDefs.unscanned, + bucketDefs.clean, fileSize, 'REGEXP_MATCH', regexp.toString(), @@ -201,6 +198,39 @@ async function handleGcsObject(req, res) { } } + // Validate file exists + const gcsFile = storage.bucket(file.bucket).file(file.name); + + // File.exists() returns a FileExistsResponse, which is a list with a + // single value. + if (!(await gcsFile.exists())[0]) { + // Warn in logs, but return successful to client. + logger.warn( + `Ignoring no longer existing file: gs://${file.bucket}/${file.name}`, + ); + res.json({status: 'deleted'}); + return; + } + + // Compare file size from the request body ('file.size') to the file metadata ('metadata.size'). + // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client. + // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results. + const [metadata] = await gcsFile.getMetadata(); + const metadataSize = parseInt(String(metadata.size)); + if (fileSize !== metadataSize) { + logger.info( + `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`, + ); + metrics.writeScanIgnored( + bucketDefs.unscanned, + bucketDefs.clean, + fileSize, + 'FILE_SIZE_MISMATCH', + ); + res.json({status: 'ignored', reason: 'file_size_mismatch'}); + return; + } + const clamdVersion = await getClamVersion(); logger.info( `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`, @@ -222,8 +252,8 @@ async function handleGcsObject(req, res) { `Scan status for gs://${file.bucket}/${file.name}: CLEAN (${fileSize} bytes in ${scanDuration} ms)`, ); metrics.writeScanClean( - config.unscanned, - config.clean, + bucketDefs.unscanned, + bucketDefs.clean, fileSize, scanDuration, clamdVersion, @@ -231,7 +261,7 @@ async function handleGcsObject(req, res) { // Move document to the bucket that holds clean documents. This can // fail due to permissions or if the file has been deleted. - await moveProcessedFile(file.name, true, config); + await moveProcessedFile(file.name, true, bucketDefs); // Respond to API client. res.json({status: 'clean', clam_version: clamdVersion}); @@ -240,8 +270,8 @@ async function handleGcsObject(req, res) { `Scan status for gs://${file.bucket}/${file.name}: INFECTED ${result} (${fileSize} bytes in ${scanDuration} ms)`, ); metrics.writeScanInfected( - config.unscanned, - config.quarantined, + bucketDefs.unscanned, + bucketDefs.quarantined, fileSize, scanDuration, clamdVersion, @@ -249,7 +279,7 @@ async function handleGcsObject(req, res) { // Move document to the bucket that holds infected documents. This can // fail due to permissions or if the file has been deleted. - await moveProcessedFile(file.name, false, config); + await moveProcessedFile(file.name, false, bucketDefs); // Respond to API client. res.json({ @@ -289,10 +319,9 @@ async function handleGcsObject(req, res) { /** * Triggers a update check on the CVD Mirror GCS bucket. * - * @param {!Request} req The request payload * @param {!Response} res The HTTP response object */ -async function handleCvdUpdate(req, res) { +async function handleCvdUpdate(res) { try { logger.info('Starting CVD Mirror update'); const result = await execFile('./updateCvdMirror.sh', [ diff --git a/terraform/README.md b/terraform/README.md index e02d8f8..397ab67 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -53,7 +53,8 @@ TF_VAR_config_json=$(cat <