From 6890d22a74f46372ead01b30f5972a3cb7f3ced7 Mon Sep 17 00:00:00 2001 From: steve-mays Date: Tue, 12 Nov 2024 16:06:19 +0000 Subject: [PATCH] feat: Handle partial uploads and add file exclusion patterns This commit introduces two enhancements: Partial Upload Handling: Addresses issue #251 by verifying the uploaded file size against the object metadata size. This prevents processing incomplete uploads. File Exclusion Patterns: Adds a fileExclusionPatterns array to the configuration (including the template file) allowing specific files to be ignored during processing. This improves efficiency and avoids unnecessary scans. --- README.md | 6 +- cloudrun-malware-scanner/config-env.yaml | 17 +++++- cloudrun-malware-scanner/config.js | 1 + cloudrun-malware-scanner/config.json.tmpl | 18 +++++- cloudrun-malware-scanner/server.js | 69 ++++++++++++++++++----- terraform/README.md | 16 +++--- 6 files changed, 99 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 2564902a..51079249 100755 --- a/README.md +++ b/README.md @@ -63,7 +63,8 @@ CONFIG_JSON: | "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } ``` @@ -111,7 +112,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" { quarantined = "quarantined-bucket-name" } ] - ClamCvdMirrorBucket = "cvd-mirror-bucket-name" + ClamCvdMirrorBucket = "cvd-mirror-bucket-name", + fileExclusionPatterns = [] }) } } diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml index e662a530..9dcbd204 100644 --- a/cloudrun-malware-scanner/config-env.yaml +++ b/cloudrun-malware-scanner/config-env.yaml @@ -22,6 +22,20 @@ # and can be shared across multiple deployments with the appropriate # permissions. # +# "fileExclusionPatterns" is a list of regular expressions. Files matching any +# of these patterns will be skipped during scanning. 
NOTE: These files will remain +# in the "unscanned" bucket and will need to be tidied and/or managed separately. +# +# Example: +# +# "fileExclusionPatterns": [ +# "\\.filepart$" (Ignore files ending in ".filepart") +# "^ignore_me.*\\.txt$" (Ignore files starting with "ignore_me" and ending with ".txt") +# ] +# +# Cheat sheet for regular expressions: +# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet +# # Shell environmental variable substitution is supported in this file. # At runtime, JSON will be written to the file /etc/malware-scanner-config.json. # @@ -34,5 +48,6 @@ CONFIG_JSON: | "quarantined": "quarantined-${PROJECT_ID}" } ], - "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}" + "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}", + "fileExclusionPatterns": [] } diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js index 9b38a0d8..ba87a45c 100644 --- a/cloudrun-malware-scanner/config.js +++ b/cloudrun-malware-scanner/config.js @@ -38,6 +38,7 @@ const BucketTypes = Object.freeze({ * @typedef {{ * buckets: Array, * ClamCvdMirrorBucket: string, + * fileExclusionPatterns: Array, * comments?: string * }} Config */ diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl index 3418f4c1..a28ba338 100644 --- a/cloudrun-malware-scanner/config.json.tmpl +++ b/cloudrun-malware-scanner/config.json.tmpl @@ -9,8 +9,19 @@ "'ClamCvdMirrorBucket' is a GCS bucket used to mirror the clamav database definition files to prevent overloading the Clam servers", "and being rate limited/blacklisted. Its contents are maintained by the updateCvdMirror.sh script", "", - "Shell environmental variable substitution is supported in this file.", - "At runtime, it will be copied to /etc", + "'fileExclusionPatterns' is a list of regular expressions. Files matching any", + "of these patterns will be skipped during scanning. 
NOTE: These files will remain", + "in the 'unscanned' bucket and will need to be tidied and/or managed separately.", + "", + "Example:", + "", + " 'fileExclusionPatterns': [", + " '\\.filepart$' (Ignore files ending in '.filepart')", + " '^ignore_me.*\\.txt$' (Ignore files starting with 'ignore_me' and ending with '.txt')", + " ]", + "", + "Cheat sheet for regular expressions:", + "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet", "", "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on", "Cloud Run startup", @@ -24,5 +35,6 @@ "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js index 1ea7a43f..259c1a2b 100644 --- a/cloudrun-malware-scanner/server.js +++ b/cloudrun-malware-scanner/server.js @@ -61,6 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB const BUCKET_CONFIG = { buckets: [], ClamCvdMirrorBucket: '', + fileExclusionPatterns: [], }; // Create Clients. 
@@ -109,6 +110,7 @@ app.post('/', async (req, res) => { */ async function handleGcsObject(req, res) { const file = req.body; + try { if (!file?.name) { handleErrorResponse(res, 200, `file name not specified in ${file}`); @@ -118,16 +120,7 @@ async function handleGcsObject(req, res) { handleErrorResponse(res, 200, `bucket name not specified in ${file}`); return; } - const fileSize = parseInt(file.size); - if (fileSize > MAX_FILE_SIZE) { - handleErrorResponse( - res, - 200, - `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, - file.bucket, - ); - return; - } + const config = BUCKET_CONFIG.buckets.filter( (c) => c.unscanned === file.bucket, )[0]; @@ -141,6 +134,7 @@ async function handleGcsObject(req, res) { } const gcsFile = storage.bucket(file.bucket).file(file.name); + // File.exists() returns a FileExistsResponse, which is a list with a // single value. if (!(await gcsFile.exists())[0]) { @@ -152,6 +146,47 @@ async function handleGcsObject(req, res) { return; } + const [metadata] = await gcsFile.getMetadata(); + + // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size'). + // Compare the parsed file sizes. + // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return an "ignored" status to the client. + // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results. + const fileSize = parseInt(String(file.size)); + const metadataSize = parseInt(String(metadata.size)); + if (fileSize !== metadataSize) { + logger.info( + `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). 
File upload may not be complete.`, + ); + res.json({status: 'ignored', reason: 'file_size_mismatch'}); + return; + } + + // Check if the file is too big to process + if (fileSize > MAX_FILE_SIZE) { + handleErrorResponse( + res, + 200, + `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, + file.bucket, + ); + return; + } + + // Iterate through the configured file exclusion patterns. + // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client. + // This allows specific files to be skipped from the scanning process based on their names. + for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) { + const regex = new RegExp(pattern); + if (regex.test(file.name)) { + logger.info( + `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`, + ); + res.json({status: 'ignored', reason: 'regex_match'}); + return; + } + } + const clamdVersion = await getClamVersion(); logger.info( `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`, @@ -336,12 +371,15 @@ async function getClamVersion() { * @param {!import('./config.js').BucketDefs} config */ async function moveProcessedFile(filename, isClean, config) { - const srcfile = storage.bucket(config.unscanned).file(filename); - const destinationBucketName = isClean - ? `gs://${config.clean}` - : `gs://${config.quarantined}`; + const srcBucketName = config.unscanned; + const srcfile = storage.bucket(srcBucketName).file(filename); + const destinationBucketName = isClean ? 
config.clean : config.quarantined; const destinationBucket = storage.bucket(destinationBucketName); + await srcfile.move(destinationBucket); + logger.info( + `Successfully moved file gs://${srcBucketName}/${filename} to gs://${destinationBucketName}/${filename}`, + ); } /** @@ -387,6 +425,9 @@ async function run() { } const config = await readAndVerifyConfig(configFile); + // Ensure fileExclusionPatterns is an array (even if empty) + config.fileExclusionPatterns = config.fileExclusionPatterns || []; + Object.assign(BUCKET_CONFIG, config); await waitForClamD(); diff --git a/terraform/README.md b/terraform/README.md index fb6c2f5b..e02d8f86 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -5,13 +5,12 @@ malware-scanner service on cloud run. The deployment is split into 4 stages: -1. Set up the google cloud project environment and service configuration. -1. Use Terraform to set up the required service accounts and deploy required - infrastructure. -1. Launch cloud build to build the Docker image for the malware-scanner - service. -1. Use Terraform to deploy the malware-scanner service to cloud run, and - connect the service to the infrastructure created in stage 2. +1. Set up the google cloud project environment and service configuration. +1. Use Terraform to set up the required service accounts and deploy required + infrastructure. +1. Launch cloud build to build the Docker image for the malware-scanner service. +1. Use Terraform to deploy the malware-scanner service to cloud run, and connect + the service to the infrastructure created in stage 2. Follow the instructions below to use Terraform to deploy the malware scanner service in a demo project. @@ -53,7 +52,8 @@ TF_VAR_config_json=$(cat <