diff --git a/README.md b/README.md index 2564902a..59516d1d 100755 --- a/README.md +++ b/README.md @@ -63,7 +63,8 @@ CONFIG_JSON: | "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } ``` @@ -111,7 +112,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" { quarantined = "quarantined-bucket-name" } ] - ClamCvdMirrorBucket = "cvd-mirror-bucket-name" + ClamCvdMirrorBucket = "cvd-mirror-bucket-name", + fileExclusionPatterns = [] }) } } diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js index 9b38a0d8..ba87a45c 100644 --- a/cloudrun-malware-scanner/config.js +++ b/cloudrun-malware-scanner/config.js @@ -38,6 +38,7 @@ const BucketTypes = Object.freeze({ * @typedef {{ * buckets: Array, * ClamCvdMirrorBucket: string, + * fileExclusionPatterns: Array, * comments?: string * }} Config */ diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl index 3418f4c1..a09597b6 100644 --- a/cloudrun-malware-scanner/config.json.tmpl +++ b/cloudrun-malware-scanner/config.json.tmpl @@ -9,6 +9,12 @@ "'ClamCvdMirrorBucket' is a GCS bucket used to mirror the clamav database definition files to prevent overloading the Clam servers", "and being rate limited/blacklisted. Its contents are maintained by the updateCvdMirror.sh script", "", + "'fileExclusionPatterns' is a list of regular expressions. 
Files matching any of these patterns will be skipped during scanning.", + "NOTE: These files will remain in the 'unscanned' bucket and will need to be tidied and/or managed separately.", + "Example: '\\.filepart$' - Ignore files ending in '.filepart'", + "Example: '^ignore_me.*\\.txt$' - Ignore files starting with 'ignore_me' and ending with '.txt'", + "Cheat sheet for regular expressions: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet", + "", "Shell environmental variable substitution is supported in this file.", "At runtime, it will be copied to /etc", "", @@ -24,5 +30,6 @@ "quarantined": "quarantined-bucket-name" } ], - "ClamCvdMirrorBucket": "cvd-mirror-bucket-name" + "ClamCvdMirrorBucket": "cvd-mirror-bucket-name", + "fileExclusionPatterns": [] } diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js index 1ea7a43f..178af19f 100644 --- a/cloudrun-malware-scanner/server.js +++ b/cloudrun-malware-scanner/server.js @@ -61,6 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB const BUCKET_CONFIG = { buckets: [], ClamCvdMirrorBucket: '', + fileExclusionPatterns: [], }; // Create Clients. 
@@ -109,6 +110,7 @@ app.post('/', async (req, res) => { */ async function handleGcsObject(req, res) { const file = req.body; + try { if (!file?.name) { handleErrorResponse(res, 200, `file name not specified in ${file}`); @@ -118,16 +120,7 @@ async function handleGcsObject(req, res) { handleErrorResponse(res, 200, `bucket name not specified in ${file}`); return; } - const fileSize = parseInt(file.size); - if (fileSize > MAX_FILE_SIZE) { - handleErrorResponse( - res, - 200, - `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, - file.bucket, - ); - return; - } + const config = BUCKET_CONFIG.buckets.filter( (c) => c.unscanned === file.bucket, )[0]; @@ -141,6 +134,7 @@ async function handleGcsObject(req, res) { } const gcsFile = storage.bucket(file.bucket).file(file.name); + // File.exists() returns a FileExistsResponse, which is a list with a // single value. if (!(await gcsFile.exists())[0]) { @@ -152,6 +146,45 @@ async function handleGcsObject(req, res) { return; } + const [metadata] = await gcsFile.getMetadata(); + + // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size'). + // Compare the parsed file sizes. + // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return an "ignored" status to the client. + // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results. + const fileSize = parseInt(file.size); + const metadataSize = parseInt(metadata.size); + if (fileSize !== metadataSize) { + logger.info( + `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). 
File upload may not be complete.` + ); + res.json({status: 'ignored', reason: 'file_size_mismatch'}); + return; + } + + // Check if the file is too big to process + if (fileSize > MAX_FILE_SIZE) { + handleErrorResponse( + res, + 200, + `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`, + file.bucket, + ); + return; + } + + // Iterate through the configured file exclusion patterns. + // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client. + // This allows specific files to be skipped from the scanning process based on their names. + for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) { + const regex = new RegExp(pattern); + if (regex.test(file.name)) { + logger.info(`Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`); + res.json({ status: 'ignored', reason: 'regex_match' }); + return; + } + } + const clamdVersion = await getClamVersion(); logger.info( `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`, @@ -336,12 +369,15 @@ async function getClamVersion() { * @param {!import('./config.js').BucketDefs} config */ async function moveProcessedFile(filename, isClean, config) { - const srcfile = storage.bucket(config.unscanned).file(filename); + const srcBucketName = config.unscanned; + const srcfile = storage.bucket(srcBucketName).file(filename); const destinationBucketName = isClean - ? `gs://${config.clean}` - : `gs://${config.quarantined}`; + ? 
config.clean + : config.quarantined; const destinationBucket = storage.bucket(destinationBucketName); + await srcfile.move(destinationBucket); + logger.info(`Successfully moved file gs://${srcBucketName}/${filename} to gs://${destinationBucketName}/${filename}`); } /** @@ -387,6 +423,9 @@ async function run() { } const config = await readAndVerifyConfig(configFile); + // Ensure fileExclusionPatterns is an array (even if empty) + config.fileExclusionPatterns = config.fileExclusionPatterns || []; + Object.assign(BUCKET_CONFIG, config); await waitForClamD(); diff --git a/terraform/README.md b/terraform/README.md index fb6c2f5b..5bd13b61 100644 --- a/terraform/README.md +++ b/terraform/README.md @@ -5,13 +5,13 @@ malware-scanner service on cloud run. The deployment is split into 4 stages: -1. Set up the google cloud project environment and service configuration. -1. Use Terraform to set up the required service accounts and deploy required - infrastructure. -1. Launch cloud build to build the Docker image for the malware-scanner - service. -1. Use Terraform to deploy the malware-scanner service to cloud run, and - connect the service to the infrastructure created in stage 2. +1. Set up the google cloud project environment and service configuration. +1. Use Terraform to set up the required service accounts and deploy required + infrastructure. +1. Launch cloud build to build the Docker image for the malware-scanner + service. +1. Use Terraform to deploy the malware-scanner service to cloud run, and + connect the service to the infrastructure created in stage 2. Follow the instructions below to use Terraform to deploy the malware scanner service in a demo project. @@ -53,7 +53,8 @@ TF_VAR_config_json=$(cat <