From 6890d22a74f46372ead01b30f5972a3cb7f3ced7 Mon Sep 17 00:00:00 2001
From: steve-mays <stevemays@google.com>
Date: Tue, 12 Nov 2024 16:06:19 +0000
Subject: [PATCH] feat: Handle partial uploads and add file exclusion patterns

This commit introduces two enhancements:

Partial Upload Handling: Addresses issue #251 by verifying the uploaded file size against the object metadata size. This prevents processing incomplete uploads.

File Exclusion Patterns: Adds a fileExclusionPatterns array to the configuration (including the template file) allowing specific files to be ignored during processing. This improves efficiency and avoids unnecessary scans.
---
 README.md                                 |  6 +-
 cloudrun-malware-scanner/config-env.yaml  | 17 +++++-
 cloudrun-malware-scanner/config.js        |  1 +
 cloudrun-malware-scanner/config.json.tmpl | 18 +++++-
 cloudrun-malware-scanner/server.js        | 69 ++++++++++++++++++-----
 terraform/README.md                       | 16 +++---
 6 files changed, 99 insertions(+), 28 deletions(-)
diff --git a/README.md b/README.md
index 2564902a..51079249 100755
--- a/README.md
+++ b/README.md
@@ -63,7 +63,8 @@ CONFIG_JSON: |
         "quarantined": "quarantined-bucket-name"
       }
     ],
-    "ClamCvdMirrorBucket": "cvd-mirror-bucket-name"
+    "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
+    "fileExclusionPatterns": []
   }
 ```
 
@@ -111,7 +112,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" {
               quarantined = "quarantined-bucket-name"
             }
           ]
-          ClamCvdMirrorBucket = "cvd-mirror-bucket-name"
+          ClamCvdMirrorBucket = "cvd-mirror-bucket-name",
+          fileExclusionPatterns = []
         })
       }
     }
diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml
index e662a530..9dcbd204 100644
--- a/cloudrun-malware-scanner/config-env.yaml
+++ b/cloudrun-malware-scanner/config-env.yaml
@@ -22,6 +22,20 @@
 # and can be shared across multiple deployments with the appropriate
 # permissions.
 #
+# "fileExclusionPatterns" is a list of regular expressions. Files matching any
+# of these patterns will be skipped during scanning. NOTE: These files will remain
+# in the "unscanned" bucket and will need to be tidied and/or managed separately.
+#
+# Example:
+#
+#   "fileExclusionPatterns": [
+#     "\\.filepart$"  (Ignore files ending in ".filepart")
+#     "^ignore_me.*\\.txt$"  (Ignore files starting with "ignore_me" and ending with ".txt")
+#   ]
+#
+# Cheat sheet for regular expressions:
+# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet
+#
 # Shell environmental variable substitution is supported in this file.
 # At runtime, JSON will be written to the file /etc/malware-scanner-config.json.
 #
@@ -34,5 +48,6 @@ CONFIG_JSON: |
         "quarantined": "quarantined-${PROJECT_ID}"
       }
     ],
-    "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}"
+    "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
+    "fileExclusionPatterns": []
   }
diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js
index 9b38a0d8..ba87a45c 100644
--- a/cloudrun-malware-scanner/config.js
+++ b/cloudrun-malware-scanner/config.js
@@ -38,6 +38,7 @@ const BucketTypes = Object.freeze({
  * @typedef {{
  *    buckets: Array<BucketDefs>,
  *    ClamCvdMirrorBucket: string,
+ *    fileExclusionPatterns: Array<string>,
  *    comments?: string
  *  }} Config
  */
diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl
index 3418f4c1..a28ba338 100644
--- a/cloudrun-malware-scanner/config.json.tmpl
+++ b/cloudrun-malware-scanner/config.json.tmpl
@@ -9,8 +9,19 @@
     "'ClamCvdMirrorBucket' is a GCS bucket used to mirror the clamav database definition files to prevent overloading the Clam servers",
     "and being rate limited/blacklisted. Its contents are maintained by the updateCvdMirror.sh script",
     "",
-    "Shell environmental variable substitution is supported in this file.",
-    "At runtime, it will be copied to /etc",
+    "'fileExclusionPatterns' is a list of regular expressions. Files matching any",
+    "of these patterns will be skipped during scanning. NOTE: These files will remain",
+    "in the 'unscanned' bucket and will need to be tidied and/or managed separately.",
+    "",
+    "Example:",
+    "",
+    "  'fileExclusionPatterns': [",
+    "    '\\.filepart$'  (Ignore files ending in '.filepart')",
+    "    '^ignore_me.*\\.txt$'  (Ignore files starting with 'ignore_me' and ending with '.txt')",
+    "  ]",
+    "",
+    "Cheat sheet for regular expressions:",
+    "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet",
     "",
     "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on",
     "Cloud Run startup",
@@ -24,5 +35,6 @@
       "quarantined": "quarantined-bucket-name"
     }
   ],
-  "ClamCvdMirrorBucket": "cvd-mirror-bucket-name"
+  "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
+  "fileExclusionPatterns": []
 }
diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js
index 1ea7a43f..259c1a2b 100644
--- a/cloudrun-malware-scanner/server.js
+++ b/cloudrun-malware-scanner/server.js
@@ -61,6 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB
 const BUCKET_CONFIG = {
   buckets: [],
   ClamCvdMirrorBucket: '',
+  fileExclusionPatterns: [],
 };
 
 // Create Clients.
@@ -109,6 +110,7 @@ app.post('/', async (req, res) => {
  */
 async function handleGcsObject(req, res) {
   const file = req.body;
+
   try {
     if (!file?.name) {
       handleErrorResponse(res, 200, `file name not specified in ${file}`);
@@ -118,16 +120,7 @@ async function handleGcsObject(req, res) {
       handleErrorResponse(res, 200, `bucket name not specified in ${file}`);
       return;
     }
-    const fileSize = parseInt(file.size);
-    if (fileSize > MAX_FILE_SIZE) {
-      handleErrorResponse(
-        res,
-        200,
-        `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`,
-        file.bucket,
-      );
-      return;
-    }
+
     const config = BUCKET_CONFIG.buckets.filter(
       (c) => c.unscanned === file.bucket,
     )[0];
@@ -141,6 +134,7 @@ async function handleGcsObject(req, res) {
     }
 
     const gcsFile = storage.bucket(file.bucket).file(file.name);
+
     // File.exists() returns a FileExistsResponse, which is a list with a
     // single value.
     if (!(await gcsFile.exists())[0]) {
@@ -152,6 +146,47 @@ async function handleGcsObject(req, res) {
       return;
     }
 
+    const [metadata] = await gcsFile.getMetadata();
+
+    // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size').
+    // Compare the parsed file sizes.
+    // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return an "ignored" status to the client.
+    // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results.
+    const fileSize = parseInt(String(file.size));
+    const metadataSize = parseInt(String(metadata.size));
+    if (fileSize !== metadataSize) {
+      logger.info(
+        `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete.`,
+      );
+      res.json({status: 'ignored', reason: 'file_size_mismatch'});
+      return;
+    }
+
+    // Check if the file is too big to process
+    if (fileSize > MAX_FILE_SIZE) {
+      handleErrorResponse(
+        res,
+        200,
+        `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`,
+        file.bucket,
+      );
+      return;
+    }
+
+    // Iterate through the configured file exclusion patterns.
+    // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client.
+    // This allows specific files to be skipped from the scanning process based on their names.
+    for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) {
+      const regex = new RegExp(pattern);
+      if (regex.test(file.name)) {
+        logger.info(
+          `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`,
+        );
+        res.json({status: 'ignored', reason: 'regex_match'});
+        return;
+      }
+    }
+
     const clamdVersion = await getClamVersion();
     logger.info(
       `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`,
@@ -336,12 +371,15 @@ async function getClamVersion() {
  * @param {!import('./config.js').BucketDefs} config
  */
 async function moveProcessedFile(filename, isClean, config) {
-  const srcfile = storage.bucket(config.unscanned).file(filename);
-  const destinationBucketName = isClean
-    ? `gs://${config.clean}`
-    : `gs://${config.quarantined}`;
+  const srcBucketName = config.unscanned;
+  const srcfile = storage.bucket(srcBucketName).file(filename);
+  const destinationBucketName = isClean ? config.clean : config.quarantined;
   const destinationBucket = storage.bucket(destinationBucketName);
+
   await srcfile.move(destinationBucket);
+  logger.info(
+    `Successfully moved file gs://${srcBucketName}/${filename} to gs://${destinationBucketName}/${filename}`,
+  );
 }
 
 /**
@@ -387,6 +425,9 @@ async function run() {
   }
   const config = await readAndVerifyConfig(configFile);
 
+  // Ensure fileExclusionPatterns is an array (even if empty)
+  config.fileExclusionPatterns = config.fileExclusionPatterns || [];
+
   Object.assign(BUCKET_CONFIG, config);
 
   await waitForClamD();
diff --git a/terraform/README.md b/terraform/README.md
index fb6c2f5b..e02d8f86 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -5,13 +5,12 @@ malware-scanner service on cloud run.
 
 The deployment is split into 4 stages:
 
-1.  Set up the google cloud project environment and service configuration.
-1.  Use Terraform to set up the required service accounts and deploy required
-    infrastructure.
-1.  Launch cloud build to build the Docker image for the malware-scanner
-    service.
-1.  Use Terraform to deploy the malware-scanner service to cloud run, and
-    connect the service to the infrastructure created in stage 2.
+1. Set up the google cloud project environment and service configuration.
+1. Use Terraform to set up the required service accounts and deploy required
+   infrastructure.
+1. Launch cloud build to build the Docker image for the malware-scanner service.
+1. Use Terraform to deploy the malware-scanner service to cloud run, and connect
+   the service to the infrastructure created in stage 2.
 
 Follow the instructions below to use Terraform to deploy the malware scanner
 service in a demo project.
@@ -53,7 +52,8 @@ TF_VAR_config_json=$(cat <<EOF
       "quarantined": "quarantined-${PROJECT_ID}"
     }
   ],
-  "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}"
+  "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
+  "fileExclusionPatterns": []
 }
 EOF
 )