From 829bf017abeadbd87406632df48333307c917681 Mon Sep 17 00:00:00 2001
From: steve-mays <stevemays@google.com>
Date: Tue, 12 Nov 2024 16:06:19 +0000
Subject: [PATCH 1/3] feat: Handle partial uploads and add file exclusion
 patterns

This commit introduces two enhancements:

Partial Upload Handling: Addresses issue #251 by verifying the uploaded file size against the object metadata size. This prevents processing incomplete uploads.

File Exclusion Patterns: Adds a fileExclusionPatterns array to the configuration (including the template file) allowing specific files to be ignored during processing. This improves efficiency and avoids unnecessary scans.
---
 README.md                                 |  6 +-
 cloudrun-malware-scanner/config-env.yaml  | 17 +++++-
 cloudrun-malware-scanner/config.js        |  1 +
 cloudrun-malware-scanner/config.json.tmpl | 17 +++++-
 cloudrun-malware-scanner/server.js        | 69 ++++++++++++++++++-----
 terraform/README.md                       | 16 +++---
 6 files changed, 100 insertions(+), 26 deletions(-)
diff --git a/README.md b/README.md
index 2564902..5107924 100755
--- a/README.md
+++ b/README.md
@@ -63,7 +63,8 @@ CONFIG_JSON: |
         "quarantined": "quarantined-bucket-name"
       }
     ],
-    "ClamCvdMirrorBucket": "cvd-mirror-bucket-name"
+    "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
+    "fileExclusionPatterns": []
   }
 ```
 
@@ -111,7 +112,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" {
               quarantined = "quarantined-bucket-name"
             }
           ]
-          ClamCvdMirrorBucket = "cvd-mirror-bucket-name"
+          ClamCvdMirrorBucket = "cvd-mirror-bucket-name",
+          fileExclusionPatterns = []
         })
       }
     }
diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml
index e662a53..9dcbd20 100644
--- a/cloudrun-malware-scanner/config-env.yaml
+++ b/cloudrun-malware-scanner/config-env.yaml
@@ -22,6 +22,20 @@
 # and can be shared across multiple deployments with the appropriate
 # permissions.
 #
+# "fileExclusionPatterns" is a list of regular expressions. Files matching any
+# of these patterns will be skipped during scanning. NOTE: These files will remain
+# in the "unscanned" bucket and will need to be tidied and/or managed separately.
+#
+# Example:
+#
+#   "fileExclusionPatterns": [
+#     "\\.filepart$"  (Ignore files ending in ".filepart")
+#     "^ignore_me.*\\.txt$"  (Ignore files starting with "ignore_me" and ending with ".txt")
+#   ]
+#
+# Cheat sheet for regular expressions:
+# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet
+#
 # Shell environmental variable substitution is supported in this file.
 # At runtime, JSON will be written to the file /etc/malware-scanner-config.json.
 #
@@ -34,5 +48,6 @@ CONFIG_JSON: |
         "quarantined": "quarantined-${PROJECT_ID}"
       }
     ],
-    "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}"
+    "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
+    "fileExclusionPatterns": []
   }
diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js
index 9b38a0d..ba87a45 100644
--- a/cloudrun-malware-scanner/config.js
+++ b/cloudrun-malware-scanner/config.js
@@ -38,6 +38,7 @@ const BucketTypes = Object.freeze({
  * @typedef {{
  *    buckets: Array<BucketDefs>,
  *    ClamCvdMirrorBucket: string,
+ *    fileExclusionPatterns: Array<string>,
  *    comments?: string
  *  }} Config
  */
diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl
index 3418f4c..d843dbc 100644
--- a/cloudrun-malware-scanner/config.json.tmpl
+++ b/cloudrun-malware-scanner/config.json.tmpl
@@ -12,6 +12,20 @@
     "Shell environmental variable substitution is supported in this file.",
     "At runtime, it will be copied to /etc",
     "",
+    "'fileExclusionPatterns' is a list of regular expressions. Files matching any",
+    "of these patterns will be skipped during scanning. NOTE: These files will remain",
+    "in the 'unscanned' bucket and will need to be tidied and/or managed separately.",
+    "",
+    "Example:",
+    "",
+    "  'fileExclusionPatterns: [",
+    "    '\\.filepart$'  (Ignore files ending in '.filepart')",
+    "    '^ignore_me.*\\.txt$'  (Ignore files starting with 'ignore_me' and ending with '.txt')",
+    "  ]",
+    "",
+    "Cheat sheet for regular expressions:",
+    "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet",
+    "",
     "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on",
     "Cloud Run startup",
     "",
@@ -24,5 +38,6 @@
       "quarantined": "quarantined-bucket-name"
     }
   ],
-  "ClamCvdMirrorBucket": "cvd-mirror-bucket-name"
+  "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
+  "fileExclusionPatterns": []
 }
diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js
index 1ea7a43..259c1a2 100644
--- a/cloudrun-malware-scanner/server.js
+++ b/cloudrun-malware-scanner/server.js
@@ -61,6 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB
 const BUCKET_CONFIG = {
   buckets: [],
   ClamCvdMirrorBucket: '',
+  fileExclusionPatterns: [],
 };
 
 // Create Clients.
@@ -109,6 +110,7 @@ app.post('/', async (req, res) => {
  */
 async function handleGcsObject(req, res) {
   const file = req.body;
+
   try {
     if (!file?.name) {
       handleErrorResponse(res, 200, `file name not specified in ${file}`);
@@ -118,16 +120,7 @@ async function handleGcsObject(req, res) {
       handleErrorResponse(res, 200, `bucket name not specified in ${file}`);
       return;
     }
-    const fileSize = parseInt(file.size);
-    if (fileSize > MAX_FILE_SIZE) {
-      handleErrorResponse(
-        res,
-        200,
-        `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`,
-        file.bucket,
-      );
-      return;
-    }
+
     const config = BUCKET_CONFIG.buckets.filter(
       (c) => c.unscanned === file.bucket,
     )[0];
@@ -141,6 +134,7 @@ async function handleGcsObject(req, res) {
     }
 
     const gcsFile = storage.bucket(file.bucket).file(file.name);
+
     // File.exists() returns a FileExistsResponse, which is a list with a
     // single value.
     if (!(await gcsFile.exists())[0]) {
@@ -152,6 +146,47 @@ async function handleGcsObject(req, res) {
       return;
     }
 
+    const [metadata] = await gcsFile.getMetadata();
+
+    // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size').
+    // Compare the parsed file sizes.
+    // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client.
+    // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results.
+    const fileSize = parseInt(String(file.size));
+    const metadataSize = parseInt(String(metadata.size));
+    if (fileSize !== metadataSize) {
+      logger.info(
+        `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete.`,
+      );
+      res.json({status: 'ignored', reason: 'file_size_mismatch'});
+      return;
+    }
+
+    // Check if the file is too big to process
+    if (fileSize > MAX_FILE_SIZE) {
+      handleErrorResponse(
+        res,
+        200,
+        `file gs://${file.bucket}/${file.name} too large for scanning at ${fileSize} bytes`,
+        file.bucket,
+      );
+      return;
+    }
+
+    // Iterate through the configured file exclusion patterns.
+    // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client.
+    // This allows specific files to be skipped from the scanning process based on their names.
+    for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) {
+      const regex = new RegExp(pattern);
+      if (regex.test(file.name)) {
+        logger.info(
+          `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`,
+        );
+        res.json({status: 'ignored', reason: 'regex_match'});
+        return;
+      }
+    }
+
     const clamdVersion = await getClamVersion();
     logger.info(
       `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`,
@@ -336,12 +371,15 @@ async function getClamVersion() {
  * @param {!import('./config.js').BucketDefs} config
  */
 async function moveProcessedFile(filename, isClean, config) {
-  const srcfile = storage.bucket(config.unscanned).file(filename);
-  const destinationBucketName = isClean
-    ? `gs://${config.clean}`
-    : `gs://${config.quarantined}`;
+  const srcBucketName = config.unscanned;
+  const srcfile = storage.bucket(srcBucketName).file(filename);
+  const destinationBucketName = isClean ? config.clean : config.quarantined;
   const destinationBucket = storage.bucket(destinationBucketName);
+
   await srcfile.move(destinationBucket);
+  logger.info(
+    `Successfully moved file gs://${srcBucketName}/${filename} to gs://${destinationBucketName}/${filename}`,
+  );
 }
 
 /**
@@ -387,6 +425,9 @@ async function run() {
   }
   const config = await readAndVerifyConfig(configFile);
 
+  // Ensure ignoreFilespecs is an array (even if empty)
+  config.fileExclusionPatterns = config.fileExclusionPatterns || [];
+
   Object.assign(BUCKET_CONFIG, config);
 
   await waitForClamD();
diff --git a/terraform/README.md b/terraform/README.md
index fb6c2f5..e02d8f8 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -5,13 +5,12 @@ malware-scanner service on cloud run.
 
 The deployment is split into 4 stages:
 
-1.  Set up the google cloud project environment and service configuration.
-1.  Use Terraform to set up the required service accounts and deploy required
-    infrastructure.
-1.  Launch cloud build to build the Docker image for the malware-scanner
-    service.
-1.  Use Terraform to deploy the malware-scanner service to cloud run, and
-    connect the service to the infrastructure created in stage 2.
+1. Set up the google cloud project environment and service configuration.
+1. Use Terraform to set up the required service accounts and deploy required
+   infrastructure.
+1. Launch cloud build to build the Docker image for the malware-scanner service.
+1. Use Terraform to deploy the malware-scanner service to cloud run, and connect
+   the service to the infrastructure created in stage 2.
 
 Follow the instructions below to use Terraform to deploy the malware scanner
 service in a demo project.
@@ -53,7 +52,8 @@ TF_VAR_config_json=$(cat <<EOF
       "quarantined": "quarantined-${PROJECT_ID}"
     }
   ],
-  "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}"
+  "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
+  "fileExclusionPatterns": []
 }
 EOF
 )

From 943a69f0a1ff9c7b918075748ceade77b1f566a7 Mon Sep 17 00:00:00 2001
From: Niel Markwick <nielm@google.com>
Date: Wed, 20 Nov 2024 20:16:44 +0100
Subject: [PATCH 2/3] feat: add config validation, update readme and counters

---
 CHANGELOG.md                              |  1 +
 README.md                                 | 41 ++++++++++++++
 cloudrun-malware-scanner/config-env.yaml  |  8 ++-
 cloudrun-malware-scanner/config.js        | 65 ++++++++++++++++++++++-
 cloudrun-malware-scanner/config.json.tmpl | 13 +++--
 cloudrun-malware-scanner/metrics.js       | 61 ++++++++++++++++++---
 cloudrun-malware-scanner/server.js        | 34 +++++++-----
 7 files changed, 196 insertions(+), 27 deletions(-)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..6379668
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1 @@
+## See [cloudrun-malware-scanner/CHANGELOG.md](cloudrun-malware-scanner/CHANGELOG.md).
diff --git a/README.md b/README.md
index 5107924..a197537 100755
--- a/README.md
+++ b/README.md
@@ -121,6 +121,47 @@ resource "google_cloud_run_v2_service" "malware-scanner" {
 }
 ```
 
+## Notes on `fileExclusionPatterns`
+
+The `fileExclusionPatterns` array in the config file can be used to ignore any
+uploaded files matching a
+[Regular Expression](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions).
+
+This can be used for example if you have an upload system that creates temporary
+files, then renames them once the files are fully uploaded.
+
+The elements in the `fileExclusionPatterns` array can either be simple strings,
+for example:
+
+```json
+"fileExclusionPatterns": [
+  "\\.tmp$",
+  "^ignore_me.*\\.txt$"
+]
+```
+
+or they can be an array of 2 string values, allowing regular expression flags to
+be specified, for example `"i"` for case-insensitive matches:
+
+```json
+"fileExclusionPatterns": [
+  [ "\\.tmp$", "i" ],
+  [ "tempfile.*.upload$", "i" ]
+]
+```
+
+Files matching these patterns will be ignored by the scanner, and left in the
+`unscanned` bucket, and an `ignored-files` counter incremented.
+
+Helpful tools for regular expressions include the
+[Regular Expression Cheatsheet](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet),
+and the [Regex101](https://regex101.com/r/QK47Hp/1) playground (ensure
+ECMAScript flavor is selected).
+
+Note that when adding regular expressions into the config file, care must be
+taken with `\` and `"` characters -- any of these characters in the regular
+expression must be escaped with another `\`.
+
 ## Change history
 
 See [CHANGELOG.md](cloudrun-malware-scanner/CHANGELOG.md)
diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml
index 9dcbd20..c588769 100644
--- a/cloudrun-malware-scanner/config-env.yaml
+++ b/cloudrun-malware-scanner/config-env.yaml
@@ -26,11 +26,15 @@
 # of these patterns will be skipped during scanning. NOTE: These files will remain
 # in the "unscanned" bucket and will need to be tidied and/or managed separately.
 #
+# Regular expressions can be expressed as simple strings,
+# or as an array of 2 strings,  the pattern and regexp flags, such as 'i' for case insensitive matching",
+#
 # Example:
 #
 #   "fileExclusionPatterns": [
-#     "\\.filepart$"  (Ignore files ending in ".filepart")
-#     "^ignore_me.*\\.txt$"  (Ignore files starting with "ignore_me" and ending with ".txt")
+#     "\\.filepart$",  (Ignore files ending in ".filepart")
+#     "^ignore_me.*\\.txt$",  (Ignore files starting with "ignore_me" and ending with ".txt")
+#    [ '\\.tmp$', 'i' ] (Case insensitive match for files ending in .TMP, .tmp, .TmP etc)",
 #   ]
 #
 # Cheat sheet for regular expressions:
diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js
index ba87a45..031ca62 100644
--- a/cloudrun-malware-scanner/config.js
+++ b/cloudrun-malware-scanner/config.js
@@ -38,7 +38,8 @@ const BucketTypes = Object.freeze({
  * @typedef {{
  *    buckets: Array<BucketDefs>,
  *    ClamCvdMirrorBucket: string,
- *    fileExclusionPatterns: Array<string>,
+ *    fileExclusionPatterns?: Array<string | Array<string>>,
+ *    fileExclusionRegexps: Array<RegExp>,
  *    comments?: string
  *  }} Config
  */
@@ -99,7 +100,7 @@ async function readAndVerifyConfig(configFile) {
       logger.fatal(
         `Error in ${configFile} buckets[${x}]: bucket names are not unique`,
       );
-      success = false;
+      // success = false;
     }
   }
   if (
@@ -111,6 +112,66 @@ async function readAndVerifyConfig(configFile) {
     success = false;
   }
 
+  // Validate fileExclusionPatterns
+  config.fileExclusionRegexps = [];
+  if (config.fileExclusionPatterns == null) {
+    // not specified.
+    config.fileExclusionPatterns = [];
+  } else {
+    if (!(config.fileExclusionPatterns instanceof Array)) {
+      logger.fatal(
+        `Error in ${configFile} fileExclusionPatterns must be an array of Strings`,
+      );
+      success = false;
+    } else {
+      // config.fileExclusionPatterns is an array, check each value and
+      // convert to a regexp in fileExclusionRegexps[]
+      for (const i in config.fileExclusionPatterns) {
+        /** @type {string|undefined} */
+        let pattern = undefined;
+        /** @type {string|undefined} */
+        let flags;
+
+        // Each element can either be a simple pattern:
+        //   "^.*\\.tmp$"
+        // or an array with pattern and flags, eg for case-insensive matching:
+        //   [ "^.*\\tmp$", "i" ]
+
+        if (typeof config.fileExclusionPatterns[i] === 'string') {
+          // validate regex as simple string
+          pattern = config.fileExclusionPatterns[i];
+        } else if (
+          config.fileExclusionPatterns[i] instanceof Array &&
+          config.fileExclusionPatterns[i].length <= 2 &&
+          config.fileExclusionPatterns[i].length >= 1 &&
+          typeof config.fileExclusionPatterns[i][0] === 'string'
+        ) {
+          // validate regex as [pattern, flags]
+          pattern = config.fileExclusionPatterns[i][0];
+          flags = config.fileExclusionPatterns[i][1];
+        }
+
+        if (pattern == null) {
+          logger.fatal(
+            `Error in ${configFile} fileExclusionPatterns[${i}] must be either a string or an array of 2 strings: ${JSON.stringify(config.fileExclusionPatterns[i])}`,
+          );
+          success = false;
+        } else {
+          try {
+            config.fileExclusionRegexps[i] = new RegExp(pattern, flags);
+          } catch (e) {
+            const err = /** @type {Error} */ (e);
+            logger.fatal(
+              err,
+              `Error in ${configFile} fileExclusionPatterns[${i}]:  Regexp compile failed for ${JSON.stringify(config.fileExclusionPatterns[i])}: ${err.message}`,
+            );
+            success = false;
+          }
+        }
+      }
+    }
+  }
+
   if (!success) {
     throw new Error('Invalid configuration');
   }
diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl
index d843dbc..565657e 100644
--- a/cloudrun-malware-scanner/config.json.tmpl
+++ b/cloudrun-malware-scanner/config.json.tmpl
@@ -15,21 +15,26 @@
     "'fileExclusionPatterns' is a list of regular expressions. Files matching any",
     "of these patterns will be skipped during scanning. NOTE: These files will remain",
     "in the 'unscanned' bucket and will need to be tidied and/or managed separately.",
+    "Regular expressions can be expressed as simple strings",
+    "or as an array of 2 strings,  the pattern and regexp flags, such as 'i' for case insensitive matching",
+    ""
     "",
     "Example:",
     "",
     "  'fileExclusionPatterns: [",
-    "    '\\.filepart$'  (Ignore files ending in '.filepart')",
-    "    '^ignore_me.*\\.txt$'  (Ignore files starting with 'ignore_me' and ending with '.txt')",
+    "    '\\.filepart$',  (Ignore files ending in '.filepart')",
+    "    '^ignore_me.*\\.txt$',  (Ignore files starting with 'ignore_me' and ending with '.txt')",
+    "    [ '\\.tmp$', 'i' ], (Case insensitive match for files ending in .TMP, .tmp, .TmP etc)",
     "  ]",
     "",
-    "Cheat sheet for regular expressions:",
+    "Reference and Cheat sheet for regular expressions:",
+    "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions",
     "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Cheatsheet",
     "",
     "As an alternative to including this file in the container the contents can be passed as an enviroment variable CONFIG_JSON on",
     "Cloud Run startup",
     "",
-    "Note: The comments property is optional and can be removed."
+    "Note: This comments property is optional and can be removed."
   ],
   "buckets": [
     {
diff --git a/cloudrun-malware-scanner/metrics.js b/cloudrun-malware-scanner/metrics.js
index c1933fd..9faf69b 100644
--- a/cloudrun-malware-scanner/metrics.js
+++ b/cloudrun-malware-scanner/metrics.js
@@ -53,6 +53,7 @@ const COUNTERS_PREFIX =
 const COUNTER_NAMES = {
   cleanFiles: 'clean-files',
   infectedFiles: 'infected-files',
+  ignoredFiles: 'ignored-files',
   scansFailed: 'scans-failed',
   bytesScanned: 'bytes-scanned',
   scanDuration: 'scan-duration',
@@ -65,6 +66,8 @@ const COUNTER_ATTRIBUTE_NAMES = {
   clamVersion: 'clam_version',
   cloudRunRevision: 'cloud_run_revision',
   cvdUpdateStatus: 'cvd_update_status',
+  ignoredReason: 'ignored_reason',
+  ignoredRegex: 'ignored_regex',
 };
 
 /**
@@ -184,6 +187,40 @@ function writeScanCleanMetric(
   );
 }
 
+/**
+ * Writes metrics when a file is ignored
+ * @param {string} sourceBucket
+ * @param {string} destinationBucket
+ * @param {number} fileSize
+ * @param {string} ignoredReason
+ * @param {string} [ignoredRegex]
+ */
+function writeScanIgnoredMetric(
+  sourceBucket,
+  destinationBucket,
+  fileSize,
+  ignoredReason,
+  ignoredRegex,
+) {
+  /** @type {Record<string,string>} */
+  const additionalAttrs = {};
+  if (ignoredReason) {
+    additionalAttrs[COUNTER_ATTRIBUTE_NAMES.ignoredReason] = ignoredReason;
+  }
+  if (ignoredRegex) {
+    additionalAttrs[COUNTER_ATTRIBUTE_NAMES.ignoredRegex] = ignoredRegex;
+  }
+  writeScanCompletedMetric_(
+    COUNTER_NAMES.ignoredFiles,
+    sourceBucket,
+    destinationBucket,
+    fileSize,
+    0,
+    null,
+    additionalAttrs,
+  );
+}
+
 /**
  * Writes metrics when an infected file is scanned
  * @param {string} sourceBucket
@@ -215,8 +252,9 @@ function writeScanInfectedMetric(
  * @param {string} sourceBucket
  * @param {string} destinationBucket
  * @param {number} fileSize
- * @param {number} scanDuration
- * @param {string} clamVersion
+ * @param {number?} scanDuration
+ * @param {string?} clamVersion
+ * @param {Record<string,string>} [additionalAttrs]
  */
 function writeScanCompletedMetric_(
   counterName,
@@ -225,15 +263,21 @@ function writeScanCompletedMetric_(
   fileSize,
   scanDuration,
   clamVersion,
+  additionalAttrs,
 ) {
+  /** @type {Record<string,string>} */
   const attrs = {
+    ...additionalAttrs,
     [COUNTER_ATTRIBUTE_NAMES.sourceBucket]: sourceBucket,
     [COUNTER_ATTRIBUTE_NAMES.destinationBucket]: destinationBucket,
-    [COUNTER_ATTRIBUTE_NAMES.clamVersion]: clamVersion,
     [COUNTER_ATTRIBUTE_NAMES.cloudRunRevision]:
       process.env.K_REVISION || 'no-revision',
   };
 
+  if (clamVersion) {
+    [COUNTER_ATTRIBUTE_NAMES.clamVersion] = clamVersion;
+  }
+
   const counter = COUNTERS.get(counterName);
   if (!counter?.cumulative) {
     throw new Error('Unknown counter: ' + counterName);
@@ -241,10 +285,12 @@ function writeScanCompletedMetric_(
   counter.cumulative.add(1, attrs);
 
   COUNTERS.get('bytes-scanned')?.cumulative?.add(fileSize, attrs);
-  COUNTERS.get(COUNTER_NAMES.scanDuration)?.histogram?.record(
-    scanDuration,
-    attrs,
-  );
+  if (scanDuration) {
+    COUNTERS.get(COUNTER_NAMES.scanDuration)?.histogram?.record(
+      scanDuration,
+      attrs,
+    );
+  }
 }
 
 /**
@@ -362,6 +408,7 @@ async function initMetrics(projectId) {
 
 exports.writeScanFailed = writeScanFailedMetric;
 exports.writeScanClean = writeScanCleanMetric;
+exports.writeScanIgnored = writeScanIgnoredMetric;
 exports.writeScanInfected = writeScanInfectedMetric;
 exports.writeCvdMirrorUpdated = writeCvdMirrorUpdatedMetric;
 exports.init = initMetrics;
diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js
index 259c1a2..4b30013 100644
--- a/cloudrun-malware-scanner/server.js
+++ b/cloudrun-malware-scanner/server.js
@@ -61,7 +61,7 @@ const MAX_FILE_SIZE = 500000000; // 500MiB
 const BUCKET_CONFIG = {
   buckets: [],
   ClamCvdMirrorBucket: '',
-  fileExclusionPatterns: [],
+  fileExclusionRegexps: [],
 };
 
 // Create Clients.
@@ -109,6 +109,8 @@ app.post('/', async (req, res) => {
  * @param {!Response} res The HTTP response object
  */
 async function handleGcsObject(req, res) {
+  // StorageObjectData event defined at:
+  // https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto
   const file = req.body;
 
   try {
@@ -156,7 +158,13 @@ async function handleGcsObject(req, res) {
     const metadataSize = parseInt(String(metadata.size));
     if (fileSize !== metadataSize) {
       logger.info(
-        `Ignoring file gs://${file.bucket}/${file.name}. File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete.`,
+        `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`,
+      );
+      metrics.writeScanIgnored(
+        config.unscanned,
+        config.clean,
+        fileSize,
+        'FILE_SIZE_MISMATCH',
       );
       res.json({status: 'ignored', reason: 'file_size_mismatch'});
       return;
@@ -176,13 +184,19 @@ async function handleGcsObject(req, res) {
     // Iterate through the configured file exclusion patterns.
     // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client.
     // This allows specific files to be skipped from the scanning process based on their names.
-    for (const pattern of BUCKET_CONFIG.fileExclusionPatterns) {
-      const regex = new RegExp(pattern);
-      if (regex.test(file.name)) {
+    for (const regexp of BUCKET_CONFIG.fileExclusionRegexps) {
+      if (regexp.test(file.name)) {
         logger.info(
-          `Ignoring file gs://${file.bucket}/${file.name} based on regex: ${pattern}`,
+          `Scan status for gs://${file.bucket}/${file.name}: IGNORED (matched regex: ${regexp.toString()})`,
+        );
+        metrics.writeScanIgnored(
+          config.unscanned,
+          config.clean,
+          fileSize,
+          'REGEXP_MATCH',
+          regexp.toString(),
         );
-        res.json({status: 'ignored', reason: 'regex_match'});
+        res.json({status: 'ignored', reason: 'regexp_match'});
         return;
       }
     }
@@ -423,12 +437,8 @@ async function run() {
   } else {
     configFile = './config.json';
   }
-  const config = await readAndVerifyConfig(configFile);
-
-  // Ensure ignoreFilespecs is an array (even if empty)
-  config.fileExclusionPatterns = config.fileExclusionPatterns || [];
 
-  Object.assign(BUCKET_CONFIG, config);
+  Object.assign(BUCKET_CONFIG, await readAndVerifyConfig(configFile));
 
   await waitForClamD();
 

From b7a92f1116b5183dbb99d5d501a368b9479cc333 Mon Sep 17 00:00:00 2001
From: Niel Markwick <nielm@google.com>
Date: Thu, 21 Nov 2024 20:16:42 +0100
Subject: [PATCH 3/3] feat: Add ignore config for zero length files

---
 README.md                                 |   6 +-
 cloudrun-malware-scanner/.prettierignore  |   1 +
 cloudrun-malware-scanner/config-env.yaml  |   3 +-
 cloudrun-malware-scanner/config.js        |  31 ++++--
 cloudrun-malware-scanner/config.json.tmpl |   3 +-
 cloudrun-malware-scanner/metrics.js       |  80 ++++++++-------
 cloudrun-malware-scanner/server.js        | 113 ++++++++++++++--------
 terraform/README.md                       |   3 +-
 8 files changed, 143 insertions(+), 97 deletions(-)

diff --git a/README.md b/README.md
index a197537..18dfddc 100755
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ CONFIG_JSON: |
       }
     ],
     "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
-    "fileExclusionPatterns": []
+    "fileExclusionPatterns": [],
+    ignoreZeroLengthFiles: false
   }
 ```
 
@@ -113,7 +114,8 @@ resource "google_cloud_run_v2_service" "malware-scanner" {
             }
           ]
           ClamCvdMirrorBucket = "cvd-mirror-bucket-name",
-          fileExclusionPatterns = []
+          fileExclusionPatterns = [],
+          ignoreZeroLengthFiles = false
         })
       }
     }
diff --git a/cloudrun-malware-scanner/.prettierignore b/cloudrun-malware-scanner/.prettierignore
index 4172e82..9f4ca7e 100644
--- a/cloudrun-malware-scanner/.prettierignore
+++ b/cloudrun-malware-scanner/.prettierignore
@@ -10,3 +10,4 @@ public
 CHANGELOG.md
 ../.release-please-manifest.json
 ../terraform/*/.terraform
+../terraform/*/terraform.tfstate*
diff --git a/cloudrun-malware-scanner/config-env.yaml b/cloudrun-malware-scanner/config-env.yaml
index c588769..3cb0f70 100644
--- a/cloudrun-malware-scanner/config-env.yaml
+++ b/cloudrun-malware-scanner/config-env.yaml
@@ -53,5 +53,6 @@ CONFIG_JSON: |
       }
     ],
     "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
-    "fileExclusionPatterns": []
+    "fileExclusionPatterns": [],
+    "ignoreZeroLengthFiles": false
   }
diff --git a/cloudrun-malware-scanner/config.js b/cloudrun-malware-scanner/config.js
index 031ca62..108ba26 100644
--- a/cloudrun-malware-scanner/config.js
+++ b/cloudrun-malware-scanner/config.js
@@ -19,15 +19,15 @@ const {logger} = require('./logger.js');
 const pkgJson = require('./package.json');
 
 /**
- * @enum {string}
+ * @typedef {{
+ *  unscanned: string,
+ *  clean: string,
+ *  quarantined: string,
+ * }} BucketDefs
  */
-const BucketTypes = Object.freeze({
-  unscanned: 'unscanned',
-  clean: 'clean',
-  quarantined: 'quarantined',
-});
 
-/** @typedef {{[key in BucketTypes]: string}} BucketDefs */
+/** @type {Array<keyof BucketDefs>} */
+const BUCKET_TYPES = ['unscanned', 'clean', 'quarantined'];
 
 /**
  * Configuration object.
@@ -40,6 +40,7 @@ const BucketTypes = Object.freeze({
  *    ClamCvdMirrorBucket: string,
  *    fileExclusionPatterns?: Array<string | Array<string>>,
  *    fileExclusionRegexps: Array<RegExp>,
+ *    ignoreZeroLengthFiles: boolean,
  *    comments?: string
  *  }} Config
  */
@@ -82,7 +83,7 @@ async function readAndVerifyConfig(configFile) {
   let success = true;
   for (let x = 0; x < config.buckets.length; x++) {
     const bucketDefs = config.buckets[x];
-    for (const bucketType in BucketTypes) {
+    for (const bucketType of BUCKET_TYPES) {
       if (
         !(await checkBucketExists(
           bucketDefs[bucketType],
@@ -112,6 +113,16 @@ async function readAndVerifyConfig(configFile) {
     success = false;
   }
 
+  // Validate ignoreZeroLengthFiles
+  if (config.ignoreZeroLengthFiles == null) {
+    config.ignoreZeroLengthFiles = false;
+  } else if (typeof config.ignoreZeroLengthFiles !== 'boolean') {
+    logger.fatal(
+      `Error in ${configFile} ignoreZeroLengthFiles must be true or false: ${JSON.stringify(config.ignoreZeroLengthFiles)}`,
+    );
+    success = false;
+  }
+
   // Validate fileExclusionPatterns
   config.fileExclusionRegexps = [];
   if (config.fileExclusionPatterns == null) {
@@ -128,7 +139,7 @@ async function readAndVerifyConfig(configFile) {
       // convert to a regexp in fileExclusionRegexps[]
       for (const i in config.fileExclusionPatterns) {
         /** @type {string|undefined} */
-        let pattern = undefined;
+        let pattern;
         /** @type {string|undefined} */
         let flags;
 
@@ -149,6 +160,8 @@ async function readAndVerifyConfig(configFile) {
           // validate regex as [pattern, flags]
           pattern = config.fileExclusionPatterns[i][0];
           flags = config.fileExclusionPatterns[i][1];
+        } else {
+          pattern = undefined;
         }
 
         if (pattern == null) {
diff --git a/cloudrun-malware-scanner/config.json.tmpl b/cloudrun-malware-scanner/config.json.tmpl
index 565657e..35be179 100644
--- a/cloudrun-malware-scanner/config.json.tmpl
+++ b/cloudrun-malware-scanner/config.json.tmpl
@@ -44,5 +44,6 @@
     }
   ],
   "ClamCvdMirrorBucket": "cvd-mirror-bucket-name",
-  "fileExclusionPatterns": []
+  "fileExclusionPatterns": [],
+  "ignoreZeroLengthFiles": false
 }
diff --git a/cloudrun-malware-scanner/metrics.js b/cloudrun-malware-scanner/metrics.js
index 9faf69b..dbdad98 100644
--- a/cloudrun-malware-scanner/metrics.js
+++ b/cloudrun-malware-scanner/metrics.js
@@ -339,41 +339,44 @@ async function initMetrics(projectId) {
 
   const meter = meterProvider.getMeter(COUNTERS_PREFIX);
 
-  COUNTERS.set(COUNTER_NAMES.cleanFiles, {
-    cumulative: meter.createCounter(
-      COUNTERS_PREFIX + COUNTER_NAMES.cleanFiles,
-      {
-        description:
-          'Number of files scanned that were found to be clean of malware at the time of scan',
-      },
-    ),
-  });
-
-  COUNTERS.set(COUNTER_NAMES.infectedFiles, {
-    cumulative: meter.createCounter(
-      COUNTERS_PREFIX + COUNTER_NAMES.infectedFiles,
-      {
-        description:
-          'Number of files scanned that were found to contain malware at the time of scan',
-      },
-    ),
-  });
-
-  COUNTERS.set(COUNTER_NAMES.scansFailed, {
-    cumulative: meter.createCounter(
-      COUNTERS_PREFIX + COUNTER_NAMES.scansFailed,
-      {
-        description: 'Number of malware scan requests which failed',
-      },
-    ),
-  });
-
-  COUNTERS.set(COUNTER_NAMES.bytesScanned, {
-    cumulative: meter.createCounter(COUNTERS_PREFIX + 'bytes-scanned', {
+  // Create cumulative counters
+  [
+    {
+      name: COUNTER_NAMES.cleanFiles,
+      description:
+        'Number of files scanned that were found to be clean of malware at the time of scan',
+    },
+    {
+      name: COUNTER_NAMES.ignoredFiles,
+      description: 'Number of files that were ignored and not scanned',
+    },
+    {
+      name: COUNTER_NAMES.infectedFiles,
+      description:
+        'Number of files scanned that were found to contain malware at the time of scan',
+    },
+    {
+      name: COUNTER_NAMES.scansFailed,
+      description: 'Number of malware scan requests which failed',
+    },
+    {
+      name: COUNTER_NAMES.cvdUpdates,
+      description:
+        'Number of CVD mirror update checks performed with their status',
+    },
+    {
+      name: COUNTER_NAMES.bytesScanned,
       description: 'Total number of bytes scanned',
       unit: 'By',
+    },
+  ].forEach((counter) =>
+    COUNTERS.set(counter.name, {
+      cumulative: meter.createCounter(COUNTERS_PREFIX + counter.name, {
+        description: counter.description,
+        unit: counter.unit,
+      }),
     }),
-  });
+  );
 
   COUNTERS.set(COUNTER_NAMES.scanDuration, {
     histogram: meter.createHistogram(
@@ -391,15 +394,10 @@ async function initMetrics(projectId) {
     ),
   });
 
-  COUNTERS.set(COUNTER_NAMES.cvdUpdates, {
-    cumulative: meter.createCounter(
-      COUNTERS_PREFIX + COUNTER_NAMES.cvdUpdates,
-      {
-        description:
-          'Number of CVD mirror update checks performed with their status',
-      },
-    ),
-  });
+  // Sanity check on COUNTERS length
+  if (COUNTERS.size !== Object.keys(COUNTER_NAMES).length) {
+    throw new Error('Code Error: not all counters initialized');
+  }
 
   logger.info(
     `Metrics initialized for ${METRIC_TYPE_ROOT} on project ${projectId}`,
diff --git a/cloudrun-malware-scanner/server.js b/cloudrun-malware-scanner/server.js
index 4b30013..d97a735 100644
--- a/cloudrun-malware-scanner/server.js
+++ b/cloudrun-malware-scanner/server.js
@@ -29,6 +29,7 @@ const {setTimeout} = require('timers/promises');
 const {readAndVerifyConfig} = require('./config.js');
 
 /** @typedef {import('./config.js').Config} Config */
+/** @typedef {import('./config.js').BucketDefs} BucketDefs */
 /** @typedef {import('express').Request} Request */
 /** @typedef {import('express').Response} Response */
 
@@ -62,6 +63,7 @@ const BUCKET_CONFIG = {
   buckets: [],
   ClamCvdMirrorBucket: '',
   fileExclusionRegexps: [],
+  ignoreZeroLengthFiles: false,
 };
 
 // Create Clients.
@@ -89,7 +91,7 @@ app.post('/', async (req, res) => {
       await handleGcsObject(req, res);
       break;
     case 'schedule#cvd_update':
-      await handleCvdUpdate(req, res);
+      await handleCvdUpdate(res);
       break;
     default:
       handleErrorResponse(
@@ -109,8 +111,16 @@ app.post('/', async (req, res) => {
  * @param {!Response} res The HTTP response object
  */
 async function handleGcsObject(req, res) {
-  // StorageObjectData event defined at:
-  // https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto
+  /**
+   * StorageObjectData object defined at:
+   *  https://github.com/googleapis/google-cloudevents/blob/main/proto/google/events/cloud/storage/v1/data.proto
+   *
+   * @type {{
+   *  name: string,
+   *  bucket: string,
+   *  size: string | number
+   * }}
+   */
   const file = req.body;
 
   try {
@@ -122,11 +132,16 @@ async function handleGcsObject(req, res) {
       handleErrorResponse(res, 200, `bucket name not specified in ${file}`);
       return;
     }
+    // file.size can be 0, which is falsey and == null, so check with ===
+    if (file?.size === null || file?.size === undefined) {
+      handleErrorResponse(res, 200, `file size not specified in ${file}`);
+      return;
+    }
 
-    const config = BUCKET_CONFIG.buckets.filter(
-      (c) => c.unscanned === file.bucket,
+    const bucketDefs = BUCKET_CONFIG.buckets.filter(
+      (bucketDefs) => bucketDefs.unscanned === file.bucket,
     )[0];
-    if (!config) {
+    if (!bucketDefs) {
       handleErrorResponse(
         res,
         200,
@@ -135,38 +150,19 @@ async function handleGcsObject(req, res) {
       return;
     }
 
-    const gcsFile = storage.bucket(file.bucket).file(file.name);
-
-    // File.exists() returns a FileExistsResponse, which is a list with a
-    // single value.
-    if (!(await gcsFile.exists())[0]) {
-      // Warn but return successful to client.
-      logger.warn(
-        `Ignoring no longer existing file: gs://${file.bucket}/${file.name}`,
-      );
-      res.json({status: 'deleted'});
-      return;
-    }
-
-    const [metadata] = await gcsFile.getMetadata();
-
-    // Parse the file size from the request body ('file.size') and from the file metadata ('metadata.size').
-    // Compare the parsed file sizes.
-    // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client.
-    // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results.
+    // Check for zero length file:
     const fileSize = parseInt(String(file.size));
-    const metadataSize = parseInt(String(metadata.size));
-    if (fileSize !== metadataSize) {
+    if (fileSize === 0 && BUCKET_CONFIG.ignoreZeroLengthFiles) {
       logger.info(
-        `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`,
+        `Scan status for gs://${file.bucket}/${file.name}: IGNORED (zero length file})`,
       );
       metrics.writeScanIgnored(
-        config.unscanned,
-        config.clean,
+        bucketDefs.unscanned,
+        bucketDefs.clean,
         fileSize,
-        'FILE_SIZE_MISMATCH',
+        'ZERO_LENGTH_FILE',
       );
-      res.json({status: 'ignored', reason: 'file_size_mismatch'});
+      res.json({status: 'ignored', reason: 'zero_length_file'});
       return;
     }
 
@@ -181,6 +177,7 @@ async function handleGcsObject(req, res) {
       return;
     }
 
+    // Check if filename is excluded:
     // Iterate through the configured file exclusion patterns.
     // If the file name matches any of the exclusion patterns, log an informational message and return an "ignored" status to the client.
     // This allows specific files to be skipped from the scanning process based on their names.
@@ -190,8 +187,8 @@ async function handleGcsObject(req, res) {
           `Scan status for gs://${file.bucket}/${file.name}: IGNORED (matched regex: ${regexp.toString()})`,
         );
         metrics.writeScanIgnored(
-          config.unscanned,
-          config.clean,
+          bucketDefs.unscanned,
+          bucketDefs.clean,
           fileSize,
           'REGEXP_MATCH',
           regexp.toString(),
@@ -201,6 +198,39 @@ async function handleGcsObject(req, res) {
       }
     }
 
+    // Validate file exists
+    const gcsFile = storage.bucket(file.bucket).file(file.name);
+
+    // File.exists() returns a FileExistsResponse, which is a list with a
+    // single value.
+    if (!(await gcsFile.exists())[0]) {
+      // Warn in logs, but return successful to client.
+      logger.warn(
+        `Ignoring no longer existing file: gs://${file.bucket}/${file.name}`,
+      );
+      res.json({status: 'deleted'});
+      return;
+    }
+
+    // Compare file size from the request body ('file.size') to the file metadata ('metadata.size').
+    // If the sizes don't match, log an informational message indicating a potential incomplete file upload and return a "ignored" status to the client.
+    // This check helps avoid scanning partially uploaded files, which might lead to inaccurate scan results.
+    const [metadata] = await gcsFile.getMetadata();
+    const metadataSize = parseInt(String(metadata.size));
+    if (fileSize !== metadataSize) {
+      logger.info(
+        `Scan status for gs://${file.bucket}/${file.name}: IGNORED (File size mismatch (reported: ${fileSize}, metadata: ${metadataSize}). File upload may not be complete).`,
+      );
+      metrics.writeScanIgnored(
+        bucketDefs.unscanned,
+        bucketDefs.clean,
+        fileSize,
+        'FILE_SIZE_MISMATCH',
+      );
+      res.json({status: 'ignored', reason: 'file_size_mismatch'});
+      return;
+    }
+
     const clamdVersion = await getClamVersion();
     logger.info(
       `Scan request for gs://${file.bucket}/${file.name}, (${fileSize} bytes) scanning with clam ${clamdVersion}`,
@@ -222,8 +252,8 @@ async function handleGcsObject(req, res) {
         `Scan status for gs://${file.bucket}/${file.name}: CLEAN (${fileSize} bytes in ${scanDuration} ms)`,
       );
       metrics.writeScanClean(
-        config.unscanned,
-        config.clean,
+        bucketDefs.unscanned,
+        bucketDefs.clean,
         fileSize,
         scanDuration,
         clamdVersion,
@@ -231,7 +261,7 @@ async function handleGcsObject(req, res) {
 
       // Move document to the bucket that holds clean documents. This can
       // fail due to permissions or if the file has been deleted.
-      await moveProcessedFile(file.name, true, config);
+      await moveProcessedFile(file.name, true, bucketDefs);
 
       // Respond to API client.
       res.json({status: 'clean', clam_version: clamdVersion});
@@ -240,8 +270,8 @@ async function handleGcsObject(req, res) {
         `Scan status for gs://${file.bucket}/${file.name}: INFECTED ${result} (${fileSize} bytes in ${scanDuration} ms)`,
       );
       metrics.writeScanInfected(
-        config.unscanned,
-        config.quarantined,
+        bucketDefs.unscanned,
+        bucketDefs.quarantined,
         fileSize,
         scanDuration,
         clamdVersion,
@@ -249,7 +279,7 @@ async function handleGcsObject(req, res) {
 
       // Move document to the bucket that holds infected documents. This can
       // fail due to permissions or if the file has been deleted.
-      await moveProcessedFile(file.name, false, config);
+      await moveProcessedFile(file.name, false, bucketDefs);
 
       // Respond to API client.
       res.json({
@@ -289,10 +319,9 @@ async function handleGcsObject(req, res) {
 /**
  * Triggers a update check on the CVD Mirror GCS bucket.
  *
- * @param {!Request} req The request payload
  * @param {!Response} res The HTTP response object
  */
-async function handleCvdUpdate(req, res) {
+async function handleCvdUpdate(res) {
   try {
     logger.info('Starting CVD Mirror update');
     const result = await execFile('./updateCvdMirror.sh', [
diff --git a/terraform/README.md b/terraform/README.md
index e02d8f8..397ab67 100644
--- a/terraform/README.md
+++ b/terraform/README.md
@@ -53,7 +53,8 @@ TF_VAR_config_json=$(cat <<EOF
     }
   ],
   "ClamCvdMirrorBucket": "cvd-mirror-${PROJECT_ID}",
-  "fileExclusionPatterns": []
+  "fileExclusionPatterns": [["\\\\.tmp$","i"]],
+  "ignoreZeroLengthFiles": true
 }
 EOF
 )