From c3a2386f687fd7a556cfd7782d9ea9df1309c70a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gabriel=20Villalonga=20Sim=C3=B3n?= Date: Mon, 23 Sep 2024 00:18:03 -0500 Subject: [PATCH] registry: add more docs around garbage collection and improve the 'locking' --- docs/garbage-collection.md | 119 ++++++++++++++++++++++++++++++ src/registry/garbage-collector.ts | 96 ++++++++++++++++++++---- src/registry/r2.ts | 5 +- 3 files changed, 202 insertions(+), 18 deletions(-) create mode 100644 docs/garbage-collection.md diff --git a/docs/garbage-collection.md b/docs/garbage-collection.md new file mode 100644 index 0000000..59bccd2 --- /dev/null +++ b/docs/garbage-collection.md @@ -0,0 +1,119 @@ +# Removing manifests and garbage collection + +Garbage collection is useful due to how [OCI](https://github.com/opencontainers/image-spec/blob/main/manifest.md) container images get shipped to registries. + +```json +{ + "schemaVersion": 2, + "mediaType": "application/vnd.oci.image.manifest.v1+json", + "config": { + "mediaType": "application/vnd.oci.image.config.v1+json", + "digest": "sha256:b5b2b2c507a0944348e0303114d8d93aaaa081732b86451d9bce1f432a537bc7", + "size": 7023 + }, + "layers": [ + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:9834876dcfb05cb167a5c24953eba58c4ac89b1adf57f28f2f9d09af107ee8f0", + "size": 32654 + }, + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:3c3a4604a545cdc127456d94e421cd355bca5b528f4a9c1905b15da2eb4a4c6b", + "size": 16724 + }, + { + "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip", + "digest": "sha256:ec4b8955958665577945c89419d1af06b5f7636b4ac3da7f12184802ad867736", + "size": 73109 + } + ] +} +``` + +This is how a container image manifest looks, it's a "tree-like" structure where the manifest references +layers. If you remove an image from a registry, you're probably just removing its manifest. However, layers +will still be around taking space. + +## Removing an image and triggering the garbage collection + +To delete an image of your registry, you can use `skopeo delete` or an API call: + +``` +# If you pushed to serverless.workers.dev/my-image:latest +curl -X DELETE -X "Authorization: $CREDENTIAL" https://serverless.workers.dev/my-image/manifests/latest +# You will also need to remove the digest reference +curl -X DELETE -X "Authorization: $CREDENTIAL" https://serverless.workers.dev/my-image/manifests/ +``` + +The layer still exists in the registry, but we can remove it by triggering the garbage collector. + +``` +curl -X POST -H "Authorization: $CREDENTIAL" https://serverless.workers.dev/my-image/gc +{"success":true} +``` + +## How does it work + +How do we remove them? We take the approach of listing all manifests in a namespace and storing its digests +in a Set, then we list all the layers and those that are not in the Set get removed. That has a big drawback +that means we might be removing layers that don't have a manifest but are about to have one at the end of their push. + +In serverless-registry, if we remove a layer garbage collecting the manifest endpoint will throw a BLOB_UNKNOWN +error, but the garbage collector can still race with that endpont, so we go back to square one. + +Some registries take a lock stop the world approach, however serverless-registry can't really do that due +to its objective of only using R2. However, we need to fail whenever a race condition happens, a data +race that causes data-loss would be completely unacceptable. + +That's when we introduce a simple system where instead of taking a lock, we mark in R2 +that we are about to create a manifest and that we are inserting data. +If the garbage collector starts and sees that key, it will fail. At the end of the insertion, the insertion mark +gets updated. + +The same goes for the garbage collector, when it starts it creates a mark, and when it finishes it updates the +mark. + +Let's state some scenarios: + +``` +PutManifest GC +1. markForInsertion() 2. markForGarbageCollection() +... +3. checkLayersExist() ... +4. checkGCDidntStart() // fails due to ongoing gc +5. insertManifest() +``` + +``` +PutManifest GC +1. markForInsertion() 4. markForGarbageCollection() +... +2. checkLayersExist() 6. mark = getInsertionMark(); +3. checkGCDidntStart() 7. ... finds a layer to remove +5. insertManifest() 8. checkOnGoingUpdates() // fails due to ongoing updates +9. unmarkForInsertion() +``` + +``` +PutManifest GC +1. markForInsertion() 4. markForGarbageCollection() +... +2. checkLayersExist() 6. mark = getInsertionMark(); +3. checkGCDidntStart() 7. ... finds a layer to remove +5. insertManifest() 9. checkOnGoingUpdates() +8. unmarkForInsertion() 10. checkMark(mark) // this fails, not latest, can't delete layer +``` + +``` +PutManifest GC +4. markForInsertion() 1. markForGarbageCollection() +5. gcMark = getGCMark() +6. checkLayersExist() 2. mark = getInsertionMark(); + 3. checkOngoingUpdates() and checkMark(mark) + 7. deleteLayer() and unmarkGarbageCollector(); +8. checkGCDidntStart(gcMark) // fails because latest gc marker is different +``` + +It's a pattern where you build the state you need a lock in, get the mark of when you built that world, +and confirm before making changes from that view that there is nothing that might've changed the view. diff --git a/src/registry/garbage-collector.ts b/src/registry/garbage-collector.ts index 9ec41d9..2ab3119 100644 --- a/src/registry/garbage-collector.ts +++ b/src/registry/garbage-collector.ts @@ -2,6 +2,7 @@ // Unreferenced will delete all blobs that are not referenced by any manifest. // Untagged will delete all blobs that are not referenced by any manifest and are not tagged. +import { ServerError } from "../errors"; import { ManifestSchema } from "../manifest"; export type GarbageCollectionMode = "unreferenced" | "untagged"; @@ -16,17 +17,19 @@ export type GCOptions = { // // Summary: // insertParent() { +// gcMark = getGCMark(); // get last gc mark // mark = updateInsertMark(); // mark insertion // defer cleanInsertMark(mark); // checkEveryChildIsOK(); -// getDeletionMarkIsFalse(); // make sure not ongoing deletion mark after checking child is in db +// gcMarkIsEqualAndNotOngoingGc(gcMark); // make sure not ongoing deletion mark after checking child is in db // insertParent(); // insert parent in db // } // // gc() { -// setDeletionMark() // marks deletion as gc -// defer { cleanDeletionMark(); } // clean up mark -// checkNotOngoingInsertMark() // makes sure not ongoing updateInsertMark +// insertionMark = getInsertionMark() // get last insertion mark +// mark = setGCMark() // marks deletion as gc +// defer { cleanGCMark(mark); } // clean up mark +// checkNotOngoingInsertMark(mark) // makes sure not ongoing updateInsertMark, and no new one // deleteChildrenWithoutParent(); // go ahead and clean children // } // @@ -41,40 +44,90 @@ export class GarbageCollector { async markForGarbageCollection(namespace: string): Promise { const etag = crypto.randomUUID(); - const deletion = await this.registry.put(`${namespace}/deletion`, etag); + const deletion = await this.registry.put(`${namespace}/gc/marker`, etag); if (deletion === null) throw new Error("unreachable"); + // set last_update so inserters are able to invalidate + await this.registry.put(`${namespace}/gc/last_update`, null, { + customMetadata: { timestamp: `${Date.now()}-${crypto.randomUUID()}` }, + }); return etag; } async cleanupGarbageCollectionMark(namespace: string) { - await this.registry.delete(`${namespace}/deletion`); + // set last_update so inserters can confirm that a GC didnt happen while they were confirming data + await this.registry.put(`${namespace}/gc/last_update`, null, { + customMetadata: { timestamp: `${Date.now()}-${crypto.randomUUID()}` }, + }); + await this.registry.delete(`${namespace}/gc/marker`); } - async checkCanInsertData(namespace: string): Promise { - const deletion = await this.registry.head(`${namespace}/deletion`); - if (deletion === null) { - return true; + async getGCMarker(namespace: string): Promise { + const object = await this.registry.head(`${namespace}/gc/last_update`); + if (object === null) { + return ""; + } + + if (object.customMetadata === undefined) { + return ""; + } + + return object.customMetadata["timestamp"] ?? "mark"; + } + + async checkCanInsertData(namespace: string, mark: string): Promise { + const gcMarker = await this.registry.head(`${namespace}/gc/marker`); + if (gcMarker !== null) { + return false; } - return false; + const newMarker = await this.getGCMarker(namespace); + // There's been a new garbage collection since we started the check for insertion + if (newMarker !== mark) return false; + + return true; } // If successful, it inserted in R2 that its going // to start inserting data that might conflight with GC. async markForInsertion(namespace: string): Promise { const uid = crypto.randomUUID(); + // mark that there is an on-going insertion const deletion = await this.registry.put(`${namespace}/insertion/${uid}`, uid); if (deletion === null) throw new Error("unreachable"); + // set last_update so GC is able to invalidate + await this.registry.put(`${namespace}/insertion/last_update`, null, { + customMetadata: { timestamp: `${Date.now()}-${crypto.randomUUID()}` }, + }); + return uid; } async cleanInsertion(namespace: string, tag: string) { + // update again to invalidate GC and the insertion is safe + await this.registry.put(`${namespace}/insertion/last_update`, null, { + customMetadata: { timestamp: `${Date.now()}-${crypto.randomUUID()}` }, + }); + await this.registry.delete(`${namespace}/insertion/${tag}`); } - async checkIfGCCanContinue(namespace: string): Promise { + async getInsertionMark(namespace: string): Promise { + const object = await this.registry.head(`${namespace}/insertion/last_update`); + if (object === null) { + return ""; + } + + if (object.customMetadata === undefined) { + return ""; + } + + return object.customMetadata["timestamp"] ?? "mark"; + } + + async checkIfGCCanContinue(namespace: string, mark: string): Promise { const objects = await this.registry.list({ prefix: `${namespace}/insertion` }); for (const object of objects.objects) { + if (object.key.endsWith("/last_update")) continue; if (object.uploaded.getTime() + 1000 * 60 <= Date.now()) { await this.registry.delete(object.key); } else { @@ -84,6 +137,12 @@ export class GarbageCollector { // call again to clean more if (objects.truncated) return false; + + const newMark = await this.getInsertionMark(namespace); + if (newMark !== mark) { + return false; + } + return true; } @@ -124,6 +183,7 @@ export class GarbageCollector { private async collectInner(options: GCOptions): Promise { // We can run out of memory, this should be a bloom filter let referencedBlobs = new Set(); + const mark = await this.getInsertionMark(options.name); await this.list(`${options.name}/manifests/`, async (manifestObject) => { const tag = manifestObject.key.split("/").pop(); @@ -146,14 +206,14 @@ export class GarbageCollector { let unreferencedKeys: string[] = []; const deleteThreshold = 15; await this.list(`${options.name}/blobs/`, async (object) => { - if (!(await this.checkIfGCCanContinue(options.name))) { - throw new Error("there is a manifest insertion going, the garbage collection shall stop"); - } - const hash = object.key.split("/").pop(); if (hash && !referencedBlobs.has(hash)) { unreferencedKeys.push(object.key); if (unreferencedKeys.length > deleteThreshold) { + if (!(await this.checkIfGCCanContinue(options.name, mark))) { + throw new ServerError("there is a manifest insertion going, the garbage collection shall stop"); + } + await this.registry.delete(unreferencedKeys); unreferencedKeys = []; } @@ -161,6 +221,10 @@ export class GarbageCollector { return true; }); if (unreferencedKeys.length > 0) { + if (!(await this.checkIfGCCanContinue(options.name, mark))) { + throw new Error("there is a manifest insertion going, the garbage collection shall stop"); + } + await this.registry.delete(unreferencedKeys); } diff --git a/src/registry/r2.ts b/src/registry/r2.ts index 3b723fb..5a6bc8f 100644 --- a/src/registry/r2.ts +++ b/src/registry/r2.ts @@ -293,6 +293,7 @@ export class R2Registry implements Registry { readableStream: ReadableStream, contentType: string, ): Promise { + const gcMarker = await this.gc.getGCMarker(name); const env = this.env; const sha256 = new crypto.DigestStream("SHA-256"); const reader = readableStream.getReader(); @@ -308,8 +309,8 @@ export class R2Registry implements Registry { const verifyManifestErr = await this.verifyManifest(name, manifest); if (verifyManifestErr !== null) return { response: verifyManifestErr }; - if (!(await this.gc.checkCanInsertData(name))) { - console.error("Manifest can't be uploaded as there is a garbage collection going"); + if (!(await this.gc.checkCanInsertData(name, gcMarker))) { + console.error("Manifest can't be uploaded as there is/was a garbage collection going"); return { response: new ServerError("garbage collection is on-going... check with registry administrator", 500) }; }