Commit

Introduce the S3 sink for consumer groups offsets (#996)
* Introduces the S3 sink for consumer group offsets.

The sink stores the latest offset for each group-topic-partition under the key: $bucket/[$prefix]/$group/$topic/$partition

Since the latest offset is eventually written anyway, the connector does not need to track previous offsets for a given key, which reduces complexity.

The __consumer_offsets topic contains both OffsetKey and GroupMetadata messages; the latter are ignored since they only carry group membership information. A small sketch of the key layout and resulting S3 path follows these notes.

* Handle offset deletes

* Fix the unit tests

Ensure a prefix ending with / is handled correctly
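
A small sketch (hypothetical helper, not the connector's actual classes) of how a record with this key layout could be decoded and mapped to the S3 object key described above, assuming the version/group/topic/partition encoding exercised by the test below:

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets

// Hypothetical helper for illustration only; the connector's own classes may differ.
object OffsetKeySketch {

  final case class GroupTopicPartition(group: String, topic: String, partition: Int)

  private def readString(buffer: ByteBuffer): String = {
    val length = buffer.getShort.toInt
    val bytes  = new Array[Byte](length)
    buffer.get(bytes)
    new String(bytes, StandardCharsets.UTF_8)
  }

  // Decodes a key of the shape written by generateOffsetKey in the test below:
  // version (Int16), group (Int16 length-prefixed string), topic (Int16 length-prefixed string), partition (Int32).
  // Returns None for other key versions (e.g. GroupMetadata entries), which the sink ignores.
  def decode(key: Array[Byte]): Option[GroupTopicPartition] = {
    val buffer  = ByteBuffer.wrap(key)
    val version = buffer.getShort
    if (version < 0 || version > 1) None
    else Some(GroupTopicPartition(readString(buffer), readString(buffer), buffer.getInt))
  }

  // Builds the object key: [$prefix/]$group/$topic/$partition (the bucket itself is configured separately).
  def objectKey(prefix: Option[String], gtp: GroupTopicPartition): String =
    (prefix.toList :+ gtp.group :+ gtp.topic :+ gtp.partition.toString).mkString("/")
}

For example, decoding the key built for ("group1", "topic1", 1) and applying objectKey(Some("a/b"), _) yields a/b/group1/topic1/1, which matches the paths asserted in the prefixed test below.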

---------

Co-authored-by: stheppi <[email protected]>
stheppi and stheppi authored Nov 6, 2023
1 parent 6702ba2 commit d5e0de2
Showing 23 changed files with 1,912 additions and 48 deletions.
@@ -0,0 +1,262 @@
/*
* Copyright 2020 Lenses.io
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.lenses.streamreactor.connect.aws.s3.sink

import cats.implicits._
import com.typesafe.scalalogging.LazyLogging
import io.lenses.streamreactor.connect.aws.s3.config.AuthMode
import io.lenses.streamreactor.connect.aws.s3.config.S3ConfigSettings._
import io.lenses.streamreactor.connect.aws.s3.utils.S3ProxyContainerTest
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.connect.sink.SinkRecord
import org.apache.kafka.connect.sink.SinkTaskContext
import org.mockito.MockitoSugar
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import java.nio.ByteBuffer
import java.util
import scala.jdk.CollectionConverters.MapHasAsJava
import scala.jdk.CollectionConverters.SeqHasAsJava

class S3ConsumerGroupsSinkTaskTest
extends AnyFlatSpec
with Matchers
with S3ProxyContainerTest
with MockitoSugar
with LazyLogging {

import helper._

private val TopicName = "__consumer_offsets"

private def DefaultProps: Map[String, String] = Map(
AWS_ACCESS_KEY -> Identity,
AWS_SECRET_KEY -> Credential,
AUTH_MODE -> AuthMode.Credentials.toString,
CUSTOM_ENDPOINT -> uri(),
ENABLE_VIRTUAL_HOST_BUCKETS -> "true",
"name" -> "s3SinkTaskBuildLocalTest",
AWS_REGION -> "eu-west-1",
TASK_INDEX -> "1:1",
)

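// Builds an offset-commit key in the __consumer_offsets binary layout:
// version (Int16), group (Int16 length-prefixed string), topic (Int16 length-prefixed string), partition (Int32).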
private def generateOffsetKey(group: String, topic: String, partition: Int): Array[Byte] = {
val buffer = ByteBuffer.allocate(256)
buffer.putShort(0.toShort)

buffer.putShort(group.getBytes.length.toShort)
buffer.put(group.getBytes)

buffer.putShort(topic.getBytes.length.toShort)
buffer.put(topic.getBytes)
buffer.putInt(partition)
buffer.array

}

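// Builds an offset-commit value: version (Int16), offset (Int64),
// metadata (Int16 length-prefixed string) and commit timestamp (Int64).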
private def generateOffsetDetails(offset: Long) = {
val valueBuffer = ByteBuffer.allocate(256)
valueBuffer.putShort(0.toShort)
valueBuffer.putLong(offset)
val metadata = "metadata"
valueBuffer.putShort(metadata.getBytes.length.toShort)
valueBuffer.put(metadata.getBytes)
val commitTimestamp = 456L
valueBuffer.putLong(commitTimestamp)
valueBuffer.array
}

"S3ConsumerGroupsSinkTask" should "write the offsets" in {

val task = new S3ConsumerGroupsSinkTask()

val props = DefaultProps
.combine(
Map(
S3_BUCKET_CONFIG -> BucketName,
),
).asJava

task.start(props)
task.open(Seq(new TopicPartition(TopicName, 1)).asJava)

val records: List[SinkRecord] = List(
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(123L),
100,
),
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(456L),
100,
),
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(789L),
100,
),
)
task.put(records.asJava)

{
listBucketPath(BucketName, "group1/topic1").size should be(1)

val byteArray = remoteFileAsBytes(BucketName, "group1/topic1/1")
ByteBuffer.wrap(byteArray).getLong should be(789L)
}

val records2 = List(
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(999L),
100,
),
)

task.put(records2.asJava)

{
listBucketPath(BucketName, "group1/topic1").size should be(1)

val byteArray = remoteFileAsBytes(BucketName, "group1/topic1/1")
ByteBuffer.wrap(byteArray).getLong should be(999L)
}
task.close(Seq(new TopicPartition(TopicName, 1)).asJava)
}

"S3SinkTask" should "write the offsets with prefix" in {

val task = new S3ConsumerGroupsSinkTask()

val props = DefaultProps
.combine(
Map(
S3_BUCKET_CONFIG -> (BucketName + ":a/b"),
),
).asJava

task.start(props)
task.open(Seq(new TopicPartition(TopicName, 1)).asJava)

val records: List[SinkRecord] = List(
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(123L),
100,
),
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(456L),
100,
),
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(789L),
100,
),
)
task.put(records.asJava)

listBucketPath(BucketName, "a/b/group1/topic1").size should be(1)

val byteArray = remoteFileAsBytes(BucketName, "a/b/group1/topic1/1")
ByteBuffer.wrap(byteArray).getLong should be(789L)
task.close(Seq(new TopicPartition(TopicName, 1)).asJava)
}

"S3SinkTask" should "ignore records which are not offset keys" in {

val task = new S3ConsumerGroupsSinkTask()

val props = DefaultProps
.combine(
Map(
S3_BUCKET_CONFIG -> BucketName,
),
).asJava

task.start(props)
task.open(Seq(new TopicPartition(TopicName, 1)).asJava)

val buffer = ByteBuffer.allocate(256)
buffer.putShort(3)
val records: List[SinkRecord] = List(
new SinkRecord(
TopicName,
1,
null,
buffer.array(),
null,
"value".getBytes,
100,
),
new SinkRecord(
TopicName,
1,
null,
generateOffsetKey("group1", "topic1", 1),
null,
generateOffsetDetails(456L),
100,
),
)
task.put(records.asJava)

listBucketPath(BucketName, "group1/topic1").size should be(1)
val array = remoteFileAsBytes(BucketName, "group1/topic1/1")
ByteBuffer.wrap(array).getLong should be(456L)
task.close(Seq(new TopicPartition(TopicName, 1)).asJava)
}
def createTask(context: SinkTaskContext, props: util.Map[String, String]): S3SinkTask = {
reset(context)
val task: S3SinkTask = new S3SinkTask()
task.initialize(context)
task.start(props)
task
}

}
@@ -81,7 +81,7 @@ trait S3ProxyContainerTest extends AnyFlatSpec with ForAllTestContainer with Laz
helperOpt = Some(new RemoteFileHelper(sI))

logger.debug("Creating test bucket")
-createTestBucket() match {
+createTestBucket(BucketName) match {
case Left(err) =>
logger.error("Failed to create test bucket.", err)
fail("Failed to create test bucket.", err)
@@ -108,20 +108,20 @@

after {
if (cleanUpEnabled) {
-clearTestBucket()
+clearTestBucket(BucketName)
setUpTestData()
}
}

-def createTestBucket(): Either[Throwable, Unit] =
+def createTestBucket(bucketName: String): Either[Throwable, Unit] =
// It is fine if it already exists
-Try(s3Client.createBucket(CreateBucketRequest.builder().bucket(BucketName).build())).toEither.map(_ => ())
+Try(s3Client.createBucket(CreateBucketRequest.builder().bucket(bucketName).build())).toEither.map(_ => ())

-def clearTestBucket(): Either[Throwable, Unit] =
+def clearTestBucket(bucketName: String): Either[Throwable, Unit] =
Try {

val toDeleteArray = helper
.listBucketPath(BucketName, "")
.listBucketPath(bucketName, "")
.map(ObjectIdentifier.builder().key(_).build())
val delete = Delete.builder().objects(toDeleteArray: _*).build
s3Client.deleteObjects(DeleteObjectsRequest.builder().bucket(BucketName).delete(delete).build())
30 changes: 30 additions & 0 deletions kafka-connect-aws-s3/src/main/resources/aws-s3-cg-sink-ascii.txt
@@ -0,0 +1,30 @@

████████▀▀▀▀▀███████████████████████████████████████████████████████████████████
█████▀ ▀████████████████████████████████████████████████████████████████
███▀ ▄█████▄ ▀██████████████████████████████████████████████████████████████
███ ▄███████▄ ██████ █████▌ █▌ ████ ███ ▄▄ ██ ███ ▄▄ ███
███ █████████ ██████ █████▌ ██████▌ ▀██ ██ ██████ ██████ ███████
███ ▀███████▀ ██████ █████▌ ██▌ █▄ █ ███▄▄ ██ ███▄▄ ███
████▄ ▄███████ █████▌ ██████▌ ███ ███████ █ ███████████ ██
█████████ ████████████ ▌ █▌ ████▄ ██▄ ▄██ █▄ ▄███
█████████ ████████████████████████████████████████████████████████████████████
█████████ ▄████████████████████████████████████████████████████████████████████
████████████████████████████████████████████████████████████████████████████████


___ ______ ____ _____
/ \ \ / / ___| / ___|___ /
/ _ \ \ /\ / /\___ \ \___ \ |_ \
/ ___ \ V V / ___) | ___) |__) |
/_/___\_\_/\_/ |____/ |____/____/ ____
/ ___|___ _ __ ___ _ _ _ __ ___ ___ _ __ / ___|_ __ ___ _ _ _ __
| | / _ \| '_ \/ __| | | | '_ ` _ \ / _ \ '__| | | _| '__/ _ \| | | | '_ \
| |__| (_) | | | \__ \ |_| | | | | | | __/ | | |_| | | | (_) | |_| | |_) |
\____\___/|_| |_|___/\__,_|_| |_| |_|\___|_| \____|_| \___/ \__,_| .__/
____ _ _ ____ _ |_|
/ ___|(_)_ __ | | __ / ___|___ _ __ _ __ ___ ___| |_ ___ _ __
\___ \| | '_ \| |/ / | | / _ \| '_ \| '_ \ / _ \/ __| __/ _ \| '__|
___) | | | | | < | |__| (_) | | | | | | | __/ (__| || (_) | |
|____/|_|_| |_|_|\_\ \____\___/|_| |_|_| |_|\___|\___|\__\___/|_|


