diff --git a/airbyte-cdk/java/airbyte-cdk/README.md b/airbyte-cdk/java/airbyte-cdk/README.md index 1387a16e9f2b..80daa37b1fff 100644 --- a/airbyte-cdk/java/airbyte-cdk/README.md +++ b/airbyte-cdk/java/airbyte-cdk/README.md @@ -174,6 +174,7 @@ corresponds to that version. | Version | Date | Pull Request | Subject | |:-----------|:-----------| :--------------------------------------------------------- |:---------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.41.2 | 2024-07-12 | [\#40567](https://github.com/airbytehq/airbyte/pull/40567) | Fix BaseSqlGenerator test case (generation_id support); update minimum platform version for refreshes support. | | 0.41.1 | 2024-07-11 | [\#41212](https://github.com/airbytehq/airbyte/pull/41212) | Improve debezium logging. | | 0.41.0 | 2024-07-11 | [\#38240](https://github.com/airbytehq/airbyte/pull/38240) | Sources : Changes in CDC interfaces to support WASS algorithm | | 0.40.11 | 2024-07-08 | [\#41041](https://github.com/airbytehq/airbyte/pull/41041) | Destinations: Fix truncate refreshes incorrectly discarding data if successful attempt had 0 records | diff --git a/airbyte-cdk/java/airbyte-cdk/core/src/main/resources/version.properties b/airbyte-cdk/java/airbyte-cdk/core/src/main/resources/version.properties index 1cdaa4e6d53a..7069f23b4197 100644 --- a/airbyte-cdk/java/airbyte-cdk/core/src/main/resources/version.properties +++ b/airbyte-cdk/java/airbyte-cdk/core/src/main/resources/version.properties @@ -1 +1 @@ -version=0.41.1 \ No newline at end of file +version=0.41.2 diff --git a/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/main/kotlin/io/airbyte/integrations/base/destination/typing_deduping/CatalogParser.kt b/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/main/kotlin/io/airbyte/integrations/base/destination/typing_deduping/CatalogParser.kt index aa16052c071f..97c6dd0857d4 100644 --- a/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/main/kotlin/io/airbyte/integrations/base/destination/typing_deduping/CatalogParser.kt +++ b/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/main/kotlin/io/airbyte/integrations/base/destination/typing_deduping/CatalogParser.kt @@ -137,7 +137,7 @@ constructor( fun toStreamConfig(stream: ConfiguredAirbyteStream): StreamConfig { if (stream.generationId == null || stream.minimumGenerationId == null) { throw ConfigErrorException( - "You must upgrade your platform version to use this connector version. Either downgrade your connector or upgrade platform to 0.63.0" + "You must upgrade your platform version to use this connector version. 
Either downgrade your connector or upgrade platform to 0.63.7" ) } if ( diff --git a/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/testFixtures/kotlin/io/airbyte/integrations/base/destination/typing_deduping/BaseSqlGeneratorIntegrationTest.kt b/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/testFixtures/kotlin/io/airbyte/integrations/base/destination/typing_deduping/BaseSqlGeneratorIntegrationTest.kt index ea4626e29b68..1494d8bdf58a 100644 --- a/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/testFixtures/kotlin/io/airbyte/integrations/base/destination/typing_deduping/BaseSqlGeneratorIntegrationTest.kt +++ b/airbyte-cdk/java/airbyte-cdk/typing-deduping/src/testFixtures/kotlin/io/airbyte/integrations/base/destination/typing_deduping/BaseSqlGeneratorIntegrationTest.kt @@ -1814,6 +1814,9 @@ abstract class BaseSqlGeneratorIntegrationTest = jdbcDatabase.queryJsons(sql) + private fun toJdbcTypeName(airbyteProtocolType: AirbyteProtocolType): String { return when (airbyteProtocolType) { AirbyteProtocolType.STRING -> "varchar" diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/RedshiftS3StagingStorageOperationTest.kt b/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/RedshiftS3StagingStorageOperationTest.kt new file mode 100644 index 000000000000..63c22a169cba --- /dev/null +++ b/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/RedshiftS3StagingStorageOperationTest.kt @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2024 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.integrations.destination.redshift + +import com.amazon.redshift.util.RedshiftException +import com.fasterxml.jackson.databind.JsonNode +import io.airbyte.cdk.db.jdbc.JdbcUtils +import io.airbyte.cdk.integrations.base.JavaBaseConstants +import io.airbyte.cdk.integrations.destination.async.model.PartialAirbyteMessage +import io.airbyte.cdk.integrations.destination.async.model.PartialAirbyteRecordMessage +import io.airbyte.cdk.integrations.destination.s3.FileUploadFormat +import io.airbyte.cdk.integrations.destination.s3.S3DestinationConfig +import io.airbyte.cdk.integrations.destination.s3.S3StorageOperations +import io.airbyte.cdk.integrations.destination.staging.StagingSerializedBufferFactory +import io.airbyte.commons.json.Jsons +import io.airbyte.commons.string.Strings +import io.airbyte.integrations.base.destination.operation.AbstractStreamOperation.Companion.TMP_TABLE_SUFFIX +import io.airbyte.integrations.base.destination.typing_deduping.StreamConfig +import io.airbyte.integrations.base.destination.typing_deduping.StreamId +import io.airbyte.integrations.destination.redshift.operation.RedshiftStagingStorageOperation +import io.airbyte.integrations.destination.redshift.typing_deduping.RedshiftDestinationHandler +import io.airbyte.integrations.destination.redshift.typing_deduping.RedshiftSqlGenerator +import io.airbyte.integrations.destination.redshift.util.RedshiftUtil +import io.airbyte.protocol.models.v0.AirbyteMessage.Type +import io.airbyte.protocol.models.v0.AirbyteRecordMessageMeta +import io.airbyte.protocol.models.v0.DestinationSyncMode +import java.nio.file.Files +import java.nio.file.Path +import java.util.Optional +import kotlin.test.assertEquals +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import 
org.junit.jupiter.api.assertThrows +import org.junit.jupiter.api.parallel.Execution +import org.junit.jupiter.api.parallel.ExecutionMode + +@Execution(ExecutionMode.CONCURRENT) +class RedshiftS3StagingStorageOperationTest { + private val randomString = Strings.addRandomSuffix("", "", 10) + private val streamId = + StreamId( + finalNamespace = "final_namespace_$randomString", + finalName = "final_name_$randomString", + rawNamespace = "raw_namespace_$randomString", + rawName = "raw_name_$randomString", + originalNamespace = "original_namespace_$randomString", + originalName = "original_name_$randomString", + ) + private val streamConfig = + StreamConfig( + streamId, + DestinationSyncMode.APPEND, + emptyList(), + Optional.empty(), + LinkedHashMap(), + GENERATION_ID, + 0, + SYNC_ID, + ) + private val storageOperation = + RedshiftStagingStorageOperation( + s3Config, + keepStagingFiles = false, + s3StorageOperations, + RedshiftSqlGenerator(RedshiftSQLNameTransformer(), config), + RedshiftDestinationHandler(databaseName, jdbcDatabase, streamId.rawNamespace) + ) + + @BeforeEach + fun setup() { + jdbcDatabase.execute("CREATE SCHEMA ${streamId.rawNamespace}") + } + + @AfterEach + fun teardown() { + jdbcDatabase.execute("DROP SCHEMA ${streamId.rawNamespace} CASCADE") + } + + @Test + fun testTransferStage() { + storageOperation.prepareStage(streamId, "") + storageOperation.prepareStage(streamId, TMP_TABLE_SUFFIX) + // Table is currently empty, so expect null generation. + assertEquals(null, storageOperation.getStageGeneration(streamId, TMP_TABLE_SUFFIX)) + + // Write one record to the real raw table + writeRecords(suffix = "", record(1)) + assertEquals( + listOf("""{"record_number":1}"""), + // We write the raw data as a string column, not a JSON column, so use asText(). + dumpRawRecords("").map { it["_airbyte_data"].asText() }, + ) + + // And write one record to the temp final table + writeRecords(suffix = TMP_TABLE_SUFFIX, record(2)) + assertEquals( + listOf("""{"record_number":2}"""), + dumpRawRecords(TMP_TABLE_SUFFIX).map { it["_airbyte_data"].asText() }, + ) + assertEquals(GENERATION_ID, storageOperation.getStageGeneration(streamId, TMP_TABLE_SUFFIX)) + + // If we transfer the records, we should end up with 2 records in the real raw table. + storageOperation.transferFromTempStage(streamId, TMP_TABLE_SUFFIX) + assertEquals( + listOf( + """{"record_number":1}""", + """{"record_number":2}""", + ), + dumpRawRecords("") + .sortedBy { + Jsons.deserialize(it["_airbyte_data"].asText())["record_number"].asLong() + } + .map { it["_airbyte_data"].asText() }, + ) + + // After transferring the records to the real table, the temp table should no longer exist. + assertEquals( + """ERROR: relation "${streamId.rawNamespace}.${streamId.rawName}$TMP_TABLE_SUFFIX" does not exist""", + assertThrows { dumpRawRecords(TMP_TABLE_SUFFIX) }.message, + ) + } + + @Test + fun testOverwriteStage() { + // If we then create another temp raw table and _overwrite_ the real raw table, + // we should end up with a single raw record. 
+ storageOperation.prepareStage(streamId, "") + storageOperation.prepareStage(streamId, TMP_TABLE_SUFFIX) + writeRecords(suffix = "", record(3)) + writeRecords(suffix = TMP_TABLE_SUFFIX, record(4)) + + storageOperation.overwriteStage(streamId, TMP_TABLE_SUFFIX) + + assertEquals( + listOf("""{"record_number":4}"""), + dumpRawRecords("").map { it["_airbyte_data"].asText() }, + ) + assertEquals( + """ERROR: relation "${streamId.rawNamespace}.${streamId.rawName}$TMP_TABLE_SUFFIX" does not exist""", + assertThrows { dumpRawRecords(TMP_TABLE_SUFFIX) }.message, + ) + } + + private fun dumpRawRecords(suffix: String): List { + return jdbcDatabase.queryJsons( + "SELECT * FROM ${streamId.rawNamespace}.${streamId.rawName}$suffix" + ) + } + + private fun record(recordNumber: Int): PartialAirbyteMessage { + val serializedData = """{"record_number":$recordNumber}""" + return PartialAirbyteMessage() + .withType(Type.RECORD) + .withSerialized(serializedData) + .withRecord( + PartialAirbyteRecordMessage() + .withNamespace(streamId.originalNamespace) + .withStream(streamId.originalName) + .withEmittedAt(10_000) + .withMeta( + AirbyteRecordMessageMeta() + .withChanges(emptyList()) + .withAdditionalProperty( + JavaBaseConstants.AIRBYTE_META_SYNC_ID_KEY, + SYNC_ID, + ), + ) + .withData(Jsons.deserialize(serializedData)), + ) + } + + /** + * Utility method to create the SerializableBuffer, write records into it, and then push that + * buffer into [storageOperation]. + */ + private fun writeRecords(suffix: String, vararg records: PartialAirbyteMessage) { + val writeBuffer = + StagingSerializedBufferFactory.initializeBuffer( + FileUploadFormat.CSV, + JavaBaseConstants.DestinationColumns.V2_WITH_GENERATION + ) + + writeBuffer.use { + records.forEach { record: PartialAirbyteMessage -> + it.accept( + record.serialized!!, + Jsons.serialize(record.record!!.meta), + GENERATION_ID, + record.record!!.emittedAt + ) + } + it.flush() + storageOperation.writeToStage(streamConfig, suffix, writeBuffer) + } + } + + companion object { + private val config = + Jsons.deserialize(Files.readString(Path.of("secrets/1s1t_config_staging.json"))) + private val s3Config = + S3DestinationConfig.getS3DestinationConfig(RedshiftUtil.findS3Options(config)) + private val s3StorageOperations = + S3StorageOperations(RedshiftSQLNameTransformer(), s3Config.getS3Client(), s3Config) + private val jdbcDatabase = + RedshiftDestination().run { + val dataSource = getDataSource(config) + getDatabase(dataSource) + } + private val databaseName = config[JdbcUtils.DATABASE_KEY].asText() + + private const val SYNC_ID = 12L + private const val GENERATION_ID = 42L + } +} diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/typing_deduping/AbstractRedshiftTypingDedupingTest.kt b/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/typing_deduping/AbstractRedshiftTypingDedupingTest.kt index 6f745e418f79..82441f60612c 100644 --- a/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/typing_deduping/AbstractRedshiftTypingDedupingTest.kt +++ b/airbyte-integrations/connectors/destination-redshift/src/test-integration/kotlin/io/airbyte/integrations/destination/redshift/typing_deduping/AbstractRedshiftTypingDedupingTest.kt @@ -11,7 +11,6 @@ import io.airbyte.cdk.integrations.base.JavaBaseConstants import 
io.airbyte.cdk.integrations.standardtest.destination.typing_deduping.JdbcTypingDedupingTest import io.airbyte.commons.json.Jsons.deserialize import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest -import io.airbyte.integrations.base.destination.typing_deduping.BaseTypingDedupingTest.Companion import io.airbyte.integrations.base.destination.typing_deduping.SqlGenerator import io.airbyte.integrations.destination.redshift.RedshiftDestination import io.airbyte.integrations.destination.redshift.RedshiftSQLNameTransformer @@ -236,6 +235,9 @@ abstract class AbstractRedshiftTypingDedupingTest : JdbcTypingDedupingTest() { catalog, messages1, "airbyte/destination-redshift:3.1.1", + // Old connector version can't handle TRACE messages; disable the + // stream status message + streamStatus = null, ) // Second sync @@ -286,15 +288,16 @@ abstract class AbstractRedshiftTypingDedupingTest : JdbcTypingDedupingTest() { @Test fun testGenerationIdMigrationForOverwrite() { - val catalog = + // First sync + val catalog1 = ConfiguredAirbyteCatalog() .withStreams( listOf( ConfiguredAirbyteStream() .withSyncMode(SyncMode.FULL_REFRESH) .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) - .withSyncId(42L) - .withGenerationId(43L) + .withSyncId(41L) + .withGenerationId(42L) .withMinimumGenerationId(0L) .withStream( AirbyteStream() @@ -304,14 +307,37 @@ abstract class AbstractRedshiftTypingDedupingTest : JdbcTypingDedupingTest() { ), ), ) - - // First sync val messages1 = readMessages("dat/sync1_messages.jsonl") - runSync(catalog, messages1, "airbyte/destination-redshift:3.1.1") + runSync( + catalog1, + messages1, + "airbyte/destination-redshift:3.1.1", + // Old connector version can't handle TRACE messages; disable the + // stream status message + streamStatus = null, + ) // Second sync + val catalog2 = + ConfiguredAirbyteCatalog() + .withStreams( + listOf( + ConfiguredAirbyteStream() + .withSyncMode(SyncMode.FULL_REFRESH) + .withDestinationSyncMode(DestinationSyncMode.OVERWRITE) + .withSyncId(42L) + .withGenerationId(43L) + .withMinimumGenerationId(43L) + .withStream( + AirbyteStream() + .withNamespace(streamNamespace) + .withName(streamName) + .withJsonSchema(BaseTypingDedupingTest.Companion.SCHEMA), + ), + ), + ) val messages2 = readMessages("dat/sync2_messages.jsonl") - runSync(catalog, messages2) + runSync(catalog2, messages2) val expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_overwrite_raw.jsonl") val expectedFinalRecords2 = diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_append_with_new_gen_id_final.jsonl b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_append_with_new_gen_id_final.jsonl new file mode 100644 index 000000000000..d14394d3eccb --- /dev/null +++ b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_append_with_new_gen_id_final.jsonl @@ -0,0 +1,9 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00.000000Z", "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43, "id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00.000000Z", "name": 
"Alice", "address": {"city": "Los Angeles", "state": "CA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43, "id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00.000000Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43, "id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00.000000Z", "name": "Charlie", "age": 42, "registration_date": "2023-12-23"} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43, "id1": 3, "id2": 200, "updated_at": "2000-01-01T00:04:00.000000Z", "name": "a\bb\fc\nd\re\tf`~!@#$%^&*()_+-=[]\\{}|'\",./<>?"} + +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00.000000Z", "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00.000000Z", "name": "Bob", "address": {"city": "New York", "state": "NY"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00.000000Z", "_ab_cdc_deleted_at": "1970-01-01T00:00:00.000000Z"} diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_final.jsonl b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_final.jsonl new file mode 100644 index 000000000000..bc2849224699 --- /dev/null +++ b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_final.jsonl @@ -0,0 +1,3 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00.000000Z", "name": "Alice", "address": {"city": "Seattle", "state": "WA"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00.000000Z", "name": "Bob", "address": {"city": "New York", "state": "NY"}} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44, "id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00.000000Z", "_ab_cdc_deleted_at": "1970-01-01T00:00:00.000000Z"} diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_raw.jsonl b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_raw.jsonl new file mode 100644 index 000000000000..f2e286786eaa --- /dev/null +++ 
b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_fullrefresh_overwrite_with_new_gen_id_raw.jsonl @@ -0,0 +1,3 @@ +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01T00:00:00Z"}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} diff --git a/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_with_new_gen_id_raw.jsonl b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_with_new_gen_id_raw.jsonl new file mode 100644 index 000000000000..023a15b98ac9 --- /dev/null +++ b/airbyte-integrations/connectors/destination-redshift/src/test-integration/resources/dat/sync2_expectedrecords_with_new_gen_id_raw.jsonl @@ -0,0 +1,10 @@ +// We keep the records from the first sync +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "San Francisco", "state": "CA"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-01T00:01:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Los Angeles", "state": "CA"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-01T00:02:00Z", "name": "Bob", "address": {"city": "Boston", "state": "MA"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_data": {"id1": 2, "id2": 200, "updated_at": "2000-01-01T00:03:00Z", "name": "Charlie", "age": 42, "registration_date": "2023-12-23"}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43} +{"_airbyte_extracted_at": "1970-01-01T00:00:01.000000Z", "_airbyte_data": {"id1": 3, "id2": 200, "updated_at": "2000-01-01T00:04:00Z", "name": "a\bb\fc\nd\re\tf`~!@#$%^&*()_+-=[]\\{}|'\",./<>?"}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 43} +// And append the records from the second sync +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 200, "updated_at": "2000-01-02T00:00:00Z", "_ab_cdc_deleted_at": null, "name": "Alice", "address": {"city": "Seattle", "state": "WA"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:00:00Z", 
"_ab_cdc_deleted_at": null, "name": "Bob", "address": {"city": "New York", "state": "NY"}}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} +{"_airbyte_extracted_at": "1970-01-01T00:00:02.000000Z", "_airbyte_data": {"id1": 1, "id2": 201, "updated_at": "2000-01-02T00:01:00Z", "_ab_cdc_deleted_at": "1970-01-01T00:00:00Z"}, "_airbyte_meta": {"changes":[],"sync_id":42}, "_airbyte_generation_id": 44} diff --git a/airbyte-integrations/connectors/destination-snowflake/build.gradle b/airbyte-integrations/connectors/destination-snowflake/build.gradle index c08ac313273e..96441157c9f5 100644 --- a/airbyte-integrations/connectors/destination-snowflake/build.gradle +++ b/airbyte-integrations/connectors/destination-snowflake/build.gradle @@ -3,7 +3,7 @@ plugins { } airbyteJavaConnector { - cdkVersionRequired = '0.40.11' + cdkVersionRequired = '0.41.2' features = ['db-destinations', 's3-destinations', 'typing-deduping'] useLocalCdk = false } diff --git a/airbyte-integrations/connectors/destination-snowflake/metadata.yaml b/airbyte-integrations/connectors/destination-snowflake/metadata.yaml index a7e518ecb8af..079cf49d34cd 100644 --- a/airbyte-integrations/connectors/destination-snowflake/metadata.yaml +++ b/airbyte-integrations/connectors/destination-snowflake/metadata.yaml @@ -5,7 +5,7 @@ data: connectorSubtype: database connectorType: destination definitionId: 424892c4-daac-4491-b35d-c6688ba547ba - dockerImageTag: 3.11.1 + dockerImageTag: 3.11.2 dockerRepository: airbyte/destination-snowflake documentationUrl: https://docs.airbyte.com/integrations/destinations/snowflake githubIssueLabel: destination-snowflake diff --git a/airbyte-integrations/connectors/source-facebook-marketing/metadata.yaml b/airbyte-integrations/connectors/source-facebook-marketing/metadata.yaml index e9078f4ad485..7c2e82271ec8 100644 --- a/airbyte-integrations/connectors/source-facebook-marketing/metadata.yaml +++ b/airbyte-integrations/connectors/source-facebook-marketing/metadata.yaml @@ -25,6 +25,7 @@ data: registries: cloud: enabled: true + dockerImageTag: 3.3.10 oss: enabled: true releaseStage: generally_available diff --git a/docs/connector-development/config-based/low-code-cdk-overview.md b/docs/connector-development/config-based/low-code-cdk-overview.md index f93bbc675f44..3a772d978b8d 100644 --- a/docs/connector-development/config-based/low-code-cdk-overview.md +++ b/docs/connector-development/config-based/low-code-cdk-overview.md @@ -89,7 +89,7 @@ We recommend iterating on this YAML file is via the [connector builder UI](https ## Configuring the YAML file -The low-code framework involves editing a boilerplate YAML file. The general structure of the YAML file is as follows: +The low-code framework involves editing the Connector Manifest, which is a boilerplate YAML file. The general structure of the YAML file is as follows: ``` version: "0.1.0" diff --git a/docs/contributing-to-airbyte/writing-docs.md b/docs/contributing-to-airbyte/writing-docs.md index b1ef91436e7a..177b8417f0b8 100644 --- a/docs/contributing-to-airbyte/writing-docs.md +++ b/docs/contributing-to-airbyte/writing-docs.md @@ -56,8 +56,6 @@ To make complex changes or edit multiple files, edit the files on your local mac cd airbyte ``` - While cloning on Windows, you might encounter errors about long filenames. Refer to the instructions [here](../deploying-airbyte/quickstart#handling-long-filename-error) to correct it. - 3. 
Create a feature branch from which to make changes: ```bash diff --git a/docs/deploying-airbyte/deploying-airbyte.md b/docs/deploying-airbyte/deploying-airbyte.md new file mode 100644 index 000000000000..4d09aaa0e57d --- /dev/null +++ b/docs/deploying-airbyte/deploying-airbyte.md @@ -0,0 +1,185 @@ + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Deploying Airbyte + +Airbyte is a sophisticated data integration platform that enables you to handle large amounts of data movement. +To quickly deploy Airbyte on your local machine, visit the [Quickstart](../using-airbyte/getting-started/oss-quickstart) guide. +If setting up an Airbyte server does not fit your use case (e.g. you're using Jupyter Notebooks or iterating on an early prototype for your project), you may find the [PyAirbyte](../using-airbyte/pyairbyte/getting-started) documentation useful. + +:::tip +Enterprise Customers should follow the steps outlined in our docs on [Airbyte Self-Managed Enterprise](../enterprise-setup/README.md) and the associated [implementation guide](../enterprise-setup/implementation-guide.md). ::: + +## Understanding the Airbyte Deployment + +Airbyte is built to be deployed into a Kubernetes cluster. +You can deploy it on a cloud provider such as AWS, GCP, or Azure, on a single node such as an EC2 VM, or even locally on your computer. + +We highly recommend deploying Airbyte using Helm and the documented Helm chart values. + +Helm is a Kubernetes package manager for automating deployment and management of complex applications with microservices on Kubernetes. Refer to our [Helm Chart Usage Guide](https://airbytehq.github.io/helm-charts/) for more information about how to get started. + + +The [Infrastructure](infrastructure/aws) section describes Airbyte's recommended cloud infrastructure for each supported platform. Keep in mind that these guides are meant to assist you, but you are not required to follow them. Airbyte is designed to be as flexible as possible in order to fit into your existing infrastructure. + +## Adding the Helm Repository + +Charts are stored in `helm-repo`, so you do not need to clone the repo each time you deploy the chart. + +To add the remote Helm repo: +1. Run: `helm repo add airbyte https://airbytehq.github.io/helm-charts`. In this example, `airbyte` is used as the name of the repository that will be indexed locally. + +2. After adding the repo, index it by running `helm repo update`. + +3. You can now browse all charts uploaded to the repository by running `helm search repo airbyte`. + +An example of the chart output: + +```text +NAME CHART VERSION APP VERSION DESCRIPTION +airbyte/airbyte 0.290.0 0.63.6 Helm chart to deploy airbyte +airbyte/airbyte-api-server 0.290.0 0.63.6 Helm chart to deploy airbyte-api-server +airbyte/airbyte-bootloader 0.290.0 0.63.6 Helm chart to deploy airbyte-bootloader +airbyte/airbyte-cron 0.40.37 0.40.17 Helm chart to deploy airbyte-cron +airbyte/airbyte-workload-api-server 0.49.18 0.50.33 Helm chart to deploy airbyte-api-server +airbyte/connector-builder-server 0.290.0 0.63.6 Helm chart to deploy airbyte-connector-builder-...
+airbyte/cron 0.290.0 0.63.6 Helm chart to deploy airbyte-cron +airbyte/keycloak 0.290.0 0.63.6 Helm chart to deploy airbyte-keycloak +airbyte/keycloak-setup 0.290.0 0.63.6 Helm chart to deploy airbyte-keycloak-setup +airbyte/metrics 0.290.0 0.63.6 Helm chart to deploy airbyte-metrics +airbyte/pod-sweeper 0.290.0 0.63.6 Helm chart to deploy airbyte-pod-sweeper +airbyte/server 0.290.0 0.63.6 Helm chart to deploy airbyte-server +airbyte/temporal 0.290.0 0.63.6 Helm chart to deploy airbyte-temporal +airbyte/webapp 0.290.0 0.63.6 Helm chart to deploy airbyte-webapp +airbyte/worker 0.290.0 0.63.6 Helm chart to deploy airbyte-worker +airbyte/workload-api 0.50.3 0.50.35 Helm chart to deploy the workload-api service +airbyte/workload-api-server 0.290.0 0.63.6 Helm chart to deploy the workload-api service +airbyte/workload-launcher 0.290.0 0.63.6 Helm chart to deploy airbyte-workload-launcher +``` + + +## Creating a Namespace for Airbyte + +While it is not strictly necessary to isolate the Airbyte installation into its own namespace, it is good practice and recommended as a part of the installation. +This documentation assumes that you chose the name `airbyte` for the namespace, but you may choose a different name if required. + +To create a namespace run the following + +```sh +kubectl create namespace airbyte +``` + + +## Preconfiguring Kubernetes Secrets + +Deploying Airbyte requires specifying a number of sensitive values. These can be API keys, usernames and passwords, etc. +In order to protect these sensitive values, the Helm Chart assumes that these values are pre-configured and stored in a Kubernetes Secret *before* the Helm installation begins. Each [integration](#integrations) will provide the Secret values that are required for the specific integration. + +While you can set the name of the secret to whatever you prefer, you will need to set that name in various places in your values.yaml file. For this reason we suggest that you keep the name of `airbyte-config-secrets` unless you have a reason to change it. + + + + +You can apply your yaml to the cluster with `kubectl apply -f secrets.yaml -n airbyte` to create the secrets. + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + # Examples + key-1: "value-1" + key-2: "value-2" +``` + + + + +You can also use `kubectl` to create the secret directly from the CLI: + +```sh +kubectl create secret generic airbyte-config-secrets \ + --from-literal=key-1='value-1' \ + --from-literal=key2='value-2' \ + --namespace airbyte +``` + + + + +## Creating a values.yaml override file + +To configure your installation of Airbyte, you will need to override specific parts of the Helm Chart. To do this you should create a new file called `values.yaml` somewhere that is accessible during the installation process. +The documentation has been created to "build up" a values.yaml, so there is no need to copy the whole of the Chart values.yaml. You only need to provide the specific overrides. + +Each [Integration](#integrations) will provide a section of the specific values that you should override and provide examples of what the values should look like. An example `values.yaml` file may look like the following: + +```yaml +global: + airbyteUrl: https://airbyte.company.example + storage: + type: "S3" + bucket: ## S3 bucket names that you've created. We recommend storing the following all in one bucket. 
+ log: airbyte-bucket + state: airbyte-bucket + workloadOutput: airbyte-bucket + s3: + region: "us-east-1" + authenticationType: "instanceProfile" + + secretsManager: + type: awsSecretManager + awsSecretManager: + region: "us-east-1" + authenticationType: "instanceProfile" +``` + +## Integrations + +The Airbyte platform is built to integrate with your existing cloud infrastructure. You can configure various components of the platform to suit your needs. This includes an object store, such as S3 or GCS, for storing logs and state, a database for externalizing state, and a secrets manager for keeping your secrets secure. + +Each of these integrations can be configured to suit your specific needs and is described in the sections below. Each integration's page explains why it's useful to configure and how to configure it. + +- [State and Logging Storage](./integrations/storage) +- [Secret Management](./integrations/secrets) +- [External Database](./integrations/database) +- [Ingress](./integrations/ingress) + + +## Installing Airbyte + +After you have applied your secret values to the cluster and filled out a values.yaml file appropriate for your specific configuration, you can begin the Helm install. To do this, make sure that you have the [Helm Client](https://helm.sh/docs/intro/install/) installed and on your path. +Then you can run: + +```sh +helm install \ +airbyte \ +airbyte/airbyte \ +--namespace airbyte \ +--values ./values.yaml +``` + +After the installation has completed, you can configure your [Ingress](./integrations/ingress) by following the directions for your specific Ingress provider. + + + diff --git a/docs/deploying-airbyte/infrastructure/aws.md b/docs/deploying-airbyte/infrastructure/aws.md new file mode 100644 index 000000000000..d5090d6cd4f2 --- /dev/null +++ b/docs/deploying-airbyte/infrastructure/aws.md @@ -0,0 +1,148 @@ +# Amazon Web Services (AWS) + +Airbyte supports Amazon Web Services as a Cloud Provider. There are several ways that you can deploy Airbyte using AWS. + +You can use the AWS managed Kubernetes solution EKS, run `abctl` on an EC2 instance, or use a Kubernetes distribution +that has been deployed on EC2 instances. + +## Policies + +You will need to create an AWS Role and associate that Role with either an AWS User when using Access Credentials, or an +Instance Profile or Kubernetes Service Account when using IAM Roles for Service Accounts. That Role will need the +following policies to integrate with S3 and AWS Secrets Manager, respectively.
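As a rough sketch, once you have saved the two policy documents below to local JSON files, you can create them and attach them to your Role with the AWS CLI; the role name, policy names, and file paths below are placeholders:

```sh
# Hypothetical names and paths, shown for illustration only.
aws iam create-policy \
  --policy-name airbyte-s3-access \
  --policy-document file://airbyte-s3-policy.json

aws iam create-policy \
  --policy-name airbyte-secrets-manager-access \
  --policy-document file://airbyte-secrets-manager-policy.json

# Attach both policies to the Role assumed by Airbyte (the instance profile
# role or the role used with IAM Roles for Service Accounts).
aws iam attach-role-policy \
  --role-name airbyte-role \
  --policy-arn arn:aws:iam::<ACCOUNT_ID>:policy/airbyte-s3-access

aws iam attach-role-policy \
  --role-name airbyte-role \
  --policy-arn arn:aws:iam::<ACCOUNT_ID>:policy/airbyte-secrets-manager-access
```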
+ +### AWS S3 Policy + +The [following policies](https://docs.aws.amazon.com/AmazonS3/latest/userguide/example-policies-s3.html#iam-policy-ex0) allow the cluster to communicate with S3 storage: + +```yaml +{ + "Version": "2012-10-17", + "Statement": + [ + { "Effect": "Allow", "Action": "s3:ListAllMyBuckets", "Resource": "*" }, + { + "Effect": "Allow", + "Action": ["s3:ListBucket", "s3:GetBucketLocation"], + "Resource": "arn:aws:s3:::YOUR-S3-BUCKET-NAME", + }, + { + "Effect": "Allow", + "Action": + [ + "s3:PutObject", + "s3:PutObjectAcl", + "s3:GetObject", + "s3:GetObjectAcl", + "s3:DeleteObject", + ], + "Resource": "arn:aws:s3:::YOUR-S3-BUCKET-NAME/*", + }, + ], +} +``` + +### AWS Secrets Manager Policy + +The [following policies](https://docs.aws.amazon.com/mediaconnect/latest/ug/iam-policy-examples-asm-secrets.html) allow the cluster to communicate with AWS Secrets Manager: + +```yaml +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "secretsmanager:GetSecretValue", + "secretsmanager:CreateSecret", + "secretsmanager:ListSecrets", + "secretsmanager:DescribeSecret", + "secretsmanager:TagResource", + "secretsmanager:UpdateSecret" + ], + "Resource": [ + "*" + ], + "Condition": { + "ForAllValues:StringEquals": { + "secretsmanager:ResourceTag/AirbyteManaged": "true" + } + } + } + ] +} +``` + +## Using an EC2 Instance with abctl + +This guide assumes that you are using the Amazon Linux distribution. However, any distribution that supports a Docker engine should work with `abctl`. Launching and connecting to your EC2 instance is outside the scope of this guide. You can find more information on how to launch and connect to EC2 instances in the [Get started with Amazon EC2](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) documentation from Amazon. + +:::tip +`abctl` runs by default on port 8000. You can change the port by passing the `--port` flag to the `local install` command. Make sure that the security group that you have configured for the EC2 Instance allows traffic in on the port that you deploy Airbyte on. See the [Control traffic to your AWS resources using security groups](https://docs.aws.amazon.com/vpc/latest/userguide/vpc-security-groups.html) documentation for more information. +::: + + +1. Install the Docker engine: + +```shell +sudo yum install -y docker +``` + +2. Add the ec2-user (or your distro's default user) to the docker group: + +```shell +sudo usermod -a -G docker ec2-user +``` + +3. Start and optionally enable (start on boot) the Docker engine: + +```shell +sudo systemctl start docker +sudo systemctl enable docker +``` + +4. Exit the shell and reconnect to the EC2 instance. For example: + +```shell +exit +ssh -i ec2-user-key.pem ec2-user@1.2.3.4 +``` + +5. Download the latest version of abctl and install it in your path: + +```shell +curl -LsfS https://get.airbyte.com | bash - +``` + +6. Run the `abctl` command and install Airbyte: + +```shell +abctl local install +``` + +### Editing the Ingress + +By default `abctl` will install an Nginx Ingress and set the host name to `localhost`. You will need to edit this to +match the host name that you have deployed Airbyte to. To do this, you will need to have the `kubectl` command installed +on your EC2 instance and available on your path. + +If you do not already have the CLI tool kubectl installed, please [follow these instructions to install](https://kubernetes.io/docs/tasks/tools/).
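With kubectl available, you can first list the ingress that `abctl` created. This is a sketch, assuming the default `airbyte-abctl` namespace and the kubeconfig written by `abctl`:

```sh
kubectl get ingress -n airbyte-abctl --kubeconfig ~/.airbyte/abctl/abctl.kubeconfig
```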
+ +Then you can run `kubectl edit ingress -n airbyte-abctl --kubeconfig ~/.airbyte/abctl/abctl.kubeconfig` and edit the `host` +key under the spec.rules section of the Ingress definition. The host should match the FQDN name that you are trying to +host Airbyte at, for example: `airbyte.company.example`. + +## Using an ALB for Ingress + +The recommended method for Cluster Ingress is an AWS ALB. The [Ingress](../integrations/ingress) section of the documentation +shows how to configure the Kubernetes Ingress using the AWS Load Balancer Controller. This assumes that you have already +correctly configured your Cluster with the AWS Load Balancer Controller. This configuration is outside the scope of this +documentation. You can find more information on how to correctly configure an ALB Ingress Controller by reading the official +[Route application and HTTP traffic with Application Load Balancers](https://docs.aws.amazon.com/eks/latest/userguide/alb-ingress.html) +documentation provided by Amazon. + +Once the AWS Load Balancer Controller has been correctly installed the Airbyte installation process will be able to +automatically create an ALB for you. You can combine the ALB with AWS Certificate Manager (ACM) to secure your instance +with TLS. The ACM documentation can be found here: [Getting Started with AWS Certificate Manager](https://aws.amazon.com/certificate-manager/getting-started/). +To use the ACM certificate, you can specify the certificate-arn when creating the Kubernetes Ingress. For more information +see the [Kubernetes Ingress Annotations documentation](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.1/guide/ingress/annotations/#certificate-arn). \ No newline at end of file diff --git a/docs/deploying-airbyte/infrastructure/azure.md b/docs/deploying-airbyte/infrastructure/azure.md new file mode 100644 index 000000000000..e139ffc718c2 --- /dev/null +++ b/docs/deploying-airbyte/infrastructure/azure.md @@ -0,0 +1 @@ +# Microsoft Azure \ No newline at end of file diff --git a/docs/deploying-airbyte/infrastructure/gcp.md b/docs/deploying-airbyte/infrastructure/gcp.md new file mode 100644 index 000000000000..c0e1dc176607 --- /dev/null +++ b/docs/deploying-airbyte/infrastructure/gcp.md @@ -0,0 +1 @@ +# Google Cloud Platform (GCP) \ No newline at end of file diff --git a/docs/deploying-airbyte/integrations/database.md b/docs/deploying-airbyte/integrations/database.md new file mode 100644 index 000000000000..33e5d20d5e4b --- /dev/null +++ b/docs/deploying-airbyte/integrations/database.md @@ -0,0 +1,59 @@ +# External Database + +For production deployments, we recommend using a dedicated database instance for better reliability, and backups (such as AWS RDS or GCP Cloud SQL) instead of the default internal Postgres database (`airbyte/db`) that Airbyte spins up within the Kubernetes cluster. + +The following instructions assume that you've already configured a Postgres instance: + +## Secrets + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + # Database Secrets + ## database-host: ## e.g. database.internla + ## database-port: ## e.g. 5432 + ## database-name: ## e.g. airbyte + ## database-user: ## e.g. airbyte + database-password: ## e.g. password +``` + +## Values + +Add external database details to your `values.yaml` file. This disables the default internal Postgres database (`airbyte/db`), and configures your external Postgres database. 
You can override all of the values below by setting them in the airbyte-config-secrets or set them directly in the `values.yaml` file. **The database password is a special case in that it must be set in the [airbyte-config-secrets](#secrets).** + +```yaml +postgresql: + enabled: false + +global: + database: + # -- Secret name where database credentials are stored + secretName: "" # e.g. "airbyte-config-secrets" + + # -- The database host + host: "" + # -- The key within `secretName` where host is stored + #hostSecretKey: "" # e.g. "database-host" + + # -- The database port + port: "" + # -- The key within `secretName` where port is stored + #portSecretKey: "" # e.g. "database-port" + + # -- The database name + database: "" + # -- The key within `secretName` where the database name is stored + #databaseSecretKey: "" # e.g. "database-name" + + # -- The database user + user: "" # -- The key within `secretName` where the user is stored + #userSecretKey: "" # e.g. "database-user" + + # -- The key within `secretName` where password is stored + passwordSecretKey: "" # e.g."database-password" +``` + diff --git a/docs/deploying-airbyte/integrations/ingress.md b/docs/deploying-airbyte/integrations/ingress.md new file mode 100644 index 000000000000..2696f2ff6cb4 --- /dev/null +++ b/docs/deploying-airbyte/integrations/ingress.md @@ -0,0 +1,113 @@ + + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Ingress + +:::tip +If you are using `abctl` to manage your deployment, a nginx ingress is automatically provided for you. There is no need to provision an additional ingress. +::: + +To access the Airbyte UI, you will need to manually attach an ingress configuration to your deployment. +The following is a simplified definition of an ingress resource you could use for your Airbyte instance: + + + + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: # ingress name, example: airbyte-production + annotations: + ingress.kubernetes.io/ssl-redirect: "false" +spec: + ingressClassName: nginx + rules: + - host: # host, example: airbyte.company.example + http: + paths: + - backend: + service: + # format is ${RELEASE_NAME}-airbyte-webapp-svc + name: airbyte-airbyte-webapp-svc + port: + number: 80 # service port, example: 8080 + path: / + pathType: Prefix + - backend: + service: + # format is ${RELEASE_NAME}-airbyte-keycloak-svc + name: airbyte-airbyte-keycloak-svc + port: + number: 8180 + path: /auth + pathType: Prefix + - backend: + service: + # format is ${RELEASE_NAME}-airbyte--server-svc + name: airbyte-airbyte-server-svc + port: + number: 8001 + path: /api/public + pathType: Prefix +``` + + + +If you intend to use Amazon Application Load Balancer (ALB) for ingress, this ingress definition will be close to what's needed to get up and running: + + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: # ingress name, e.g. airbyte-production + annotations: + # Specifies that the Ingress should use an AWS ALB. + kubernetes.io/ingress.class: "alb" + # Redirects HTTP traffic to HTTPS. + ingress.kubernetes.io/ssl-redirect: "true" + # Creates an internal ALB, which is only accessible within your VPC or through a VPN. + alb.ingress.kubernetes.io/scheme: internal + # Specifies the ARN of the SSL certificate managed by AWS ACM, essential for HTTPS. + alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:us-east-x:xxxxxxxxx:certificate/xxxxxxxxx-xxxxx-xxxx-xxxx-xxxxxxxxxxx + # Sets the idle timeout value for the ALB. 
+ alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=30 + # [If Applicable] Specifies the VPC subnets and security groups for the ALB + # alb.ingress.kubernetes.io/subnets: '' e.g. 'subnet-12345, subnet-67890' + # alb.ingress.kubernetes.io/security-groups: +spec: + rules: + - host: # e.g. airbyte.company.example + http: + paths: + - backend: + service: + name: airbyte-airbyte-webapp-svc + port: + number: 80 + path: / + pathType: Prefix + - backend: + service: + name: airbyte-airbyte-keycloak-svc + port: + number: 8180 + path: /auth + pathType: Prefix + - backend: + service: + # format is ${RELEASE_NAME}-airbyte-server-svc + name: airbyte-airbyte-server-svc + port: + number: 8001 + path: /api/public + pathType: Prefix +``` + +The ALB controller uses a `ServiceAccount` that requires the [following IAM policy](https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json) to be attached. + + + diff --git a/docs/deploying-airbyte/integrations/monitoring.md b/docs/deploying-airbyte/integrations/monitoring.md new file mode 100644 index 000000000000..2ddbe89cb91f --- /dev/null +++ b/docs/deploying-airbyte/integrations/monitoring.md @@ -0,0 +1 @@ +# Monitoring \ No newline at end of file diff --git a/docs/deploying-airbyte/integrations/secrets.md b/docs/deploying-airbyte/integrations/secrets.md new file mode 100644 index 000000000000..1f9a64827429 --- /dev/null +++ b/docs/deploying-airbyte/integrations/secrets.md @@ -0,0 +1,89 @@ + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Secret Management + + +Airbyte's default behavior is to store connector secrets on your configured database. Airbyte recommends storing connector secrets in an external secret manager. The currently supported Secret managers are: AWS Secrets Manager, Google Secrets Manager or Hashicorp Vault. Upon creating a new connector, secrets (e.g. OAuth tokens, database passwords) will be written to and read from the configured Secrets manager. + +## Secrets + + + + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + # AWS Secret Manager + aws-secret-manager-access-key-id: ## e.g. AKIAIOSFODNN7EXAMPLE + aws-secret-manager-secret-access-key: ## e.g. wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY +``` + + + + + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + gcp.json: ## {CREDENTIALS_JSON_BLOB} +``` + + + + +## Values + +Modifing the configuration of connector secret storage will cause all existing connectors to fail. You will need to recreate these connectors to ensure they are reading from the appropriate secret store. + + + + +If authenticating with credentials, ensure you've already created a Kubernetes secret containing both your AWS Secrets Manager access key ID, and secret access key. By default, secrets are expected in the `airbyte-config-secrets` Kubernetes secret, under the `aws-secret-manager-access-key-id` and `aws-secret-manager-secret-access-key` keys. Steps to configure these are in the above [prerequisites](#secrets). + +```yaml +global: + secretsManager: + type: awsSecretManager + awsSecretManager: + region: + authenticationType: credentials ## Use "credentials" or "instanceProfile" + tags: ## Optional - You may add tags to new secrets created by Airbyte. + - key: ## e.g. team + value: ## e.g. deployments + - key: business-unit + value: engineering + kms: ## Optional - ARN for KMS Decryption. 
+``` + +Set `authenticationType` to `instanceProfile` if the compute infrastructure running Airbyte has pre-existing permissions (e.g. IAM role) to read and write from AWS Secrets Manager. + +To decrypt secrets in the secret manager with AWS KMS, configure the `kms` field, and ensure your Kubernetes cluster has pre-existing permissions to read and decrypt secrets. + + + + +Ensure you've already created a Kubernetes secret containing the credentials blob for the service account to be assumed by the cluster. By default, secrets are expected in the `airbyte-config-secrets` Kubernetes secret, under a `gcp.json` file. Steps to configure these are in the above [prerequisites](#secrets). For simplicity, we recommend provisioning a single service account with access to both GCS and GSM. + +```yaml +global: + secretsManager: + type: googleSecretManager + storageSecretName: gcp-cred-secrets + googleSecretManager: + projectId: + credentialsSecretKey: gcp.json +``` + + + + diff --git a/docs/deploying-airbyte/integrations/storage.md b/docs/deploying-airbyte/integrations/storage.md new file mode 100644 index 000000000000..e3f0a9296245 --- /dev/null +++ b/docs/deploying-airbyte/integrations/storage.md @@ -0,0 +1,92 @@ + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# State and Logging Storage + +Airbyte recommends using an object storage solution for such as S3 and GCS for storing [State](../../understanding-airbyte/airbyte-protocol/#state--checkpointing) and [Logging information](../../operator-guides/browsing-output-logs). +You must select which type of blob store that you wish to use. Currently, S3 and GCS are supported. If you are using an S3 compatible solution, use the S3 type and provide an `endpoint` key/value as needed. + +Adding external storage details to your `values.yaml` disables the default internal Minio instance (`airbyte/minio`). While there are three separate buckets presented in the Values section below, Airbyte recommends that you use a single bucket across all three values. + +## Secrets + + + + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + # AWS S3 Secrets + s3-access-key-id: ## e.g. AKIAIOSFODNN7EXAMPLE + s3-secret-access-key: ## e.g. wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + +``` + + + + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: airbyte-config-secrets +type: Opaque +stringData: + gcp.json: ## {CREDENTIALS_JSON_BLOB} +``` + + + + + + +## Values + + + + +Ensure you've already created a Kubernetes secret containing both your S3 access key ID, and secret access key. By default, secrets are expected in the `airbyte-config-secrets` Kubernetes secret, under the `aws-s3-access-key-id` and `aws-s3-secret-access-key` keys. Steps to configure these are in the above [prerequisites](#secrets). + +```yaml +global: + storage: + type: "S3" + storageSecretName: airbyte-config-secrets # Name of your Kubernetes secret. + bucket: ## S3 bucket names that you've created. We recommend storing the following all in one bucket. + log: airbyte-bucket + state: airbyte-bucket + workloadOutput: airbyte-bucket + s3: + region: "" ## e.g. us-east-1 + authenticationType: credentials ## Use "credentials" or "instanceProfile" +``` + +Set `authenticationType` to `instanceProfile` if the compute infrastructure running Airbyte has pre-existing permissions (e.g. IAM role) to read and write from the appropriate buckets. 
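If you are authenticating with credentials rather than an instance profile, one way to create the referenced Kubernetes secret is directly with `kubectl`. This is a sketch only; the key names mirror the Secrets example above, and the values are the AWS documentation's example keys, not real credentials:

```sh
kubectl create secret generic airbyte-config-secrets \
  --from-literal=s3-access-key-id='AKIAIOSFODNN7EXAMPLE' \
  --from-literal=s3-secret-access-key='wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' \
  --namespace airbyte
```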
+ + + + +Ensure you've already created a Kubernetes secret containing the credentials blob for the service account to be assumed by the cluster. By default, secrets are expected in the `airbyte-config-secrets` Kubernetes secret, under a `gcp.json` key. Steps to configure these are in the above [prerequisites](#secrets). + +```yaml +global: + storage: + type: "GCS" + storageSecretName: airbyte-config-secrets + bucket: ## GCS bucket names that you've created. We recommend storing the following all in one bucket. + log: airbyte-bucket + state: airbyte-bucket + workloadOutput: airbyte-bucket + gcs: + projectId: + credentialsPath: /secrets/gcs-log-creds/gcp.json +``` + + + + diff --git a/docs/deploying-airbyte/quickstart.md b/docs/deploying-airbyte/quickstart.md deleted file mode 100644 index b98b2aa96681..000000000000 --- a/docs/deploying-airbyte/quickstart.md +++ /dev/null @@ -1,48 +0,0 @@ -# Airbyte Quickstart - -Airbyte has a single binary tool called `abctl` which can be used to quickly standup Airbyte. - -## Setup & launch Airbyte - -- Install `Docker Desktop` \(see [instructions](https://docs.docker.com/desktop/install/mac-install/)\). -- Download the latest version of `abctl` from the [releases page](https://github.com/airbytehq/abctl/releases) - -:::info -Mac users can use Brew to install the `abctl` command - -```bash -brew tap airbytehq/tap -brew install abctl -``` - -::: - -Then you can run Airbyte with the following command: - -```bash -abctl local install -``` - -- Your browser should open to the Airbyte Application, if it does not visit [http://localhost:8000](http://localhost:8000) -- You will be asked for a username and password. By default, that's username `airbyte` and password `password`. You can set these values through command line flags or environment variables. For example, to set the username and password to `foo` and `bar` respectively, you can run the following command: - -```bash -abctl local install --username foo --password bar - -# Or as Environment Variables -ABCTL_LOCAL_INSTALL_PASSWORD=foo -ABCTL_LOCAL_INSTALL_USERNAME=bar -``` - -- Start moving some data! - -## Troubleshooting - -If you have any questions about the local setup and deployment process, head over to our [Getting Started FAQ](https://github.com/airbytehq/airbyte/discussions/categories/questions) on our Airbyte Forum that answers the following questions and more: - -- How long does it take to set up Airbyte? -- Where can I see my data once I've run a sync? -- Can I set a start time for my sync? - -If you find an issue with the `abctl` command, please report it as a github -issue [here](https://github.com/airbytehq/airbyte/issues) with the type of "🐛 [abctl] Report an issue with the abctl tool". 
diff --git a/docs/integrations/destinations/bigquery.md b/docs/integrations/destinations/bigquery.md index e74411ae99a9..8a094d9ea711 100644 --- a/docs/integrations/destinations/bigquery.md +++ b/docs/integrations/destinations/bigquery.md @@ -223,11 +223,12 @@ tutorials: | Version | Date | Pull Request | Subject | |:--------|:-----------|:-----------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 2.8.3 | 2024-07-12 | [41674](https://github.com/airbytehq/airbyte/pull/41674) | Upgrade to latest CDK | | 2.8.2 | 2024-07-08 | [41041](https://github.com/airbytehq/airbyte/pull/41041) | Fix resume logic in truncate refreshes to prevent data loss | | 2.8.1 | 2024-06-25 | [39379](https://github.com/airbytehq/airbyte/pull/39379) | Removing requirement of a redundant permission bigquery.datasets.create permission | | 2.8.0 | 2024-06-21 | [39904](https://github.com/airbytehq/airbyte/pull/39904) | Convert all production code to kotlin | | 2.7.1 | 2024-06-17 | [39526](https://github.com/airbytehq/airbyte/pull/39526) | Internal code change for improved error reporting in case of source/platform failure (`INCOMPLETE` stream status / empty ConfiguredCatalog). | -| 2.7.0 | 2024-06-17 | [38713](https://github.com/airbytehq/airbyte/pull/38713) | Support for [refreshes](../../operator-guides/refreshes.md) and resumable full refresh. WARNING: You must upgrade to platform 0.63.0 before upgrading to this connector version. | +| 2.7.0 | 2024-06-17 | [38713](https://github.com/airbytehq/airbyte/pull/38713) | Support for [refreshes](../../operator-guides/refreshes.md) and resumable full refresh. WARNING: You must upgrade to platform 0.63.7 before upgrading to this connector version. | | 2.6.3 | 2024-06-10 | [38331](https://github.com/airbytehq/airbyte/pull/38331) | Internal code changes in preparation for future feature release | | 2.6.2 | 2024-06-07 | [38764](https://github.com/airbytehq/airbyte/pull/38764) | Increase message length limit to 50MiB | | 2.6.1 | 2024-05-29 | [38770](https://github.com/airbytehq/airbyte/pull/38770) | Internal code change (switch to CDK artifact) | diff --git a/docs/integrations/destinations/redshift.md b/docs/integrations/destinations/redshift.md index 3404930e527e..2dbef703b9a9 100644 --- a/docs/integrations/destinations/redshift.md +++ b/docs/integrations/destinations/redshift.md @@ -244,6 +244,7 @@ Each stream will be output into its own raw table in Redshift. Each table will c | Version | Date | Pull Request | Subject | |:--------|:-----------|:-----------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 3.3.0 | 2024-07-02 | [40567](https://github.com/airbytehq/airbyte/pull/40567) | Support for [refreshes](../../operator-guides/refreshes.md) and resumable full refresh. WARNING: You must upgrade to platform 0.63.7 before upgrading to this connector version. 
| | 3.2.0 | 2024-07-02 | [40201](https://github.com/airbytehq/airbyte/pull/40201) | Add `_airbyte_generation_id` column, and add `sync_id` to `_airbyte_meta` column | | 3.1.1 | 2024-06-26 | [39008](https://github.com/airbytehq/airbyte/pull/39008) | Internal code changes | | 3.1.0 | 2024-06-26 | [39141](https://github.com/airbytehq/airbyte/pull/39141) | Remove nonfunctional "encrypted staging" option | diff --git a/docs/integrations/destinations/snowflake.md b/docs/integrations/destinations/snowflake.md index b1b0c7768e46..23bc315b71d8 100644 --- a/docs/integrations/destinations/snowflake.md +++ b/docs/integrations/destinations/snowflake.md @@ -268,8 +268,9 @@ desired namespace. | Version | Date | Pull Request | Subject | |:----------------|:-----------|:-----------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 3.11.2 | 2024-07-12 | [\#41674](https://github.com/airbytehq/airbyte/pull/41674) | Upgrade to latest CDK | | 3.11.1 | 2024-07-08 | [\#41041](https://github.com/airbytehq/airbyte/pull/41041) | Fix resume logic in truncate refreshes to prevent data loss | -| 3.11.0 | 2024-06-25 | [\#39473](https://github.com/airbytehq/airbyte/pull/39473) | Support for [refreshes](../../operator-guides/refreshes.md) and resumable full refresh. WARNING: You must upgrade to platform 0.63.0 before upgrading to this connector version. | +| 3.11.0 | 2024-06-25 | [\#39473](https://github.com/airbytehq/airbyte/pull/39473) | Support for [refreshes](../../operator-guides/refreshes.md) and resumable full refresh. WARNING: You must upgrade to platform 0.63.7 before upgrading to this connector version. | | 3.10.1 | 2024-06-11 | [\#39399](https://github.com/airbytehq/airbyte/pull/39399) | Bug fix for _airbyte_meta not migrated in OVERWRITE mode | | 3.10.0 | 2024-06-10 | [\#39107](https://github.com/airbytehq/airbyte/pull/39107) | _airbyte_meta and _airbyte_generation_id in Raw tables and final tables | | 3.9.1 | 2024-06-05 | [\#39135](https://github.com/airbytehq/airbyte/pull/39135) | Improved error handling for Staging files | diff --git a/docusaurus/redirects.yml b/docusaurus/redirects.yml index 5f864ea6807d..ba392ee43bd8 100644 --- a/docusaurus/redirects.yml +++ b/docusaurus/redirects.yml @@ -111,3 +111,6 @@ - from: - /telemetry to: /operator-guides/telemetry +- from: + - /deploying-airbyte/quickstart + to: /using-airbyte/getting-started/oss-quickstart diff --git a/docusaurus/sidebars.js b/docusaurus/sidebars.js index 2f3f1ffd13a1..4adffb32f5dd 100644 --- a/docusaurus/sidebars.js +++ b/docusaurus/sidebars.js @@ -391,58 +391,30 @@ const deployAirbyte = { type: "category", label: "Deploy Airbyte", link: { - type: "generated-index", + type: "doc", + id: "deploying-airbyte/deploying-airbyte", }, items: [ { - type: "doc", - label: "Using docker compose", - id: "deploying-airbyte/docker-compose", - }, - { - type: "doc", - label: "On AWS EC2", - id: "deploying-airbyte/on-aws-ec2", - }, - { - type: "doc", - label: "On AWS ECS", - id: "deploying-airbyte/on-aws-ecs", - }, - { - type: "doc", - label: "On Azure", - id: "deploying-airbyte/on-azure-vm-cloud-shell", - }, - { - type: "doc", - label: "On Google (GCP)", - id: "deploying-airbyte/on-gcp-compute-engine", - }, - { - type: "doc", - label: "On Kubernetes using Helm", - id: "deploying-airbyte/on-kubernetes-via-helm", - }, - { - type: "doc", - label: "On Restack", - id: "deploying-airbyte/on-restack", - }, 
- { - type: "doc", - label: "On Plural", - id: "deploying-airbyte/on-plural", - }, - { - type: "doc", - label: "On Oracle Cloud", - id: "deploying-airbyte/on-oci-vm", + type: "category", + label: "Infrastructure", + items: [ + "deploying-airbyte/infrastructure/aws", + // "deploying-airbyte/infrastructure/gcp", + // "deploying-airbyte/infrastructure/azure", + ], }, + { - type: "doc", - label: "On DigitalOcean", - id: "deploying-airbyte/on-digitalocean-droplet", + type: "category", + label: "Integrations", + items: [ + "deploying-airbyte/integrations/storage", + "deploying-airbyte/integrations/secrets", + "deploying-airbyte/integrations/database", + // "deploying-airbyte/integrations/monitoring", + "deploying-airbyte/integrations/ingress", + ], }, ], };