diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_common_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_common_config.md
index c0aa98aeab..aaac27b793 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_common_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_common_config.md
@@ -50,3 +50,25 @@ import Link from '@docusaurus/Link';
 telemetry.userProvidedId
 Optional. See here for more information.
+
+ inMemBatchBytes
+ Optional. Default value 25600000. Controls how many events are buffered in memory before saving the batch to local disk. The default value works well for most reasonably sized VMs.
+
+
+ cpuParallelismFraction
+
+ Optional. Default value 0.75.
+ Controls how the app splits the workload into concurrent batches which can be run in parallel.
+ E.g. if there are 4 available processors and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
+ The default value works well for most workloads.
+
+
+
+ numEagerWindows
+
+ Optional. Default value 1.
+ Controls how eagerly the loader starts processing the next timed window even when the previous timed window is still finalizing (committing into the lake).
+ By default, we start processing a timed window if the previous window is still finalizing, but we do not start processing a timed window if any older windows are still finalizing.
+ The default value works well for most workloads.
+
+
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_delta_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_delta_config.md
index 905aeec54d..8c58ea874f 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_delta_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_delta_config.md
@@ -1,3 +1,7 @@
+```mdx-code-block
+import Link from '@docusaurus/Link';
+```
+
 output.good.location
 Required, e.g. gs://mybucket/events. URI of the bucket location to which to write Snowplow enriched events in Delta format. The URI should start with the following prefix:
@@ -9,6 +13,9 @@
- output.good.dataSkippingColumns
- Optional. A list of column names which will be brought to the "left-hand-side" of the events table, to enable Delta's data skipping feature. Defaults to the important Snowplow timestamp columns: load_tstamp, collector_tstamp, derived_tstamp, dvce_created_tstamp.
+ output.good.deltaTableProperties.*
+
+ Optional. A map of key/value strings corresponding to Delta's table properties.
+ These can be anything from the Delta table properties documentation.
+ The default properties include configuring Delta's data skipping feature for the important Snowplow timestamp columns: load_tstamp, collector_tstamp, derived_tstamp, dvce_created_tstamp.
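As a rough orientation for these new keys, here is a minimal HOCON sketch of where they might sit in a Lake Loader configuration file. The surrounding structure (top-level batching/windowing keys, the `output.good` block) and the example Delta property values are assumptions for illustration, not part of this change.

```hocon
{
  # Buffer roughly 25.6 MB of events in memory before the batch is saved to local disk
  "inMemBatchBytes": 25600000

  # With 4 available processors, 0.75 means 3 batches are processed concurrently
  "cpuParallelismFraction": 0.75

  # Allow the next timed window to start while at most 1 older window is still finalizing
  "numEagerWindows": 1

  "output": {
    "good": {
      "location": "gs://mybucket/events"

      # Forwarded to Delta as table properties; these two keys are only illustrative
      "deltaTableProperties": {
        "delta.checkpointInterval": "40"
        "delta.dataSkippingNumIndexedCols": "6"
      }
    }
  }
}
```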
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_hudi_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_hudi_config.md
index f35064eb69..acc1dbd04b 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_hudi_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_hudi_config.md
@@ -10,9 +10,9 @@
 output.good.hudiWriteOptions.*
- Optional. A map of key/value strings corresponding to Hudi's configuration options for writing into a table. The default options configure `load_tstamp` as the table's partition field.
+ Optional. A map of key/value strings corresponding to Hudi's configuration options for writing into a table. The default options configure load_tstamp as the table's partition field.
- output.good.hudiTableOptions.*
- Optional. A map of key/value strings corresponding to Hudi's configuration options for creating a table. The default options configure `load_tstamp` as the table's partition field.
+ output.good.hudiTableProperties.*
+ Optional. A map of key/value strings corresponding to Hudi's configuration options for creating a table. The default options configure load_tstamp as the table's partition field.
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_biglake_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_biglake_config.md
index 668f651d3f..a71f9664d7 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_biglake_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_biglake_config.md
@@ -1,11 +1,7 @@
-
- output.good.type
- Required, set this to Iceberg.
-
-
- output.good.catalog.type
- Required, set this to BigLake
-
+```mdx-code-block
+import Link from '@docusaurus/Link';
+```
+
 output.good.location
 Required, e.g. gs://mybucket/. URI of the bucket location to which to write Snowplow enriched events in Iceberg format. The URI should start with gs://.
@@ -18,6 +14,14 @@
 output.good.table
 Required. The name of the table in the BigLake database
+
+ output.good.icebergTableProperties.*
+
+ Optional. A map of key/value strings corresponding to Iceberg's table properties.
+ These can be anything from the Iceberg table properties documentation.
+ The default properties include configuring Iceberg's column-level statistics for the important Snowplow timestamp columns: load_tstamp, collector_tstamp, derived_tstamp, dvce_created_tstamp.
+
+
 output.good.catalog.project
 Required. The GCP project owning the BigLake catalog
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_glue_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_glue_config.md
index f9b20fccda..2b6f522a0e 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_glue_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_iceberg_glue_config.md
@@ -22,6 +22,14 @@ import Link from '@docusaurus/Link';
 output.good.table
 Required. The name of the table in the Glue database
+
+ output.good.icebergTableProperties.*
+
+ Optional. A map of key/value strings corresponding to Iceberg's table properties.
+ These can be anything from the Iceberg table properties documentation.
+ The default properties include configuring Iceberg's column-level statistics for the important Snowplow timestamp columns: load_tstamp, collector_tstamp, derived_tstamp, dvce_created_tstamp.
+
+
 output.good.catalog.options.*
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kafka_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kafka_config.md
index c48c622428..4ecf6a981d 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kafka_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kafka_config.md
@@ -1,3 +1,7 @@
+```mdx-code-block
+import Link from '@docusaurus/Link';
+```
+
 input.topicName
 Required. Name of the Kafka topic for the source of enriched events.
@@ -20,5 +24,5 @@
 output.bad.producerConf.*
- Optional. A map of key/value pairs for any standard Kafka producer configuration option.
+ Optional. A map of key/value pairs for any standard Kafka producer configuration option.
diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kinesis_config.md b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kinesis_config.md
index ddaeae6566..3e43113035 100644
--- a/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kinesis_config.md
+++ b/docs/pipeline-components-and-applications/loaders-storage-targets/lake-loader/configuration-reference/_kinesis_config.md
@@ -26,6 +26,14 @@
 input.bufferSize
 Optional. Default value 1. The number of batches of events which are pre-fetched from kinesis. The default value is known to work well.
+
+ input.workerIdentifier
+ Optional. Defaults to the HOSTNAME environment variable. The name of this KCL worker, used in the DynamoDB lease table.
+
+
+ input.leaseDuration
+ Optional. Default value 10 seconds. The duration of shard leases. KCL workers must periodically refresh leases in the DynamoDB table before this duration expires.
+
 output.bad.streamName
 Required. Name of the Kinesis stream that will receive failed events.
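Similarly, a hedged HOCON sketch of how the new Iceberg and Kinesis keys could look, assuming `workerIdentifier` and `leaseDuration` live under the `input` block and `icebergTableProperties` under `output.good`, as the table entries suggest; the stream name and the Iceberg property values are placeholders, not defaults from this change.

```hocon
{
  "input": {
    "streamName": "enriched"

    # Name used for this KCL worker in the DynamoDB lease table;
    # if omitted, the docs say it defaults to the HOSTNAME environment variable
    "workerIdentifier": "my-loader-1"

    # Shard leases must be refreshed before this duration expires
    "leaseDuration": "10 seconds"
  }

  "output": {
    "good": {
      # Forwarded to Iceberg as table properties; these two keys are only illustrative
      "icebergTableProperties": {
        "write.parquet.compression-codec": "zstd"
        "write.metadata.delete-after-commit.enabled": "true"
      }
    }
  }
}
```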
diff --git a/src/componentVersions.js b/src/componentVersions.js
index e0b7d0f794..bb26446052 100644
--- a/src/componentVersions.js
+++ b/src/componentVersions.js
@@ -36,7 +36,7 @@ export const versions = {
   rdbLoader: '6.0.0',
   s3Loader: '2.2.8',
   s3Loader22x: '2.2.8',
-  lakeLoader: '0.3.0',
+  lakeLoader: '0.4.1',
   snowflakeStreamingLoader: '0.2.2',

   // Data Modelling