Support to parse numbers in text-based input formats (#17082)
Text-based input formats like csv and tsv currently parse inputs only as strings, following the RFC4180Parser spec. To work around this, the web-console and other tools need to further inspect the sample data returned by the Druid sampler API and parse numeric strings themselves.

This patch introduces a new optional config, tryParseNumbers, for the csv and tsv input formats. If enabled, any numbers present in the input are parsed as follows: integers as the long data type and floating-point numbers as double; if parsing fails for any reason, the input is treated as a string. By default, this configuration is set to false, so numeric strings are treated as strings.
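
As a rough illustration of that fallback order, a minimal Java sketch (the helper name tryParseNumber is hypothetical; only the long-then-double-then-string order comes from this patch):

    // Hypothetical helper; mirrors the documented fallback order.
    static Object tryParseNumber(String input)
    {
      if (input == null || input.isEmpty()) {
        return input; // nothing to parse
      }
      try {
        return Long.parseLong(input); // integer strings become long
      }
      catch (NumberFormatException ignored) {
        // not a long; try double next
      }
      try {
        return Double.parseDouble(input); // floating-point strings become double
      }
      catch (NumberFormatException ignored) {
        // not a double either
      }
      return input; // parsing failed for whatever reason: keep the string
    }

Trying long before double matters: Double.parseDouble also accepts integer strings, so the reverse order would turn every integer into a double.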
abhishekrb19 authored Sep 19, 2024
1 parent 4f137d2 commit 635e418
Showing 57 changed files with 858 additions and 194 deletions.
@@ -132,7 +132,7 @@ public void prepareData() throws Exception
@Setup(Level.Trial)
public void prepareFormat()
{
-format = new DelimitedInputFormat(fromHeader ? null : COLUMNS, null, "\t", null, fromHeader, fromHeader ? 0 : 1);
+format = new DelimitedInputFormat(fromHeader ? null : COLUMNS, null, "\t", null, fromHeader, fromHeader ? 0 : 1, null);
}

@Benchmark
2 changes: 2 additions & 0 deletions docs/ingestion/data-formats.md
@@ -125,6 +125,7 @@ Configure the CSV `inputFormat` to load CSV data as follows:
| columns | JSON array | Specifies the columns of the data. The columns should be in the same order with the columns of your data. | yes if `findColumnsFromHeader` is false or missing |
| findColumnsFromHeader | Boolean | If this is set, the task will find the column names from the header row. Note that `skipHeaderRows` will be applied before finding column names from the header. For example, if you set `skipHeaderRows` to 2 and `findColumnsFromHeader` to true, the task will skip the first two lines and then extract column information from the third line. `columns` will be ignored if this is set to true. | no (default = false if `columns` is set; otherwise null) |
| skipHeaderRows | Integer | If this is set, the task will skip the first `skipHeaderRows` rows. | no (default = 0) |
+| tryParseNumbers | Boolean | If this is set, the task will attempt to parse numeric strings into long or double data types, in that order. This parsing also applies to values separated by `listDelimiter`. If the value cannot be parsed as a number, it is retained as a string. | no (default = false) |

For example:

@@ -150,6 +151,7 @@ Configure the TSV `inputFormat` to load TSV data as follows:
| columns | JSON array | Specifies the columns of the data. The columns should be in the same order with the columns of your data. | yes if `findColumnsFromHeader` is false or missing |
| findColumnsFromHeader | Boolean | If this is set, the task will find the column names from the header row. Note that `skipHeaderRows` will be applied before finding column names from the header. For example, if you set `skipHeaderRows` to 2 and `findColumnsFromHeader` to true, the task will skip the first two lines and then extract column information from the third line. `columns` will be ignored if this is set to true. | no (default = false if `columns` is set; otherwise null) |
| skipHeaderRows | Integer | If this is set, the task will skip the first `skipHeaderRows` rows. | no (default = 0) |
+| tryParseNumbers | Boolean | If this is set, the task will attempt to parse numeric strings into long or double data types, in that order. This parsing also applies to values separated by `listDelimiter`. If the value cannot be parsed as a number, it is retained as a string. | no (default = false) |

Be sure to change the `delimiter` to the appropriate delimiter for your data. Like CSV, you must specify the columns and which subset of the columns you want indexed.
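
For instance, a hedged sketch of an inputFormat spec that opts in to number parsing (the values are illustrative, not taken from this commit; the field names are the ones documented in the tables above):

    {
      "type": "tsv",
      "delimiter": "\t",
      "findColumnsFromHeader": true,
      "tryParseNumbers": true
    }

With tryParseNumbers enabled, a sampled field like "25" comes back as the long 25 instead of the string "25".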

@@ -537,7 +537,7 @@ public void testReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
temporaryFolder.newFolder()
);

@@ -584,7 +584,7 @@ public void testCompressedReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
temporaryFolder.newFolder()
);

@@ -23,8 +23,6 @@
import com.google.common.collect.ImmutableMap;
import org.apache.druid.catalog.storage.sql.CatalogManager;
import org.apache.druid.catalog.storage.sql.SQLCatalogManager;
-import org.apache.druid.data.input.InputFormat;
-import org.apache.druid.data.input.impl.CsvInputFormat;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.metadata.TestDerbyConnector.DerbyConnectorRule;
import org.apache.druid.server.security.Access;
@@ -35,8 +33,6 @@
import org.apache.druid.server.security.Resource;
import org.apache.druid.server.security.ResourceType;

-import java.util.Arrays;
-
public class CatalogTests
{
public static final String TEST_AUTHORITY = "test";
@@ -74,17 +70,6 @@ public Access authorize(
}
}

-public static InputFormat csvFormat()
-{
-  return new CsvInputFormat(
-      Arrays.asList("x", "y", "z"),
-      null, // listDelimiter
-      false, // hasHeaderRow
-      false, // findColumnsFromHeader
-      0 // skipHeaderRows
-  );
-}
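
Judging by the call sites in this diff, CsvInputFormat gains a trailing tryParseNumbers argument, and existing callers pass null to preserve the old strings-only behavior. A hedged sketch of a caller opting in (the argument comments mirror the removed helper above; the new flag's position and meaning are inferred from this diff and the commit description):

    InputFormat format = new CsvInputFormat(
        Arrays.asList("x", "y", "z"), // columns
        null,                         // listDelimiter
        false,                        // hasHeaderRow
        false,                        // findColumnsFromHeader
        0,                            // skipHeaderRows
        true                          // tryParseNumbers (new in this patch)
    );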

public static final ObjectMapper JSON_MAPPER = new DefaultObjectMapper();

public static class DbFixture
@@ -406,7 +406,7 @@ public void testReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
null
);

@@ -453,7 +453,7 @@ public void testCompressedReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
null
);

@@ -90,7 +90,8 @@ public class HdfsInputSourceTest extends InitializedNullHandlingTest
null,
false,
null,
-0
+0,
+null
);

public static class ConstructorTest
@@ -712,7 +712,8 @@ public void testValueInCsvFormat() throws IOException
null,
false,
false,
-0
+0,
+null
),
"kafka.newheader.",
"kafka.newkey.key",
@@ -713,7 +713,8 @@ public void testValueInCsvFormat() throws IOException
null,
false,
false,
-0
+0,
+null
),
"kinesis.newts.partitionKey",
"kinesis.newts.timestamp"
@@ -327,7 +327,7 @@ public CSVFlatDataParser(
this.valueColumn,
Arrays.toString(columns.toArray())
);
-CSVParser csvParser = new CSVParser(null, columns, hasHeaderRow, skipHeaderRows);
+CSVParser csvParser = new CSVParser(null, columns, hasHeaderRow, skipHeaderRows, false);
csvParser.startFileFromBeginning();
this.parser = new DelegateParser(
csvParser,
@@ -355,13 +355,13 @@ public List<String> getColumns()
@JsonProperty
public String getKeyColumn()
{
-return this.keyColumn;
+return keyColumn;
}

@JsonProperty
public String getValueColumn()
{
-return this.valueColumn;
+return valueColumn;
}

@Override
@@ -431,7 +431,8 @@ public TSVFlatDataParser(
StringUtils.emptyToNullNonDruidDataString(delimiter),
StringUtils.emptyToNullNonDruidDataString(listDelimiter),
hasHeaderRow,
-skipHeaderRows
+skipHeaderRows,
+false
);
delegate.startFileFromBeginning();
Preconditions.checkArgument(
@@ -115,6 +115,7 @@ public void testCSVWithHeader()
// The third row will parse to data
Assert.assertEquals(ImmutableMap.of("val2", "val3"), parser.getParser().parseToMap("val1,val2,val3"));
}

@Test(expected = IllegalArgumentException.class)
public void testBadCSV()
{
@@ -157,7 +157,7 @@ public void testIngestWithSanitizedNullByte() throws IOException
.dataSource(
new ExternalDataSource(
new LocalInputSource(null, null, ImmutableList.of(toRead), SystemFields.none()),
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
RowSignature.builder()
.add("timestamp", ColumnType.STRING)
.add("agent_category", ColumnType.STRING)
@@ -255,7 +255,7 @@ public void testIngestWithSanitizedNullByteUsingContextParameter() throws IOExce
.dataSource(
new ExternalDataSource(
new LocalInputSource(null, null, ImmutableList.of(toRead), SystemFields.none()),
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
RowSignature.builder()
.add("timestamp", ColumnType.STRING)
.add("agent_category", ColumnType.STRING)
@@ -1779,7 +1779,7 @@ public void testGroupByWithLimitAndOrdering(String contextName, Map<String, Obje
.setDataSource(
new ExternalDataSource(
new InlineInputSource("dim1\nabc\nxyz\ndef\nxyz\nabc\nxyz\nabc\nxyz\ndef\nbbb\naaa"),
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
RowSignature.builder().add("dim1", ColumnType.STRING).build()
)
)
@@ -2376,7 +2376,7 @@ public void testSelectRowsGetUntruncatedByDefault(String contextName, Map<String
Collections.nCopies(numFiles, toRead),
SystemFields.none()
),
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
RowSignature.builder().add("timestamp", ColumnType.STRING).build()
))
.intervals(querySegmentSpec(Filtration.eternity()))
@@ -1015,7 +1015,7 @@ public void testReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
temporaryFolder.newFolder()
);

@@ -1063,7 +1063,7 @@ public void testReaderRetriesOnSdkClientExceptionButNeverSucceedsThenThrows() th

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
temporaryFolder.newFolder()
);
try (CloseableIterator<InputRow> readerIterator = reader.read()) {
@@ -1111,7 +1111,7 @@ public void testCompressedReader() throws IOException

InputSourceReader reader = inputSource.reader(
someSchema,
-new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
+new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0, null),
temporaryFolder.newFolder()
);

@@ -970,7 +970,8 @@ private void runIndexTask(@Nullable PartitionsSpec partitionsSpec, boolean appen
"|",
null,
false,
-0
+0,
+null
),
appendToExisting,
null
@@ -163,7 +163,8 @@ public class IndexTaskTest extends IngestionTestBase
null,
null,
false,
-0
+0,
+null
);

private static final DataSchema DATA_SCHEMA =
@@ -473,7 +474,7 @@ public void testTransformSpec() throws Exception
indexIngestionSpec = createIngestionSpec(
DEFAULT_TIMESTAMP_SPEC,
dimensionsSpec,
-new CsvInputFormat(columns, listDelimiter, null, false, 0),
+new CsvInputFormat(columns, listDelimiter, null, false, 0, null),
transformSpec,
null,
tuningConfig,
@@ -901,7 +902,7 @@ public void testCSVFileWithHeader() throws Exception
ingestionSpec = createIngestionSpec(
timestampSpec,
DimensionsSpec.EMPTY,
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -941,7 +942,7 @@ public void testCSVFileWithHeaderColumnOverride() throws Exception
ingestionSpec = createIngestionSpec(
timestampSpec,
DimensionsSpec.EMPTY,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1341,7 +1342,7 @@ public void testIgnoreParseException() throws Exception
parseExceptionIgnoreSpec = createIngestionSpec(
timestampSpec,
DimensionsSpec.EMPTY,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1391,7 +1392,7 @@ public void testReportParseException() throws Exception
indexIngestionSpec = createIngestionSpec(
timestampSpec,
DimensionsSpec.EMPTY,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1632,7 +1633,7 @@ public void testMultipleParseExceptionsFailure() throws Exception
ingestionSpec = createIngestionSpec(
timestampSpec,
dimensionsSpec,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1751,7 +1752,7 @@ public void testMultipleParseExceptionsFailureAtDeterminePartitions() throws Exc
ingestionSpec = createIngestionSpec(
timestampSpec,
dimensionsSpec,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1845,7 +1846,7 @@ public void testCsvWithHeaderOfEmptyColumns() throws Exception
ingestionSpec = createIngestionSpec(
DEFAULT_TIMESTAMP_SPEC,
DimensionsSpec.EMPTY,
-new CsvInputFormat(null, null, null, true, 0),
+new CsvInputFormat(null, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -1915,7 +1916,7 @@ public void testCsvWithHeaderOfEmptyTimestamp() throws Exception
ingestionSpec = createIngestionSpec(
DEFAULT_TIMESTAMP_SPEC,
DimensionsSpec.EMPTY,
-new CsvInputFormat(columns, null, null, true, 0),
+new CsvInputFormat(columns, null, null, true, 0, null),
null,
null,
tuningConfig,
@@ -313,7 +313,8 @@ public static InputFormat createInputFormatFromParseSpec(ParseSpec parseSpec)
csvParseSpec.getListDelimiter(),
getColumnsFromHeader ? null : true,
getColumnsFromHeader ? true : null,
-csvParseSpec.getSkipHeaderRows()
+csvParseSpec.getSkipHeaderRows(),
+null
);
} else if (parseSpec instanceof DelimitedParseSpec) {
DelimitedParseSpec delimitedParseSpec = (DelimitedParseSpec) parseSpec;
@@ -324,7 +325,8 @@ public static InputFormat createInputFormatFromParseSpec(ParseSpec parseSpec)
delimitedParseSpec.getDelimiter(),
getColumnsFromHeader ? null : true,
getColumnsFromHeader ? true : null,
-delimitedParseSpec.getSkipHeaderRows()
+delimitedParseSpec.getSkipHeaderRows(),
+null
);
} else if (parseSpec instanceof RegexParseSpec) {
RegexParseSpec regexParseSpec = (RegexParseSpec) parseSpec;
@@ -163,7 +163,8 @@ public class AbstractParallelIndexSupervisorTaskTest extends IngestionTestBase
null,
false,
false,
-0
+0,
+null
);
public static final ParallelIndexTuningConfig DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING =
TuningConfigBuilder.forParallelIndexTask()
@@ -61,7 +61,8 @@ public class HashPartitionAdjustingCorePartitionSizeTest extends AbstractMultiPh
null,
false,
false,
-0
+0,
+null
);
private static final Interval INTERVAL_TO_INDEX = Intervals.of("2020-01-01/P1M");

@@ -84,7 +84,8 @@ public class HashPartitionMultiPhaseParallelIndexingTest extends AbstractMultiPh
null,
false,
false,
-0
+0,
+null
);
private static final Interval INTERVAL_TO_INDEX = Intervals.of("2017-12/P1M");
private static final String INPUT_FILTER = "test_*";
@@ -79,7 +79,8 @@ public class HashPartitionTaskKillTest extends AbstractMultiPhaseParallelIndexin
null,
false,
false,
-0
+0,
+null
);
private static final Interval INTERVAL_TO_INDEX = Intervals.of("2017-12/P1M");
