Skip to content

Commit

Permalink
[Feature][Transforms-V2] LLM transforms Support custom field name (ap…
Browse files Browse the repository at this point in the history
  • Loading branch information
hawk9821 authored Sep 14, 2024
1 parent f831f7a commit 6da7491
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 16 deletions.
9 changes: 7 additions & 2 deletions docs/en/transform-v2/llm.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ more.
## Options

| name | type | required | default value |
| ---------------------- | ------ | -------- | ------------- |
|------------------------| ------ | -------- |---------------|
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
| output_column_name | string | no | llm_output |
| prompt | string | yes | |
| inference_columns | list | no | |
| inference_columns | list | no | |
| model | string | yes | |
| api_key | string | yes | |
| api_path | string | no | |
Expand All @@ -35,6 +36,10 @@ The data type of the output data. The available options are:
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
Default value is STRING.

### output_column_name

Custom output data field name. The default value is `llm_output`. If the custom field name is the same as an existing field name, an exception is thrown.

### prompt

The prompt to send to the LLM. This parameter defines how LLM will process and return data, eg:
Expand Down
31 changes: 18 additions & 13 deletions docs/zh/transform-v2/llm.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,20 @@

## 属性

| 名称 | 类型 | 是否必须 | 默认值 |
| ---------------------- | ------ | -------- | ------ |
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
| prompt | string | yes | |
| inference_columns | list | no | |
| model | string | yes | |
| api_key | string | yes | |
| api_path | string | no | |
| custom_config | map | no | |
| custom_response_parse | string | no | |
| custom_request_headers | map | no | |
| custom_request_body | map | no | |
| 名称 | 类型 | 是否必须 | 默认值 |
|------------------------| ------ | -------- |-------------|
| model_provider | enum | yes | |
| output_data_type | enum | no | String |
| output_column_name | string | no | llm_output |
| prompt | string | yes | |
| inference_columns | list | no | |
| model | string | yes | |
| api_key | string | yes | |
| api_path | string | no | |
| custom_config | map | no | |
| custom_response_parse | string | no | |
| custom_request_headers | map | no | |
| custom_request_body | map | no | |

### model_provider

Expand All @@ -33,6 +34,10 @@ OPENAI、DOUBAO、KIMIAI、CUSTOM
STRING,INT,BIGINT,DOUBLE,BOOLEAN.
默认值为 STRING。

### output_column_name

自定义输出数据字段名称。默认值为 `llm_output`。自定义字段名称与现有字段名称相同时,将抛出异常。

### prompt

发送到 LLM 的提示。此参数定义 LLM 将如何处理和返回数据,例如:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,14 @@ public void testLLMWithOpenAIColumns(TestContainer container)
Assertions.assertEquals(0, execResult.getExitCode());
}

@TestTemplate
public void testLLMWithOpenAIOutputColumnName(TestContainer container)
        throws IOException, InterruptedException {
    // Submits the job config that sets a custom LLM output column name and
    // verifies the job completes successfully (exit code 0).
    Container.ExecResult result =
            container.executeJob("/llm_openai_transform_custom_output_name.conf");
    Assertions.assertEquals(0, result.getExitCode());
}

@TestTemplate
public void testLLMWithCustomModel(TestContainer container)
throws IOException, InterruptedException {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
######
###### This config file is a demonstration of streaming processing in seatunnel config
######
# E2E fixture: feeds five fixed rows through the LLM transform with a custom
# output column name ("nationality") and asserts the column is populated.

env {
  # One-shot batch execution for the test job.
  job.mode = "BATCH"
}

source {
  # In-memory source producing five fixed (id, name) rows.
  FakeSource {
    row.num = 5
    schema = {
      fields {
        id = "int"
        name = "string"
      }
    }
    rows = [
      {fields = [1, "Jia Fan"], kind = INSERT}
      {fields = [2, "Hailin Wang"], kind = INSERT}
      {fields = [3, "Tomas"], kind = INSERT}
      {fields = [4, "Eric"], kind = INSERT}
      {fields = [5, "Guangdong Liu"], kind = INSERT}
    ]
    result_table_name = "fake"
  }
}

transform {
  # LLM transform under test: output_column_name overrides the default
  # "llm_output" column name; api_path targets the in-container mock server,
  # so no real OpenAI call is made.
  LLM {
    source_table_name = "fake"
    model_provider = OPENAI
    model = gpt-4o-mini
    api_key = sk-xxx
    output_column_name = "nationality"
    prompt = "Determine whether someone is Chinese or American by their name"
    openai.api_path = "http://mockserver:1080/v1/chat/completions"
    result_table_name = "llm_output"
  }
}

sink {
  # Fail the job unless every row carries a non-null "nationality" string —
  # proves the custom-named output column was actually produced.
  Assert {
    source_table_name = "llm_output"
    rules =
      {
        field_rules = [
          {
            field_name = "nationality"
            field_type = string
            field_value = [
              {
                rule_type = NOT_NULL
              }
            ]
          }
        ]
      }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import lombok.NonNull;
import lombok.SneakyThrows;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

Expand Down Expand Up @@ -150,8 +151,15 @@ protected Object getOutputFieldValue(SeaTunnelRowAccessor inputRow) {

@Override
protected Column getOutputColumn() {
String customFieldName = config.get(LLMTransformConfig.OUTPUT_COLUMN_NAME);
String[] fieldNames = inputCatalogTable.getTableSchema().getFieldNames();
boolean isExist = Arrays.asList(fieldNames).contains(customFieldName);
if (isExist) {
throw new IllegalArgumentException(
String.format("llm inference field name %s already exists", customFieldName));
}
return PhysicalColumn.of(
"llm_output", outputDataType, (Long) null, true, null, "Output column of LLM");
customFieldName, outputDataType, (Long) null, true, null, "Output column of LLM");
}

@SneakyThrows
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ public class LLMTransformConfig extends ModelTransformConfig {
.noDefaultValue()
.withDescription("The row projection field of each inference");

/**
 * Name of the column that carries the LLM inference result. Defaults to
 * {@code llm_output}; the transform rejects a name that already exists in
 * the input schema.
 */
public static final Option<String> OUTPUT_COLUMN_NAME =
        Options.key("output_column_name")
                .stringType()
                .defaultValue("llm_output")
                .withDescription("custom field name for the llm output data");

public static final Option<Integer> INFERENCE_BATCH_SIZE =
Options.key("inference_batch_size")
.intType()
Expand Down

0 comments on commit 6da7491

Please sign in to comment.