Add knative target utilization annotation when concurrency target is below 1
caraml-dev · Apr 29, 2024 · e09e014 · e09e014
1 parent 8c7865d
commit e09e014
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 0 deletions.
diff --git a/api/cluster/resource/templater.go b/api/cluster/resource/templater.go
@@ -643,6 +643,27 @@ func (t *InferenceServiceTemplater) createAnnotations(modelService *models.Servi
 
 			annotations[knautoscaling.MetricAnnotationKey] = autoscalingMetrics
 			annotations[knautoscaling.TargetAnnotationKey] = targetValue
+
+			// Since the autoscaling target is an INTEGER, users who have set a concurrency target=1 for their
+			// deployments and would like them to scale up even earlier are unable to do so by simply setting the target
+			// as any float < 1. Instead, they would need to tweak the target-utilization-percentage annotation, which
+			// in turn defines the average number of concurrent requests across all existing replicas at which the
+			// Autoscaler will trigger a scale up on the number of replicas.
+			//
+			// Thus, when the user-defined target < 1, we set:
+			//
+			// target utilization = the default target utilization * the user-defined target
+			//
+			// in order to allow the deployment to scale up earlier than it would with the default target utilization
+			// (70) and a user-defined target of 1. Note, however, that this does not change the actual 'soft limit' on
+			// the number of requests that each replica can serve, which remains 1.
+			//
+			// For more details, see: https://knative.dev/docs/serving/autoscaling/concurrency/#target-utilization
+			if modelService.AutoscalingPolicy.MetricsType == autoscaling.Concurrency &&
+				modelService.AutoscalingPolicy.TargetValue < 1 {
+				annotations[knautoscaling.TargetUtilizationPercentageKey] = fmt.Sprintf("%.0f",
+					70*modelService.AutoscalingPolicy.TargetValue)
+			}
 		}
 	}
 

diff --git a/api/cluster/resource/templater_test.go b/api/cluster/resource/templater_test.go
@@ -1595,6 +1595,70 @@ func TestCreateInferenceServiceSpec(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "serverless deployment using concurrency autoscaling where target < 1",
+			modelSvc: &models.Service{
+				Name:           modelSvc.Name,
+				ModelName:      modelSvc.ModelName,
+				ModelVersion:   modelSvc.ModelVersion,
+				Namespace:      project.Name,
+				ArtifactURI:    modelSvc.ArtifactURI,
+				Type:           models.ModelTypeTensorflow,
+				Options:        &models.ModelOption{},
+				Metadata:       modelSvc.Metadata,
+				DeploymentMode: deployment.ServerlessDeploymentMode,
+				AutoscalingPolicy: &autoscaling.AutoscalingPolicy{
+					MetricsType: autoscaling.Concurrency,
+					TargetValue: 0.7,
+				},
+				Protocol: protocol.HttpJson,
+			},
+			resourcePercentage: queueResourcePercentage,
+			deploymentScale:    defaultDeploymentScale,
+			exp: &kservev1beta1.InferenceService{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      modelSvc.Name,
+					Namespace: project.Name,
+					Annotations: map[string]string{
+						knserving.QueueSidecarResourcePercentageAnnotationKey: queueResourcePercentage,
+						kserveconstant.DeploymentMode:                         string(kserveconstant.Serverless),
+						knautoscaling.ClassAnnotationKey:                      knautoscaling.KPA,
+						knautoscaling.MetricAnnotationKey:                     knautoscaling.Concurrency,
+						knautoscaling.TargetAnnotationKey:                     "1",
+						knautoscaling.TargetUtilizationPercentageKey:          "49",
+						knautoscaling.InitialScaleAnnotationKey:               fmt.Sprint(testPredictorScale),
+					},
+					Labels: map[string]string{
+						"gojek.com/app":          modelSvc.Metadata.App,
+						"gojek.com/component":    models.ComponentModelVersion,
+						"gojek.com/environment":  testEnvironmentName,
+						"gojek.com/orchestrator": testOrchestratorName,
+						"gojek.com/stream":       modelSvc.Metadata.Stream,
+						"gojek.com/team":         modelSvc.Metadata.Team,
+						"sample":                 "true",
+					},
+				},
+				Spec: kservev1beta1.InferenceServiceSpec{
+					Predictor: kservev1beta1.PredictorSpec{
+						Tensorflow: &kservev1beta1.TFServingSpec{
+							PredictorExtensionSpec: kservev1beta1.PredictorExtensionSpec{
+								StorageURI: &storageUri,
+								Container: corev1.Container{
+									Name:          kserveconstant.InferenceServiceContainerName,
+									Resources:     expDefaultModelResourceRequests,
+									LivenessProbe: probeConfig,
+									Env:           []corev1.EnvVar{},
+								},
+							},
+						},
+						ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
+							MinReplicas: &defaultModelResourceRequests.MinReplica,
+							MaxReplicas: defaultModelResourceRequests.MaxReplica,
+						},
+					},
+				},
+			},
+		},
 		{
 			name: "serverless deployment using rps autoscaling",
 			modelSvc: &models.Service{