Skip to content

Commit

Permalink
Add knative target utilization annotation when concurrency target is …
Browse files Browse the repository at this point in the history
…below 1
  • Loading branch information
deadlycoconuts committed Apr 29, 2024
1 parent 8c7865d commit e09e014
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 0 deletions.
21 changes: 21 additions & 0 deletions api/cluster/resource/templater.go
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,27 @@ func (t *InferenceServiceTemplater) createAnnotations(modelService *models.Servi

annotations[knautoscaling.MetricAnnotationKey] = autoscalingMetrics
annotations[knautoscaling.TargetAnnotationKey] = targetValue

// Since the autoscaling target must be an INTEGER, users who have set a concurrency target of 1 for their
// deployments and would like them to scale up even earlier cannot do so by simply setting the target
// to a float < 1. Instead, they need to tweak the target-utilization-percentage annotation, which
// defines the percentage of the target (averaged across all existing replicas) that the Autoscaler
// actually aims for, i.e. the point at which it triggers a scale up of the number of replicas.
//
// Thus, when the user-defined target < 1, we set:
//
// target utilization = the default target utilization (70) * the user-defined target
//
// rounded to the nearest integer, so that the deployment scales up earlier than it would with the default
// target utilization (70) and a user-defined target of 1. Note, however, that this does not change the
// actual 'soft limit' on the number of concurrent requests that each replica can serve, which remains 1.
//
// For more details, see: https://knative.dev/docs/serving/autoscaling/concurrency/#target-utilization
if modelService.AutoscalingPolicy.MetricsType == autoscaling.Concurrency &&
modelService.AutoscalingPolicy.TargetValue < 1 {
annotations[knautoscaling.TargetUtilizationPercentageKey] = fmt.Sprintf("%.0f",
70*modelService.AutoscalingPolicy.TargetValue)
}
}
}

Expand Down
64 changes: 64 additions & 0 deletions api/cluster/resource/templater_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1595,6 +1595,70 @@ func TestCreateInferenceServiceSpec(t *testing.T) {
},
},
},
{
name: "serverless deployment using concurrency autoscaling where target < 1",
modelSvc: &models.Service{
Name: modelSvc.Name,
ModelName: modelSvc.ModelName,
ModelVersion: modelSvc.ModelVersion,
Namespace: project.Name,
ArtifactURI: modelSvc.ArtifactURI,
Type: models.ModelTypeTensorflow,
Options: &models.ModelOption{},
Metadata: modelSvc.Metadata,
DeploymentMode: deployment.ServerlessDeploymentMode,
AutoscalingPolicy: &autoscaling.AutoscalingPolicy{
MetricsType: autoscaling.Concurrency,
TargetValue: 0.7,
},
Protocol: protocol.HttpJson,
},
resourcePercentage: queueResourcePercentage,
deploymentScale: defaultDeploymentScale,
exp: &kservev1beta1.InferenceService{
ObjectMeta: metav1.ObjectMeta{
Name: modelSvc.Name,
Namespace: project.Name,
Annotations: map[string]string{
knserving.QueueSidecarResourcePercentageAnnotationKey: queueResourcePercentage,
kserveconstant.DeploymentMode: string(kserveconstant.Serverless),
knautoscaling.ClassAnnotationKey: knautoscaling.KPA,
knautoscaling.MetricAnnotationKey: knautoscaling.Concurrency,
knautoscaling.TargetAnnotationKey: "1",
knautoscaling.TargetUtilizationPercentageKey: "49",
knautoscaling.InitialScaleAnnotationKey: fmt.Sprint(testPredictorScale),
},
Labels: map[string]string{
"gojek.com/app": modelSvc.Metadata.App,
"gojek.com/component": models.ComponentModelVersion,
"gojek.com/environment": testEnvironmentName,
"gojek.com/orchestrator": testOrchestratorName,
"gojek.com/stream": modelSvc.Metadata.Stream,
"gojek.com/team": modelSvc.Metadata.Team,
"sample": "true",
},
},
Spec: kservev1beta1.InferenceServiceSpec{
Predictor: kservev1beta1.PredictorSpec{
Tensorflow: &kservev1beta1.TFServingSpec{
PredictorExtensionSpec: kservev1beta1.PredictorExtensionSpec{
StorageURI: &storageUri,
Container: corev1.Container{
Name: kserveconstant.InferenceServiceContainerName,
Resources: expDefaultModelResourceRequests,
LivenessProbe: probeConfig,
Env: []corev1.EnvVar{},
},
},
},
ComponentExtensionSpec: kservev1beta1.ComponentExtensionSpec{
MinReplicas: &defaultModelResourceRequests.MinReplica,
MaxReplicas: defaultModelResourceRequests.MaxReplica,
},
},
},
},
},
{
name: "serverless deployment using rps autoscaling",
modelSvc: &models.Service{
Expand Down

0 comments on commit e09e014

Please sign in to comment.