-
Notifications
You must be signed in to change notification settings - Fork 41
/
model_types.go
160 lines (128 loc) · 5.76 KB
/
model_types.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/*
Copyright 2024.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// ModelSpec defines the desired state of Model.
// Two cross-field rules are enforced on the whole spec: cacheProfile may only
// be set when url starts with "hf://", and when maxReplicas is set,
// minReplicas must be less than or equal to it.
// +kubebuilder:validation:XValidation:rule="!has(self.cacheProfile) || self.url.startsWith(\"hf://\")", message="cacheProfile is only supported with a huggingface url (\"hf://...\") at the moment."
// +kubebuilder:validation:XValidation:rule="!has(self.maxReplicas) || self.minReplicas <= self.maxReplicas", message="minReplicas should be less than or equal to maxReplicas."
type ModelSpec struct {
// URL of the model to be served.
// Currently only the following formats are supported:
// For VLLM & FasterWhisper engines: "hf://<model-repo>/<model-name>"
// For OLlama engine: "ollama://<model>"
// The value is required, must start with "hf://" or "ollama://", and cannot
// be changed after it has been set.
// +kubebuilder:validation:Required
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="url is immutable."
// +kubebuilder:validation:XValidation:rule="self.startsWith(\"hf://\") || self.startsWith(\"ollama://\")", message="url must start with \"hf://\" or \"ollama://\" and not be empty."
URL string `json:"url"`
// Features that the model supports.
// Dictates the APIs that are available for the model.
// Each entry must be one of TextGeneration, TextEmbedding or SpeechToText.
Features []ModelFeature `json:"features"`
// Engine to be used for the server process.
// Must be one of OLlama, VLLM, FasterWhisper or Infinity.
// +kubebuilder:validation:Enum=OLlama;VLLM;FasterWhisper;Infinity
// +kubebuilder:validation:Required
Engine string `json:"engine"`
// ResourceProfile required to serve the model.
// Use the format "<resource-profile-name>:<count>".
// Example: "nvidia-gpu-l4:2" - 2x NVIDIA L4 GPUs.
// Must be a valid ResourceProfile defined in the system config.
ResourceProfile string `json:"resourceProfile,omitempty"`
// CacheProfile to be used for caching model artifacts.
// Must be a valid CacheProfile defined in the system config.
// The value is immutable and is only accepted together with an "hf://" url.
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="cacheProfile is immutable."
CacheProfile string `json:"cacheProfile,omitempty"`
// Image to be used for the server process.
// Will be set from ResourceProfile + Engine if not specified.
Image string `json:"image,omitempty"`
// Args to be added to the server process.
Args []string `json:"args,omitempty"`
// Env variables to be added to the server process.
Env map[string]string `json:"env,omitempty"`
// Replicas is the number of Pod replicas that should be actively
// serving the model. KubeAI will manage this field unless AutoscalingDisabled
// is set to true.
// This field is also the spec path of the Model's scale subresource.
Replicas *int32 `json:"replicas,omitempty"`
// MinReplicas is the minimum number of Pod replicas that the model can scale down to.
// Note: 0 is a valid value.
// +kubebuilder:validation:Minimum=0
// +kubebuilder:validation:Optional
MinReplicas int32 `json:"minReplicas"`
// MaxReplicas is the maximum number of Pod replicas that the model can scale up to.
// Empty value means no limit.
// When set, it must be at least 1 and not less than MinReplicas.
// +kubebuilder:validation:Minimum=1
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
// AutoscalingDisabled will stop the controller from managing the replicas
// for the Model. When disabled, metrics will not be collected on server Pods.
AutoscalingDisabled bool `json:"autoscalingDisabled,omitempty"`
// TargetRequests is the average number of active requests that the autoscaler
// will try to maintain on model server Pods.
// Must be at least 1; defaults to 100.
// +kubebuilder:validation:Minimum=1
// +kubebuilder:default=100
TargetRequests *int32 `json:"targetRequests"`
// ScaleDownDelaySeconds is the minimum time, in seconds, before a deployment
// is scaled down after the autoscaling algorithm determines that it should be
// scaled down. Defaults to 30.
// +kubebuilder:default=30
ScaleDownDelaySeconds *int64 `json:"scaleDownDelaySeconds"`
// Owner of the model. Used solely to populate the owner field in the
// OpenAI /v1/models endpoint.
// DEPRECATED.
// +kubebuilder:validation:Optional
Owner string `json:"owner"`
}
// ModelFeature names a capability that a Model can list in its spec features.
// Allowed values are TextGeneration, TextEmbedding and SpeechToText.
// +kubebuilder:validation:Enum=TextGeneration;TextEmbedding;SpeechToText
type ModelFeature string
// Feature names accepted in a Model's features list. They are untyped string
// constants, so they can be used both as plain strings and as ModelFeature
// values.
const (
	// ModelFeatureTextGeneration is the "TextGeneration" feature name.
	ModelFeatureTextGeneration = "TextGeneration"
	// ModelFeatureTextEmbedding is the "TextEmbedding" feature name.
	ModelFeatureTextEmbedding = "TextEmbedding"
	// ModelFeatureSpeechToText is the "SpeechToText" feature name.
	// TODO (samos123): Add validation that Speech to Text only supports Faster Whisper.
	ModelFeatureSpeechToText = "SpeechToText"
)
// Engine names; these match the values permitted by the Enum marker on
// ModelSpec.Engine.
const (
	// OLlamaEngine is the "OLlama" engine name.
	OLlamaEngine = "OLlama"
	// VLLMEngine is the "VLLM" engine name.
	VLLMEngine = "VLLM"
	// FasterWhisperEngine is the "FasterWhisper" engine name.
	FasterWhisperEngine = "FasterWhisper"
	// InfinityEngine is the "Infinity" engine name.
	InfinityEngine = "Infinity"
)
// ModelStatus defines the observed state of Model.
type ModelStatus struct {
// Replicas holds the replica counts reported for the Model.
Replicas ModelStatusReplicas `json:"replicas,omitempty"`
// Cache holds the reported cache state. It is a pointer and is left out of
// the JSON output when nil.
Cache *ModelStatusCache `json:"cache,omitempty"`
}
// ModelStatusReplicas holds the replica counts reported in ModelStatus.
type ModelStatusReplicas struct {
// All is the replica count exposed through the Model's scale subresource
// (status path .status.replicas.all).
All int32 `json:"all"`
// Ready is presumably the number of replicas that are ready to serve;
// confirm against the controller that populates it.
Ready int32 `json:"ready"`
}
// ModelStatusCache holds the cache state reported in ModelStatus.
type ModelStatusCache struct {
// Loaded presumably reports whether the model artifacts have been loaded
// into the cache; confirm against the controller that sets it.
Loaded bool `json:"loaded"`
}
// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas.all
// Model resources define the ML models that will be served by KubeAI.
// The markers on this type enable a status subresource and a scale
// subresource that uses .spec.replicas and .status.replicas.all.
type Model struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`
// Spec is the desired state of the Model.
Spec ModelSpec `json:"spec,omitempty"`
// Status is the observed state of the Model.
Status ModelStatus `json:"status,omitempty"`
}
// +kubebuilder:object:root=true
// ModelList contains a list of Models.
type ModelList struct {
metav1.TypeMeta `json:",inline"`
metav1.ListMeta `json:"metadata,omitempty"`
// Items holds the Models in the list.
Items []Model `json:"items"`
}
// init registers the Model and ModelList types with the package's
// SchemeBuilder when the package is loaded.
func init() {
	SchemeBuilder.Register(new(Model), new(ModelList))
}