



apiVersion: batch/v1
kind: Job
metadata:
  name: download-model
  labels:
    app: download-model
spec:
  template:
    metadata:
      name: download-model
      labels:
        app: download-model
      annotations:
        eks.tke.cloud.tencent.com/root-cbs-size: "100" # The system disk of a super node is only 20 GiB by default and fills up once the vLLM image is unpacked. This annotation customizes the system disk size (the portion beyond 20 GiB is billed).
    spec:
      containers:
        - name: vllm
          image: vllm/vllm-openai:v0.7.1
          command:
            - modelscope
            - download
            - --local_dir=/data/model/Qwen2.5-Coder-7B-Instruct
            - --model=Qwen/Qwen2.5-Coder-7B-Instruct
          volumeMounts:
            - name: data
              mountPath: /data/model
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: ai-model # Name of the created PVC
      restartPolicy: OnFailure
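Before moving on, you can confirm the download finished. A minimal check, assuming the Job manifest above was saved as download-model.yaml (the file name is an assumption):

# Apply the Job, block until it completes, then inspect the downloader logs
kubectl apply -f download-model.yaml
kubectl wait --for=condition=complete job/download-model --timeout=30m
kubectl logs job/download-model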
# Install component dependencies
kubectl create -f https://github.com/vllm-project/aibrix/releases/download/v0.2.1/aibrix-dependency-v0.2.1.yaml
# Install AIBrix components
kubectl create -f https://github.com/vllm-project/aibrix/releases/download/v0.2.1/aibrix-core-v0.2.1.yaml
kubectl -n aibrix-system get pods
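All pods in the aibrix-system namespace should eventually reach the Running state. One way to block until they are ready (a sketch; adjust the timeout to your environment):

kubectl -n aibrix-system wait --for=condition=Ready pods --all --timeout=600s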
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: RayClusterFleet
metadata:
  labels:
    app.kubernetes.io/name: aibrix
    app.kubernetes.io/managed-by: kustomize
  name: qwen-coder-7b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      model.aibrix.ai/name: qwen-coder-7b-instruct
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        model.aibrix.ai/name: qwen-coder-7b-instruct
      annotations:
        ray.io/overwrite-container-cmd: "true"
    spec:
      rayVersion: "2.10.0" # Must match the Ray version inside the container
      headGroupSpec:
        rayStartParams:
          dashboard-host: "0.0.0.0"
        template:
          metadata:
            annotations:
              eks.tke.cloud.tencent.com/gpu-type: V100 # Specify the GPU model
              eks.tke.cloud.tencent.com/root-cbs-size: "100" # The system disk of a super node is only 20 GiB by default and fills up once the vLLM image is unpacked. This annotation customizes the system disk size (the portion beyond 20 GiB is billed).
          spec:
            containers:
              - name: ray-head
                image: vllm/vllm-openai:v0.7.1
                ports:
                  - containerPort: 6379
                    name: gcs-server
                  - containerPort: 8265
                    name: dashboard
                  - containerPort: 10001
                    name: client
                  - containerPort: 8000
                    name: service
                command: ["/bin/bash", "-lc", "--"]
                args:
                  - |
                    ulimit -n 65536;
                    echo head;
                    $KUBERAY_GEN_RAY_START_CMD & $KUBERAY_GEN_WAIT_FOR_RAY_NODES_CMDS;
                    vllm serve /data/model/Qwen2.5-Coder-7B-Instruct \
                      --served-model-name Qwen/Qwen2.5-Coder-7B-Instruct \
                      --tensor-parallel-size 2 \
                      --distributed-executor-backend ray \
                      --dtype=half
                resources:
                  limits:
                    cpu: "4"
                    nvidia.com/gpu: 1
                  requests:
                    cpu: "4"
                    nvidia.com/gpu: 1
                volumeMounts:
                  - name: data
                    mountPath: /data/model
            volumes:
              - name: data
                persistentVolumeClaim:
                  claimName: ai-model # Name of the created PVC
      workerGroupSpecs:
        - replicas: 1
          minReplicas: 1
          maxReplicas: 5
          groupName: small-group
          rayStartParams: {}
          template:
            metadata:
              annotations:
                eks.tke.cloud.tencent.com/gpu-type: V100 # Specify the GPU model
                eks.tke.cloud.tencent.com/root-cbs-size: "100" # Same system-disk annotation as on the head node
            spec:
              containers:
                - name: ray-worker
                  image: vllm/vllm-openai:v0.7.1
                  command: ["/bin/bash", "-lc", "--"]
                  args: ["ulimit -n 65536; echo worker; $KUBERAY_GEN_RAY_START_CMD"]
                  lifecycle:
                    preStop:
                      exec:
                        command: ["/bin/sh", "-c", "ray stop"]
                  resources:
                    limits:
                      cpu: "4"
                      nvidia.com/gpu: 1
                    requests:
                      cpu: "4"
                      nvidia.com/gpu: 1
                  volumeMounts:
                    - name: data
                      mountPath: /data/model
              volumes:
                - name: data
                  persistentVolumeClaim:
                    claimName: ai-model # Name of the created PVC
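After applying the manifest (saved here as qwen-coder-7b-instruct.yaml, a name assumed for illustration), you can watch the head and worker pods come up via the model.aibrix.ai/name label set in the fleet template. Whether the CRD is queryable as rayclusterfleet below is an assumption about its resource name:

kubectl apply -f qwen-coder-7b-instruct.yaml
# Pods carry the label defined in the fleet template
kubectl get pods -l model.aibrix.ai/name=qwen-coder-7b-instruct -o wide
# Resource name assumed from the RayClusterFleet kind
kubectl get rayclusterfleet qwen-coder-7b-instruct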
# Get the service name
svc=$(kubectl get svc -o name | grep qwen-coder-7b-instruct)
# Use port forwarding to expose the API on local port 18000
kubectl port-forward $svc 18000:8000
# In another terminal, run the following command to test the API
curl -X POST "http://localhost:18000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
    "messages": [
      {"role": "system", "content": "You are an AI programming assistant."},
      {"role": "user", "content": "Implement quick sort algorithm in Python"}
    ],
    "temperature": 0.3,
    "max_tokens": 512,
    "top_p": 0.9
  }'
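vLLM's OpenAI-compatible server also supports streamed responses. A sketch of the same request with "stream": true (-N keeps curl from buffering the chunked output):

curl -N -X POST "http://localhost:18000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-Coder-7B-Instruct",
    "messages": [{"role": "user", "content": "Implement quick sort algorithm in Python"}],
    "stream": true
  }'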
If a pod fails to start and its log contains the following error:

runtime/cgo: pthread_create failed: Operation not permitted

add this annotation to the pod template so the workload is scheduled onto an Intel CPU:

eks.tke.cloud.tencent.com/cpu-type: intel # Specify the CPU type as Intel
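For placement, a sketch of the head-group pod template from the manifest above with the extra annotation added (the worker-group template would get the same change):

template:
  metadata:
    annotations:
      eks.tke.cloud.tencent.com/gpu-type: V100
      eks.tke.cloud.tencent.com/root-cbs-size: "100"
      eks.tke.cloud.tencent.com/cpu-type: intel # Specify the CPU type as Intel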
The service as deployed above accepts unauthenticated requests. To require authentication, start vLLM with the --api-key parameter or set the VLLM_API_KEY environment variable; every client request must then carry an Authorization: Bearer <VLLM_API_KEY> header.
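For illustration, a sketch using the placeholder key my-secret-key (substitute your own secret): extend the vllm serve line in the head container with the flag, then send the matching header with each request.

# In the head container args, extend the serve command:
vllm serve /data/model/Qwen2.5-Coder-7B-Instruct \
  --served-model-name Qwen/Qwen2.5-Coder-7B-Instruct \
  --api-key my-secret-key   # placeholder key; equivalently, set the VLLM_API_KEY env var

# Client side: requests without a valid key are rejected with 401
curl -X POST "http://localhost:18000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer my-secret-key" \
  -d '{"model": "Qwen/Qwen2.5-Coder-7B-Instruct", "messages": [{"role": "user", "content": "hello"}]}'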