增强友好性

This commit is contained in:
ysh 2025-07-18 18:15:18 +08:00
parent 43ada048c4
commit 58383646e2
6 changed files with 17 additions and 6 deletions

View File

@ -296,6 +296,7 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
"kind": "StatefulSet", "kind": "StatefulSet",
"metadata": {"name": statefulset_name, "namespace": namespace_name}, "metadata": {"name": statefulset_name, "namespace": namespace_name},
"spec": { "spec": {
"runtimeClassName": "nvidia", # 指定使用 nvidia runtime"
"replicas": replicas, "replicas": replicas,
"selector": {"matchLabels": source_selflabel}, "selector": {"matchLabels": source_selflabel},
"serviceName": statefulset_name, "serviceName": statefulset_name,

View File

@ -207,6 +207,7 @@ async def new_cluster_install(params):
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml", # "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml", "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
"files/config.toml": "/opt/config.toml", "files/config.toml": "/opt/config.toml",
"files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml", "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb", "files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb", "files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",

View File

@ -83,7 +83,7 @@ version = 2
setup_serially = false setup_serially = false
[plugins."io.containerd.grpc.v1.cri".containerd] [plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia" default_runtime_name = "runc"
disable_snapshot_annotations = true disable_snapshot_annotations = true
discard_unpacked_layers = false discard_unpacked_layers = false
ignore_blockio_not_enabled_errors = false ignore_blockio_not_enabled_errors = false

View File

@ -28,13 +28,14 @@ spec:
labels: labels:
name: nvidia-device-plugin-ds name: nvidia-device-plugin-ds
spec: spec:
runtimeClassName: nvidia # 指定使用 nvidia runtime
tolerations: tolerations:
- key: nvidia.com/gpu - key: nvidia.com/gpu
operator: Exists operator: Exists
effect: NoSchedule effect: NoSchedule
priorityClassName: "system-node-critical" priorityClassName: "system-node-critical"
containers: containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0 - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
name: nvidia-device-plugin-ctr name: nvidia-device-plugin-ctr
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false

View File

@ -0,0 +1,6 @@
# runtimeclass-nvidia.yaml
# Declares a RuntimeClass named "nvidia" so that Pods can select it with
# `runtimeClassName: nvidia`.
# NOTE(review): every key appears at column 0 in this view; in the real
# manifest `name` presumably sits under `metadata` - confirm the indentation.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
name: nvidia # must match the name referenced by Pods
handler: nvidia # corresponds to the runtime name in the containerd config

View File

@ -544,10 +544,12 @@ if [ "$1" == "master" ]; then
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败" # kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
log_info "正在安装nfs-client-provisioner插件" log_info "正在安装nfs-client-provisioner插件"
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7 aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
# log_info "正在安装MetricsServer插件" log_info "正在安装MetricsServer插件"
# kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败" kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
# log_info "正在安装GPU模式必要插件" log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
# kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败" kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "NFS 服务器端安装失败,请检查网络连接或软件源。" echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
exit 1 exit 1