diff --git a/app/k8sManager/k8s_utils_linuxos_ubuntu.py b/app/k8sManager/k8s_utils_linuxos_ubuntu.py
index b6069b0..413246f 100644
--- a/app/k8sManager/k8s_utils_linuxos_ubuntu.py
+++ b/app/k8sManager/k8s_utils_linuxos_ubuntu.py
@@ -296,6 +296,7 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
         "kind": "StatefulSet",
         "metadata": {"name": statefulset_name, "namespace": namespace_name},
         "spec": {
+            "runtimeClassName": "nvidia",  # use the nvidia runtime
             "replicas": replicas,
             "selector": {"matchLabels": source_selflabel},
             "serviceName": statefulset_name,
diff --git a/app/k8sManager/multiple_clusters.py b/app/k8sManager/multiple_clusters.py
index 21f08a1..66f62f6 100644
--- a/app/k8sManager/multiple_clusters.py
+++ b/app/k8sManager/multiple_clusters.py
@@ -207,6 +207,7 @@ async def new_cluster_install(params):
         # "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
         "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
         "files/config.toml": "/opt/config.toml",
+        "files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
         "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
         "files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
         "files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
diff --git a/files/config.toml b/files/config.toml
index dc942bf..01066cf 100644
--- a/files/config.toml
+++ b/files/config.toml
@@ -83,7 +83,7 @@ version = 2
       setup_serially = false
 
     [plugins."io.containerd.grpc.v1.cri".containerd]
-      default_runtime_name = "nvidia"
+      default_runtime_name = "runc"
       disable_snapshot_annotations = true
       discard_unpacked_layers = false
       ignore_blockio_not_enabled_errors = false
diff --git a/files/nvidia-device-plugin.yml b/files/nvidia-device-plugin.yml
index eee27cd..cf6c140 100644
--- a/files/nvidia-device-plugin.yml
+++ b/files/nvidia-device-plugin.yml
@@ -28,13 +28,14 @@ spec:
       labels:
         name: nvidia-device-plugin-ds
     spec:
+      runtimeClassName: nvidia  # use the nvidia runtime
       tolerations:
       - key: nvidia.com/gpu
         operator: Exists
         effect: NoSchedule
       priorityClassName: "system-node-critical"
       containers:
-      - image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
         name: nvidia-device-plugin-ctr
         securityContext:
           allowPrivilegeEscalation: false
diff --git a/files/runtimeclass-nvidia.yaml b/files/runtimeclass-nvidia.yaml
new file mode 100644
index 0000000..256dbb6
--- /dev/null
+++ b/files/runtimeclass-nvidia.yaml
@@ -0,0 +1,6 @@
+# runtimeclass-nvidia.yaml
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia  # must match the name referenced in Pod specs
+handler: nvidia  # must match the runtime name in the containerd config
\ No newline at end of file
diff --git a/script/k8s_install.sh b/script/k8s_install.sh
index e209bea..8e64067 100644
--- a/script/k8s_install.sh
+++ b/script/k8s_install.sh
@@ -540,14 +540,16 @@ if [ "$1" == "master" ]; then
     # 安装网络插件
     log_info "正在安装网络插件(flannel)"
     kubectl apply -f /opt/kube-flannel.yml || log_error "本地安装flannel网络插件失败"
     log_info "正在安装MetricsServer插件"
     kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
     # log_info "正在安装Ingress-nginx-controller插件"
     # kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
-    # log_info "正在安装GPU模式必要插件"
-    # kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
+    log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
+    kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
+    log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
+    kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件nvidia-device-plugin.yml失败"
     log_info "正在安装nfs-client-provisioner插件"
     aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
     if [ $? -ne 0 ]; then
         echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
         exit 1