Merge pull request 'GPU feature initialization' (#1) from dev1 into main

Reviewed-on: #1
commit 8f07804099
ysh committed 2025-07-16 21:55:39 +08:00
8 changed files with 86 additions and 13 deletions


@@ -207,6 +207,10 @@ def new_cluster_install(params):
         # "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
         "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
         "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
+        "files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
+        "files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
+        "files/nvidia-container-toolkit_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit_1.17.8-1_amd64.deb",
+        "files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit-base_1.17.8-1_amd64.deb",
         "script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
         "script/import_images.sh": "/opt/import_images.sh",
     }
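
Note: the hunk above only extends the local-to-remote copy map; the packages themselves are installed later by the node script. A minimal pre-flight sketch (illustration only, not part of this commit) that checks the newly listed GPU artifacts exist locally before distribution:

# Illustration only: confirm the GPU artifacts named in the copy map exist
# locally before they are pushed to /opt on the nodes.
for f in \
    files/libnvidia-container-tools_1.17.8-1_amd64.deb \
    files/libnvidia-container1_1.17.8-1_amd64.deb \
    files/nvidia-container-toolkit_1.17.8-1_amd64.deb \
    files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb \
    files/nvidia-device-plugin.yml; do
    [ -e "$f" ] || { echo "missing artifact: $f" >&2; exit 1; }
done
echo "all GPU artifacts present"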


@@ -186,7 +186,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
            if time.time() - start_time > sudo_timeout:
                raise Exception(f"Timed out waiting for the sudo password prompt ({sudo_timeout}s): {sudo_cmd}")
            if not ready:
-               time.sleep(1.5)  # avoid excessive CPU usage
+               time.sleep(0.5)  # avoid excessive CPU usage
    # If no password prompt was received but the command still timed out, it is likely a permissions issue
    if not password_prompt:
@@ -218,7 +218,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
                    error += line
                if channel.exit_status_ready():
                    break
-               time.sleep(1.5)  # avoid excessive CPU usage
+               time.sleep(0.1)  # avoid excessive CPU usage
        else:
            # Read output in non-real-time mode
            result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""


@@ -0,0 +1,49 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      priorityClassName: "system-node-critical"
      containers:
      - image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
        name: nvidia-device-plugin-ctr
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
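
Note: once this manifest is applied (the copy map above places it at /opt/nvidia-device-plugin.yml), GPU nodes should advertise the nvidia.com/gpu resource. A rough verification sketch, assuming kubectl access from the control plane (illustration only, not part of this commit):

# Illustration only: deploy the device plugin and confirm GPU capacity is reported.
kubectl apply -f /opt/nvidia-device-plugin.yml
kubectl -n kube-system rollout status daemonset/nvidia-device-plugin-daemonset
# GPU worker nodes should now list an allocatable nvidia.com/gpu resource:
kubectl describe nodes | grep -A 3 'nvidia.com/gpu'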


@@ -309,16 +309,30 @@ echo "Initial containerd configuration correction complete."
 echo "Updating containerd configuration for GPU instances"
-# Check whether an NVIDIA GPU is present
+# Check for NVIDIA GPU
 if lspci | grep -i nvidia > /dev/null 2>&1; then
-    log_info "NVIDIA GPU detected, configuring nvidia-container-runtime..."
-    dpkg -i /opt/*.deb || log_error "Failed to install nvidia-container-runtime and its dependencies!"
-    # Configure containerd to support the nvidia runtime
-    CONTAINERD_CONFIG="/etc/containerd/config.toml"
-    if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
-        cat <<EOF >> "$CONTAINERD_CONFIG"
+    if [ "$1" == "worker" ]; then
+        log_info "NVIDIA GPU detected, configuring nvidia-container-runtime..."
+        # Check that the .deb packages are present
+        if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then
+            log_error "No .deb files found under /opt/"
+            exit 1
+        fi
+        # Install the .deb packages
+        for deb in /opt/*.deb; do
+            dpkg -i "$deb" || log_error "Failed to install $deb"
+        done
+        # Configure containerd
+        CONTAINERD_CONFIG="/etc/containerd/config.toml"
+        if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
+            cat <<EOF >> "$CONTAINERD_CONFIG"
 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
   privileged_without_host_devices = false
@@ -326,11 +340,17 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
   BinaryName = "/usr/bin/nvidia-container-runtime"
 EOF
-    fi
-    # Restart containerd
-    systemctl restart containerd
-    log_info "nvidia-container-runtime configuration complete, containerd has been restarted"
+        fi
+        # Set default_runtime_name = "nvidia"
+        if ! grep -q '^default_runtime_name = "nvidia"$' "$CONTAINERD_CONFIG"; then
+            sed -i '/\[plugins."io.containerd.grpc.v1.cri"\]/{n;s/.*/ default_runtime_name = "nvidia"/;}' "$CONTAINERD_CONFIG"
+        fi
+        # Restart containerd
+        systemctl restart containerd
+        log_info "nvidia-container-runtime configuration complete, containerd has been restarted"
+    fi
 else
     log_info "No NVIDIA GPU detected, skipping nvidia-container-runtime configuration"
 fi
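
Note: after this script runs on a GPU worker, the nvidia runtime should be registered in containerd and set as the default runtime. A quick post-install check, assuming the NVIDIA driver is already installed on the host (illustration only, not part of this commit):

# Illustration only: confirm the containerd changes took effect on a GPU worker.
grep -n 'runtimes.nvidia' /etc/containerd/config.toml
grep -n 'default_runtime_name' /etc/containerd/config.toml
systemctl is-active containerd
# With the NVIDIA driver present, the toolkit should be able to enumerate GPUs:
nvidia-smi -L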