commit
8f07804099
@ -207,6 +207,10 @@ def new_cluster_install(params):
|
|||||||
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
||||||
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
||||||
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
||||||
|
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
||||||
|
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
||||||
|
"files/nvidia-container-toolkit_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit_1.17.8-1_amd64.deb",
|
||||||
|
"files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit-base_1.17.8-1_amd64.deb",
|
||||||
"script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
|
"script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
|
||||||
"script/import_images.sh": "/opt/import_images.sh",
|
"script/import_images.sh": "/opt/import_images.sh",
|
||||||
}
|
}
|
||||||
|
|||||||
@ -186,7 +186,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
|
|||||||
if time.time() - start_time > sudo_timeout:
|
if time.time() - start_time > sudo_timeout:
|
||||||
raise Exception(f"等待sudo密码提示超时({sudo_timeout}秒): {sudo_cmd}")
|
raise Exception(f"等待sudo密码提示超时({sudo_timeout}秒): {sudo_cmd}")
|
||||||
if not ready:
|
if not ready:
|
||||||
time.sleep(1.5) # 避免CPU占用过高
|
time.sleep(0.5) # 避免CPU占用过高
|
||||||
|
|
||||||
# 如果没有收到密码提示但命令执行超时,可能是权限问题
|
# 如果没有收到密码提示但命令执行超时,可能是权限问题
|
||||||
if not password_prompt:
|
if not password_prompt:
|
||||||
@ -218,7 +218,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
|
|||||||
error += line
|
error += line
|
||||||
if channel.exit_status_ready():
|
if channel.exit_status_ready():
|
||||||
break
|
break
|
||||||
time.sleep(1.5) # 避免CPU占用过高
|
time.sleep(0.1) # 避免CPU占用过高
|
||||||
else:
|
else:
|
||||||
# 非实时模式读取输出
|
# 非实时模式读取输出
|
||||||
result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""
|
result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""
|
||||||
|
|||||||
49
files/nvidia-device-plugin.yml
Normal file
49
files/nvidia-device-plugin.yml
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
name: nvidia-device-plugin-daemonset
|
||||||
|
namespace: kube-system
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
name: nvidia-device-plugin-ds
|
||||||
|
updateStrategy:
|
||||||
|
type: RollingUpdate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
name: nvidia-device-plugin-ds
|
||||||
|
spec:
|
||||||
|
tolerations:
|
||||||
|
- key: nvidia.com/gpu
|
||||||
|
operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
|
priorityClassName: "system-node-critical"
|
||||||
|
containers:
|
||||||
|
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
|
||||||
|
name: nvidia-device-plugin-ctr
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
volumeMounts:
|
||||||
|
- name: device-plugin
|
||||||
|
mountPath: /var/lib/kubelet/device-plugins
|
||||||
|
volumes:
|
||||||
|
- name: device-plugin
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/kubelet/device-plugins
|
||||||
@ -309,16 +309,30 @@ echo "containerd配置初始纠正完成."
|
|||||||
|
|
||||||
echo "开始更新containerd配置以适配GPU实例"
|
echo "开始更新containerd配置以适配GPU实例"
|
||||||
|
|
||||||
# 检查是否有 NVIDIA GPU
|
# 检查 NVIDIA GPU
|
||||||
if lspci | grep -i nvidia > /dev/null 2>&1; then
|
if lspci | grep -i nvidia > /dev/null 2>&1; then
|
||||||
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
|
if [ "$1" == "worker" ]; then
|
||||||
|
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
|
||||||
|
|
||||||
dpkg -i /opt/*.deb || log_error "安装nvidia-container-runtime及其依赖失败!"
|
# 检查 .deb 文件是否存在
|
||||||
|
if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then
|
||||||
|
log_error "/opt/ 下没有 .deb 文件"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# 配置 containerd 支持 nvidia runtime
|
# 安装 .deb 包
|
||||||
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
for deb in /opt/*.deb; do
|
||||||
if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
|
dpkg -i "$deb" || log_error "安装 $deb 失败"
|
||||||
cat <<EOF >> "$CONTAINERD_CONFIG"
|
done
|
||||||
|
|
||||||
|
# 配置 containerd
|
||||||
|
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
||||||
|
if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
|
||||||
|
cat <<EOF >> "$CONTAINERD_CONFIG"
|
||||||
|
|
||||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
|
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
|
||||||
privileged_without_host_devices = false
|
privileged_without_host_devices = false
|
||||||
@ -326,11 +340,17 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
|
|||||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
|
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
|
||||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
BinaryName = "/usr/bin/nvidia-container-runtime"
|
||||||
EOF
|
EOF
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# 重启 containerd
|
# 设置 default_runtime_name = "nvidia"
|
||||||
systemctl restart containerd
|
if ! grep -q '^default_runtime_name = "nvidia"$' "$CONTAINERD_CONFIG"; then
|
||||||
log_info "nvidia-container-runtime 配置完成,containerd已重启"
|
sed -i '/\[plugins."io.containerd.grpc.v1.cri"\]/{n;s/.*/ default_runtime_name = "nvidia"/;}' "$CONTAINERD_CONFIG"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 重启 containerd
|
||||||
|
systemctl restart containerd
|
||||||
|
log_info "nvidia-container-runtime 配置完成,containerd已重启"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
log_info "未检测到NVIDIA GPU,跳过nvidia-container-runtime配置"
|
log_info "未检测到NVIDIA GPU,跳过nvidia-container-runtime配置"
|
||||||
fi
|
fi
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user