Compare commits
2 Commits
3f6944adb0
...
8f07804099
| Author | SHA1 | Date | |
|---|---|---|---|
| 8f07804099 | |||
|
|
a6632b7e14 |
@ -207,6 +207,10 @@ def new_cluster_install(params):
|
||||
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
||||
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
||||
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
||||
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
||||
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
||||
"files/nvidia-container-toolkit_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit_1.17.8-1_amd64.deb",
|
||||
"files/nvidia-container-toolkit-base_1.17.8-1_amd64.deb": "/opt/nvidia-container-toolkit-base_1.17.8-1_amd64.deb",
|
||||
"script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
|
||||
"script/import_images.sh": "/opt/import_images.sh",
|
||||
}
|
||||
|
||||
@ -186,7 +186,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
|
||||
if time.time() - start_time > sudo_timeout:
|
||||
raise Exception(f"等待sudo密码提示超时({sudo_timeout}秒): {sudo_cmd}")
|
||||
if not ready:
|
||||
time.sleep(1.5) # 避免CPU占用过高
|
||||
time.sleep(0.5) # 避免CPU占用过高
|
||||
|
||||
# 如果没有收到密码提示但命令执行超时,可能是权限问题
|
||||
if not password_prompt:
|
||||
@ -218,7 +218,7 @@ def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, us
|
||||
error += line
|
||||
if channel.exit_status_ready():
|
||||
break
|
||||
time.sleep(1.5) # 避免CPU占用过高
|
||||
time.sleep(0.1) # 避免CPU占用过高
|
||||
else:
|
||||
# 非实时模式读取输出
|
||||
result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""
|
||||
|
||||
49
files/nvidia-device-plugin.yml
Normal file
49
files/nvidia-device-plugin.yml
Normal file
@ -0,0 +1,49 @@
|
||||
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: nvidia-device-plugin-daemonset
|
||||
namespace: kube-system
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
name: nvidia-device-plugin-ds
|
||||
updateStrategy:
|
||||
type: RollingUpdate
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
name: nvidia-device-plugin-ds
|
||||
spec:
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
priorityClassName: "system-node-critical"
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
|
||||
name: nvidia-device-plugin-ctr
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop: ["ALL"]
|
||||
volumeMounts:
|
||||
- name: device-plugin
|
||||
mountPath: /var/lib/kubelet/device-plugins
|
||||
volumes:
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: /var/lib/kubelet/device-plugins
|
||||
@ -309,16 +309,30 @@ echo "containerd配置初始纠正完成."
|
||||
|
||||
echo "开始更新containerd配置以适配GPU实例"
|
||||
|
||||
# 检查是否有 NVIDIA GPU
|
||||
# 检查 NVIDIA GPU
|
||||
if lspci | grep -i nvidia > /dev/null 2>&1; then
|
||||
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
|
||||
if [ "$1" == "worker" ]; then
|
||||
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
|
||||
|
||||
dpkg -i /opt/*.deb || log_error "安装nvidia-container-runtime及其依赖失败!"
|
||||
# 检查 .deb 文件是否存在
|
||||
if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then
|
||||
log_error "/opt/ 下没有 .deb 文件"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 配置 containerd 支持 nvidia runtime
|
||||
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
||||
if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
|
||||
cat <<EOF >> "$CONTAINERD_CONFIG"
|
||||
# 安装 .deb 包
|
||||
for deb in /opt/*.deb; do
|
||||
dpkg -i "$deb" || log_error "安装 $deb 失败"
|
||||
done
|
||||
|
||||
# 配置 containerd
|
||||
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
||||
if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
|
||||
cat <<EOF >> "$CONTAINERD_CONFIG"
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
|
||||
privileged_without_host_devices = false
|
||||
@ -326,11 +340,17 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
|
||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
||||
EOF
|
||||
fi
|
||||
fi
|
||||
|
||||
# 重启 containerd
|
||||
systemctl restart containerd
|
||||
log_info "nvidia-container-runtime 配置完成,containerd已重启"
|
||||
# 设置 default_runtime_name = "nvidia"
|
||||
if ! grep -q '^default_runtime_name = "nvidia"$' "$CONTAINERD_CONFIG"; then
|
||||
sed -i '/\[plugins."io.containerd.grpc.v1.cri"\]/{n;s/.*/ default_runtime_name = "nvidia"/;}' "$CONTAINERD_CONFIG"
|
||||
fi
|
||||
|
||||
# 重启 containerd
|
||||
systemctl restart containerd
|
||||
log_info "nvidia-container-runtime 配置完成,containerd已重启"
|
||||
fi
|
||||
else
|
||||
log_info "未检测到NVIDIA GPU,跳过nvidia-container-runtime配置"
|
||||
fi
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user