k8s-deploy/installer/templates/worker_gpu.sh.j2
2025-11-24 16:07:11 +08:00

56 lines
1.8 KiB
Django/Jinja
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Sets up a GPU (H100) worker node: handles the Nouveau driver conflict, checks
# the build toolchain, installs the NVIDIA driver, configures the NVIDIA
# Container Toolkit for containerd, and finally joins the node to the cluster.
#
# This file is a template: the driver filename placeholder below is filled in
# when the template is rendered. common.sh is loaded via a relative path, so
# the script must be started from the directory that contains it.

# Stop early when the shared helper file is missing instead of running the
# remaining steps without it.
if [[ ! -r ./common.sh ]]; then
  echo "[ERROR] 未找到 common.sh。"
  exit 1
fi
source ./common.sh

echo "[INFO] === 配置 GPU H100 工作节点 ==="

# BUNDLE_ROOT is not assigned in this file; presumably it is provided by
# common.sh or the environment (TODO confirm). An empty value would silently
# turn the driver path into "/drivers/...", so reject it here.
if [[ -z "${BUNDLE_ROOT:-}" ]]; then
  echo "[ERROR] BUNDLE_ROOT 未设置,无法定位驱动文件。"
  exit 1
fi
DRIVER_FILE="$BUNDLE_ROOT/drivers/{{ gpu.driver_filename }}"
echo "[STEP 1] 处理 Nouveau 驱动冲突..."
# When the nouveau module is currently loaded: write a modprobe blacklist,
# rebuild the initramfs, and exit non-zero so the operator reboots and runs
# this script again. When it is not loaded, this step does nothing.
if lsmod | grep -q nouveau; then
  echo "[WARN] 检测到 Nouveau 驱动已加载!"
  # Same two lines the blacklist file has always contained.
  if ! printf '%s\n' 'blacklist nouveau' 'options nouveau modeset=0' \
    > /etc/modprobe.d/blacklist-nouveau.conf; then
    echo "[ERROR] 无法写入 /etc/modprobe.d/blacklist-nouveau.conf。"
    exit 1
  fi
  # Do not claim Nouveau is disabled unless the initramfs rebuild succeeded.
  if ! update-initramfs -u; then
    echo "[ERROR] update-initramfs 执行失败,Nouveau 尚未被禁用。"
    exit 1
  fi
  echo "[ACTION REQUIRED] Nouveau 已禁用。请重启机器,然后再次运行此脚本!"
  exit 1
fi
echo "[STEP 2] 检查编译环境..."
# Installing the H100 driver requires compiling kernel modules.
# Ask dpkg for the status of the build-essential package itself rather than
# grepping the full package list, which also matches removed packages with
# leftover config and any other line that merely contains the string.
# This check only warns; the script continues either way.
build_essential_status=$(dpkg-query -W -f='${Status}' build-essential 2>/dev/null)
if [[ "$build_essential_status" != *"install ok installed"* ]]; then
  echo "[WARN] 未检测到 build-essential。如果在完全离线环境且没有 GCC.run 安装将失败。"
  echo "尝试继续,但如果失败,请先安装 gcc, make 和 linux-headers-$(uname -r)。"
fi
echo "[STEP 3] 安装 NVIDIA 驱动..."
# The driver installer must exist at the path computed earlier in DRIVER_FILE.
if [ ! -f "$DRIVER_FILE" ]; then
  echo "[ERROR] 驱动文件不存在: $DRIVER_FILE"
  exit 1
fi

# Installer flags used below:
#   -s                silent install
#   --no-questions    do not prompt
#   --accept-license  accept the license
# NOTE(review): the earlier comment also described --no-dkms ("offline
# environments usually have no DKMS unless it was downloaded on purpose"),
# but that flag is not passed here — confirm whether it should be.
chmod +x "$DRIVER_FILE"

# Abort when the installer fails so that the container runtime is not
# reconfigured and the node is not joined without a working driver.
if ! bash "$DRIVER_FILE" -s --no-questions --accept-license; then
  echo "[ERROR] NVIDIA 驱动安装失败。"
  exit 1
fi
echo "[STEP 4] 配置 NVIDIA Container Toolkit..."
# Per the original note, the nvidia-container-toolkit deb package has already
# been installed by common.sh (not verifiable from this file).
# Configure the containerd runtime; stop if that fails so the node does not
# go on to join the cluster with an unconfigured runtime.
if ! nvidia-ctk runtime configure --runtime=containerd; then
  echo "[ERROR] nvidia-ctk 配置 containerd 运行时失败。"
  exit 1
fi

# Restart containerd after the configuration change.
if ! systemctl restart containerd; then
  echo "[ERROR] containerd 重启失败。"
  exit 1
fi
echo "[INFO] 正在加入集群..."
# The join script is looked up relative to the current working directory.
if [ ! -f "./join_cluster.sh" ]; then
  echo "[ERROR] 未找到 join_cluster.sh。"
  exit 1
fi

# Only report success when the join script itself succeeded.
if ! bash ./join_cluster.sh; then
  echo "[ERROR] 加入集群失败。"
  exit 1
fi

# Print the follow-up command for the operator to run on the master node.
echo "[INFO] 节点加入成功。请在 Master 执行以下命令启用 GPU Operator:"
echo "kubectl label node $(hostname) nvidia.com/gpu.present=true"