#!/bin/bash
# Provision a GPU (H100) worker node: handle the Nouveau conflict, install the
# NVIDIA driver from the offline bundle, configure the NVIDIA runtime for
# containerd and finally join the cluster.
#
# NOTE(review): this file looks like a Jinja-style template; the double-brace
# placeholder inside DRIVER_FILE is presumably substituted at render time.

# common.sh is resolved relative to the current working directory, so the
# script has to be started from the directory that contains it. Stop right
# away when it is missing instead of failing later, after the system has
# already been modified.
if [[ ! -f ./common.sh ]]; then
  echo "[ERROR] 未找到 common.sh。"
  exit 1
fi
source ./common.sh

echo "[INFO] === 配置 GPU H100 工作节点 ==="

# BUNDLE_ROOT is assumed to come from common.sh or the environment (TODO
# confirm). When it is empty the driver path would silently resolve under
# "/drivers", so refuse to continue.
if [[ -z "${BUNDLE_ROOT:-}" ]]; then
  echo "[ERROR] BUNDLE_ROOT 未设置,无法定位驱动文件。"
  exit 1
fi
DRIVER_FILE="$BUNDLE_ROOT/drivers/{{ gpu.driver_filename }}"

echo "[STEP 1] 处理 Nouveau 驱动冲突..."
# When the Nouveau module is loaded: blacklist it, rebuild the initramfs and
# stop. The script must be run again after the machine has been rebooted.
if lsmod | grep -q nouveau; then
  echo "[WARN] 检测到 Nouveau 驱动已加载!"

  # Write the two-line blacklist file; abort if it cannot be written, because
  # asking for a reboot would otherwise lead straight back to this branch.
  if ! printf '%s\n' \
      'blacklist nouveau' \
      'options nouveau modeset=0' \
      > /etc/modprobe.d/blacklist-nouveau.conf; then
    echo "[ERROR] 无法写入 /etc/modprobe.d/blacklist-nouveau.conf。"
    exit 1
  fi

  # Rebuild the initramfs so the blacklist is picked up at the next boot.
  if ! update-initramfs -u; then
    echo "[ERROR] update-initramfs 执行失败,Nouveau 可能在重启后仍被加载。"
    exit 1
  fi

  echo "[ACTION REQUIRED] Nouveau 已禁用。请重启机器,然后再次运行此脚本!"
  # Non-zero on purpose: provisioning is not finished until the re-run.
  exit 1
fi

echo "[STEP 2] 检查编译环境..."
# Installing the H100 driver requires compiling kernel modules.
# Query dpkg for the real install state of build-essential rather than
# grepping the `dpkg -l` listing, which also matches removed-but-not-purged
# entries and any other line that merely mentions the name.
# This is only a warning: the script keeps going either way.
if ! dpkg-query -W -f='${Status}' build-essential 2>/dev/null \
    | grep -q ' installed$'; then
  echo "[WARN] 未检测到 build-essential。如果在完全离线环境且没有 GCC,.run 安装将失败。"
  echo "尝试继续,但如果失败,请先安装 gcc, make 和 linux-headers-$(uname -r)。"
fi

echo "[STEP 3] 安装 NVIDIA 驱动..."
if [[ ! -f "$DRIVER_FILE" ]]; then
  echo "[ERROR] 驱动文件不存在: $DRIVER_FILE"
  exit 1
fi

# Installer flags:
#   -s                silent install
#   --no-questions    do not prompt
#   --accept-license  accept the license
# NOTE(review): the earlier notes also described --no-dkms ("offline
# environments usually have no DKMS unless it was downloaded on purpose"),
# but that flag is not passed -- confirm which behavior is intended.
#
# chmod is kept from the original flow; it is not strictly needed because the
# installer is executed through bash.
chmod +x "$DRIVER_FILE"

# Stop on installer failure: configuring the container runtime and joining
# the cluster without a working driver would yield a broken GPU node.
if ! bash "$DRIVER_FILE" -s --no-questions --accept-license; then
  echo "[ERROR] NVIDIA 驱动安装失败,已中止。详情请查看 /var/log/nvidia-installer.log。"
  exit 1
fi

echo "[STEP 4] 配置 NVIDIA Container Toolkit..."
# Per the original notes, common.sh has already installed the
# nvidia-container-toolkit deb package (not verifiable from this file).
# Configure the containerd runtime, then restart containerd. Either step
# failing aborts the script so a half-configured node never joins the cluster.
if ! nvidia-ctk runtime configure --runtime=containerd; then
  echo "[ERROR] nvidia-ctk 配置 containerd 运行时失败。"
  exit 1
fi

if ! systemctl restart containerd; then
  echo "[ERROR] containerd 重启失败。"
  exit 1
fi

echo "[INFO] 正在加入集群..."
# join_cluster.sh is looked up in the current working directory.
if [[ ! -f "./join_cluster.sh" ]]; then
  echo "[ERROR] 未找到 join_cluster.sh。"
  exit 1
fi

# Only report success when the join script itself succeeded.
if ! bash ./join_cluster.sh; then
  echo "[ERROR] join_cluster.sh 执行失败,节点未加入集群。"
  exit 1
fi

# Print the labeling command the operator has to run on the master node.
echo "[INFO] 节点加入成功。请在 Master 执行以下命令启用 GPU Operator:"
echo "kubectl label node $(hostname) nvidia.com/gpu.present=true"