#!/bin/bash
# worker-gpu-install.sh
# Run on every GPU worker node that has A100 cards.
# Installs containerd + kubelet from an offline bundle, the NVIDIA driver
# and container toolkit, and pre-configures MIG.

# Strict mode: abort on errors, unset variables, and pipeline failures
# (the original used plain `set -e`, which misses the latter two).
set -euo pipefail

# Root of the offline artifact bundle staged on each node.
readonly OFFLINE_DIR=/opt/offline
# Install containerd, CNI plugins, and the Kubernetes binaries from the
# offline tarballs (same layout as on the control-plane node).
tar --no-overwrite-dir -C /usr/local -xzf "${OFFLINE_DIR}/containerd.tar.gz"
mkdir -p /opt/cni/bin
tar -xzf "${OFFLINE_DIR}/cni-plugins.tgz" -C /opt/cni/bin/

# install(1) copies and sets the executable mode atomically,
# replacing the original cp + chmod pair.
install -m 0755 "${OFFLINE_DIR}/k8s-binaries/kubeadm" /usr/bin/kubeadm
install -m 0755 "${OFFLINE_DIR}/k8s-binaries/kubelet" /usr/bin/kubelet
# Configure containerd as a systemd service (same unit as on the
# control-plane node).
cat > /etc/systemd/system/containerd.service << 'EOF'
[Unit]
Description=containerd daemon
After=network.target

[Service]
ExecStartPre=/sbin/modprobe overlay
ExecStart=/usr/local/bin/containerd
Restart=always
Type=notify
Delegate=yes
KillMode=process

[Install]
WantedBy=multi-user.target
EOF

# A freshly written unit file is not visible to systemd until the
# manager reloads its configuration (the original skipped this step).
systemctl daemon-reload
# enable --now == enable + start in one call.
systemctl enable --now containerd
# Configure kubelet. Fixes relative to the original unit:
#  - KUBELET_EXTRA_ARGS was defined but never referenced by ExecStart,
#    so the containerd runtime flags silently never applied.
#  - "VolumeMountPropagation=" is not a systemd directive and is
#    rejected by the unit parser; dropped (use MountFlags= if mount
#    propagation tuning is actually needed).
#  - StartLimitInterval in [Service] is the deprecated spelling; current
#    systemd expects StartLimitIntervalSec= in [Unit].
# NOTE(review): --container-runtime=remote was removed from kubelet in
# Kubernetes 1.27+; drop the flag if the offline binaries are that new.
cat > /etc/systemd/system/kubelet.service << 'EOF'
[Unit]
Description=kubelet
After=containerd.service
Requires=containerd.service
StartLimitIntervalSec=0

[Service]
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
ExecStart=/usr/bin/kubelet $KUBELET_EXTRA_ARGS
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# Pick up the new unit file, then enable (started by kubeadm join later).
systemctl daemon-reload
systemctl enable kubelet
# Install the NVIDIA driver from the offline .run installer.
echo "=== 安装 NVIDIA 驱动 ==="
chmod +x "${OFFLINE_DIR}"/nvidia/NVIDIA-Linux-x86_64-*.run
# Silent install with DKMS so the module survives kernel upgrades;
# no OpenGL files needed on a headless compute node.
"${OFFLINE_DIR}/nvidia/NVIDIA-Linux-x86_64-535.161.08.run" -s --dkms --no-opengl-files

# Load the kernel modules now (no reboot required).
modprobe nvidia
modprobe nvidia-uvm

# Install the NVIDIA Container Toolkit and wire it into containerd.
dpkg -i "${OFFLINE_DIR}"/nvidia/nvidia-container-toolkit*.deb
# Required step the original omitted: the .deb alone does not edit
# /etc/containerd/config.toml, so containerd never learns about the
# nvidia runtime without this.
nvidia-ctk runtime configure --runtime=containerd
systemctl restart containerd
# Enable MIG mode (required on A100 for partitioned scheduling).
echo "=== 配置 MIG 模式 ==="
# Example layout: split every card into 2 MIG instances (adjust to need).
# Fix vs. original: MIG mode was enabled on GPU 0 only, yet instances
# were created on GPUs 0 and 1 — enumerate all GPUs and do both steps
# per card instead of hard-coding indices.
gpu_count=$(nvidia-smi -L | wc -l)

for (( i = 0; i < gpu_count; i++ )); do
  nvidia-smi -i "$i" -mig 1
done
# Give the driver a moment to settle after the mode change.
# NOTE(review): if MIG was previously disabled, a GPU reset or reboot
# may be required before instances can be created — confirm per node.
sleep 5

for (( i = 0; i < gpu_count; i++ )); do
  # Create two 3g.20gb GPU instances and their compute instances (-C).
  nvidia-smi mig -i "$i" -cgi 3g.20gb,3g.20gb -C
done
# Mark the node as a GPU node.
# Fix vs. original: the heredoc delimiter was quoted ('EOF'), which
# suppresses expansion, so the manifest literally contained the text
# "$(hostname)" instead of the node name. Unquoted EOF expands it.
cat > /tmp/gpu-label.yaml << EOF
apiVersion: v1
kind: Node
metadata:
  name: $(hostname)
  labels:
    node-type: gpu-worker
    nvidia.com/gpu.present: "true"
EOF

# Note: apply the label only after the node has joined the cluster.
echo "✅ 安装完成,请先加入集群"
echo "然后在 master 上运行:kubectl label node $(hostname) node-type=gpu-worker nvidia.com/gpu.present=true"