bugfix
This commit is contained in:
parent
e7665ebf9e
commit
aee0ca2658
23
README.md
23
README.md
@ -1,2 +1,25 @@
|
|||||||
# k8s-deploy
|
# k8s-deploy
|
||||||
|
|
||||||
|
## 环境说明
|
||||||
|
集群基本信息,k8s+kubevirt, NFS共享存储,外网带宽总计容量XXX, 外网IP池总计YYY个IP
|
||||||
|
|
||||||
|
一个控制节点,1到多个CPU工作节点,
|
||||||
|
一到多个a100gpu工作节点,
|
||||||
|
本地镜像仓库,镜像仓库部署在控制节点上,端口5000
|
||||||
|
共享存储ip:XX.xxxx.xx.xx,路径:
|
||||||
|
vm的存储从NFS共享存储中分配
|
||||||
|
|
||||||
|
所有节点安装ubuntu 22.04
|
||||||
|
|
||||||
|
## 创建一个集群
|
||||||
|
输入参数
|
||||||
|
```
|
||||||
|
{
|
||||||
|
nfs_node
|
||||||
|
nfs_path
|
||||||
|
repo_port
|
||||||
|
ctl_node
|
||||||
|
}
```
|
||||||
|
|
||||||
|
控制节点:192.168.16.5,本地镜像仓库位于:192.168.16.5:5000,NFS共享存储:192.168.16.2:/d/share/101206,外网带宽共计100G,外网地址池1000个IP
|
||||||
|
|
||||||
|
|||||||
19
config/config.yaml
Normal file
19
config/config.yaml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
# Settings loaded by installer/render.py and passed to the *.sh.j2 templates,
# which read them as cluster.*, storage.*, gpu.* and registry.*.
cluster:
  name: "h100-cluster"
  kubernetes_version: "1.28.2"
  # Must match the pause image version fetched by the download script.
  pause_image: "registry.k8s.io/pause:3.9"
  pod_cidr: "10.244.0.0/16"
  service_cidr: "10.96.0.0/12"
  api_server_ip: "192.168.16.5" # [EDIT ME] control-plane (master) IP

storage:
  nfs_server: "192.168.16.2" # [EDIT ME] NFS server IP
  nfs_path: "/d/share/101206"

gpu:
  driver_filename: "NVIDIA-Linux-x86_64-535.129.03.run" # [EDIT ME] name of the downloaded driver file

registry:
  ip: "192.168.16.5" # [EDIT ME] local registry IP (usually the master)
  port: 5000
|
||||||
171
downloader/dl.sh
Normal file
171
downloader/dl.sh
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
#!/bin/bash
# Downloads the artifacts for an offline Kubernetes / KubeVirt / NVIDIA GPU
# installation into ./k8s-offline-bundle (relative to the current directory).
set -euo pipefail

# NGC API keys are created at https://org.ngc.nvidia.com/setup/api-keys.
# Never store a key in this file: supply it at runtime (for example through
# `docker login nvcr.io`). The key that used to be committed here is exposed
# in the repository history and must be revoked.

# ================= Configuration =================
ARCH=amd64
WORKDIR="$(pwd)/k8s-offline-bundle"
K8S_VERSION="1.28.2"
HELM_VERSION="v3.13.1"
CNI_VERSION="v1.3.0"
CALICO_VERSION="v3.26.1"
KUBEVIRT_VERSION="v1.1.0" # upgraded to a more stable release
NVIDIA_DRIVER_VERSION="535.129.03"
# =================================================
|
||||||
|
|
||||||
|
echo ">>> [0/6] 初始化目录..."
# pypkgs is included because the later `pip download` step changes into it.
mkdir -p "$WORKDIR"/{bin,debs,images,drivers,charts,manifests,scripts,pypkgs}

PKGS_TO_DOWNLOAD=(
  nfs-common socat conntrack ipset ebtables lvm2 gnupg2
  software-properties-common curl ca-certificates apt-transport-https
  redis-server
)

cd "$WORKDIR/debs"
sudo apt-get update -q

for pkg in "${PKGS_TO_DOWNLOAD[@]}"; do
  echo "Processing package: $pkg"
  # `apt-get download` fetches only the named package, not its dependencies;
  # use apt-rdepends (or similar) when the full dependency closure is needed.
  # A single failed package is reported but does not abort the run.
  apt-get download "$pkg" || echo "Warning: Failed to download $pkg" >&2
done

apt-get download python3-pip python3-venv

# Tool chain needed to build the NVIDIA kernel module on the target node.
# Errors are left visible so a missing headers package is noticed here.
apt-get download build-essential "linux-headers-$(uname -r)" pkg-config

# NVIDIA container runtime packages (requires the NVIDIA apt repository to be
# configured on this host). Again only the named packages are fetched.
# `apt-get download` needs no root, so the files stay owned by the caller.
apt-get download nvidia-container-toolkit libnvidia-container-tools \
  libnvidia-container1 nvidia-container-runtime cuda-keyring

ls -l "$WORKDIR/debs"
|
||||||
|
|
||||||
|
# Docker is required on the download host to pull and save the images.
if ! command -v docker >/dev/null 2>&1; then
  echo "正在安装 Docker (用于拉取镜像)..."
  # Package installation needs root; use sudo like the apt-get update above.
  sudo apt-get update
  sudo apt-get install -y docker.io
fi
|
||||||
|
|
||||||
|
# ================= 1. Binaries =================
echo ">>> [1/6] 下载二进制工具 (Helm, CNI)..."
cd "$WORKDIR/bin"

# Kubernetes binaries (kubeadm, kubelet, kubectl).
# -f makes curl fail on an HTTP error instead of saving the error page as the
# "binary" and marking it executable.
for k8s_bin in kubeadm kubelet kubectl; do
  curl -fL --retry 3 \
    "https://dl.k8s.io/v${K8S_VERSION}/bin/linux/${ARCH}/${k8s_bin}" \
    -o "$k8s_bin"
done
chmod +x kubeadm kubelet kubectl

# Helm (architecture follows ARCH instead of a hard-coded amd64).
if [ ! -f "helm" ]; then
  echo "Downloading Helm..."
  helm_tgz="helm-${HELM_VERSION}-linux-${ARCH}.tar.gz"
  wget -q "https://get.helm.sh/${helm_tgz}"
  tar -zxvf "$helm_tgz"
  mv "linux-${ARCH}/helm" .
  rm -rf "linux-${ARCH}" "$helm_tgz"
fi

# CNI plugins
cni_tgz="cni-plugins-linux-${ARCH}-${CNI_VERSION}.tgz"
if [ ! -f "$cni_tgz" ]; then
  echo "Downloading CNI Plugins..."
  wget -q "https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/${cni_tgz}"
fi

echo "Binaries ready."
|
||||||
|
|
||||||
|
# ================= 2. Container images =================
echo ">>> [2/6] 拉取并打包容器镜像 (这需要较长时间)..."
# Best effort: the daemon may already be running.
service docker start || true

# Image list: Kubernetes core, Calico, Multus, KubeVirt, NFS provisioner and
# NVIDIA components.
# The pause and etcd tags must match what kubeadm requests for K8S_VERSION
# (check with `kubeadm config images list --kubernetes-version v<version>`).
# kubeadm v1.28.2 asks for etcd 3.5.9-0; with a different tag in the bundle an
# offline `kubeadm init` has no usable etcd image.
NVIDIA_REPO="nvcr.io/nvidia"
IMAGES=(
  "registry.k8s.io/kube-apiserver:v${K8S_VERSION}"
  "registry.k8s.io/kube-controller-manager:v${K8S_VERSION}"
  "registry.k8s.io/kube-scheduler:v${K8S_VERSION}"
  "registry.k8s.io/kube-proxy:v${K8S_VERSION}"
  "registry.k8s.io/pause:3.9"
  "registry.k8s.io/etcd:3.5.9-0"
  "registry.k8s.io/coredns/coredns:v1.10.1"
  "docker.io/calico/cni:${CALICO_VERSION}"
  "docker.io/calico/node:${CALICO_VERSION}"
  "docker.io/calico/kube-controllers:${CALICO_VERSION}"
  "docker.io/library/registry:2"
  "ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2"
  "quay.io/kubevirt/virt-operator:${KUBEVIRT_VERSION}"
  "quay.io/kubevirt/virt-api:${KUBEVIRT_VERSION}"
  "quay.io/kubevirt/virt-controller:${KUBEVIRT_VERSION}"
  "quay.io/kubevirt/virt-handler:${KUBEVIRT_VERSION}"
  "quay.io/kubevirt/virt-launcher:${KUBEVIRT_VERSION}"
  "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2"
  "${NVIDIA_REPO}/k8s-device-plugin:v0.14.1"
)

# Optional NVIDIA images, currently not bundled:
# ${NVIDIA_REPO}/container-toolkit:v1.13.5-ubuntu20.04
# ${NVIDIA_REPO}/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04
# ${NVIDIA_REPO}/gpu-feature-discovery:v0.8.1
# ${NVIDIA_REPO}/driver:535.104.05-ubuntu22.04

cd "$WORKDIR/images"
for img in "${IMAGES[@]}"; do
  # File name: the image reference with '/' and ':' replaced by '_'.
  FILENAME=${img//\//_}
  FILENAME="${FILENAME//:/_}.tar"

  if [ -f "$FILENAME" ]; then
    echo "跳过已存在: $FILENAME"
    continue
  fi

  echo "Pulling $img ..."
  docker pull "$img"
  echo "Saving to $FILENAME ..."
  # Write under a temporary name first so an interrupted save is never
  # mistaken for a finished archive (and skipped) on the next run.
  docker save "$img" -o "${FILENAME}.part"
  mv "${FILENAME}.part" "$FILENAME"
  # Free disk space: drop the image from the local Docker cache once saved.
  docker rmi "$img"
done
|
||||||
|
|
||||||
|
# ================= 3. NVIDIA driver =================
echo ">>> [3/6] 下载 NVIDIA H100 驱动 (.run)..."
cd "$WORKDIR/drivers"
DRIVER_NAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
if [ ! -f "$DRIVER_NAME" ]; then
  echo "Downloading NVIDIA Driver..."
  # Download under a temporary name so a broken transfer does not leave a
  # truncated installer that the existence check above would accept later.
  wget -q -O "${DRIVER_NAME}.part" \
    "https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/${DRIVER_NAME}"
  mv "${DRIVER_NAME}.part" "$DRIVER_NAME"
fi
|
||||||
|
|
||||||
|
# ================= 4. YAML manifests =================
echo ">>> [4/6] 下载 K8s YAML 配置文件..."
cd "$WORKDIR/manifests"

# -f: fail on HTTP errors instead of saving the error page as a manifest.

# Calico
curl -fL -o calico.yaml \
  "https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/calico.yaml"

# KubeVirt
KUBEVIRT_REL="https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}"
curl -fL -o kubevirt-operator.yaml "${KUBEVIRT_REL}/kubevirt-operator.yaml"
curl -fL -o kubevirt-cr.yaml "${KUBEVIRT_REL}/kubevirt-cr.yaml"

# Multus: pin the manifest to the release of the bundled multus-cni image
# rather than the moving master branch.
# NOTE(review): confirm the image tag referenced inside this manifest matches
# the multus-cni image in the bundle.
MULTUS_VERSION="v4.0.2"
curl -fL -o multus-daemonset.yaml \
  "https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/${MULTUS_VERSION}/deployments/multus-daemonset.yml"
|
||||||
|
|
||||||
|
# ================= 5. Helm charts =================
echo ">>> [5/6] 下载 Helm Charts..."
cd "$WORKDIR/charts"

NFS_CHART_VERSION="4.0.18"
# Use the helm repo when helm is available on this host, otherwise fetch the
# packaged chart directly.
if command -v helm >/dev/null 2>&1; then
  helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/
  helm repo update
  helm pull nfs-subdir-external-provisioner/nfs-subdir-external-provisioner --version "$NFS_CHART_VERSION"
else
  echo "Helm not installed on host, downloading chart directly via wget..."
  wget -q "https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-${NFS_CHART_VERSION}/nfs-subdir-external-provisioner-${NFS_CHART_VERSION}.tgz"
fi

# Python packages. Create the directory before entering it so this step does
# not depend on the initial directory layout.
mkdir -p "$WORKDIR/pypkgs"
cd "$WORKDIR/pypkgs"
for py_repo in apppublic sqlor ahserver pcapi; do
  pip download "git+https://git.opencomputing.cn/yumoqing/${py_repo}"
done
|
||||||
|
# ================= 6. Verification =================
# Print the size of every bundle directory followed by a short checklist.
separator="---------------------------------------------"
printf '%s\n' "$separator" ">>> 下载工作全部完成!正在统计文件大小..."
cd "$WORKDIR"
du -sh *
printf '%s\n' \
  "$separator" \
  "请检查 debs 目录是否依然有文件 (这是之前下载的)。" \
  "images 目录应该有几 GB 大小。" \
  "drivers 目录应该有 400MB+。"
|
||||||
|
|
||||||
35
installer/render.py
Executable file
35
installer/render.py
Executable file
@ -0,0 +1,35 @@
|
|||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
from jinja2 import Environment, FileSystemLoader
|
||||||
|
|
||||||
|
def render():
    """Render the installer shell scripts from the Jinja2 templates.

    Reads ``../config/config.yaml`` and the ``templates`` directory relative
    to this file (not the current working directory), renders each template
    with the loaded configuration and writes ``<name>.sh`` (mode 0755) next
    to this file.

    Returns:
        True when the config was found and every template rendered,
        False otherwise.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(base_dir, '..', 'config', 'config.yaml')
    if not os.path.exists(config_path):
        print("Config file not found.")
        return False

    # The config contains non-ASCII comments, so do not rely on the locale's
    # default encoding. An empty file yields None; fall back to an empty dict.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f) or {}

    env = Environment(loader=FileSystemLoader(os.path.join(base_dir, 'templates')))
    templates = ['common.sh.j2', 'master.sh.j2', 'worker_cpu.sh.j2', 'worker_gpu.sh.j2']

    output_dir = base_dir
    os.makedirs(output_dir, exist_ok=True)

    all_ok = True
    for temp_name in templates:
        try:
            template = env.get_template(temp_name)
            # "master.sh.j2" -> "master.sh"
            output_name = temp_name.split('.')[0] + '.sh'
            rendered = template.render(config)

            out_path = os.path.join(output_dir, output_name)
            # Templates emit non-ASCII messages; write them as UTF-8.
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(rendered)
            os.chmod(out_path, 0o755)
            print(f"Generated: {out_path}")
        except Exception as e:
            # Keep rendering the remaining templates, but remember the failure
            # so the process can report it through its exit status.
            all_ok = False
            print(f"Error rendering {temp_name}: {e}")

    return all_ok


if __name__ == "__main__":
    raise SystemExit(0 if render() else 1)
|
||||||
93
installer/templates/common.sh.j2
Normal file
93
installer/templates/common.sh.j2
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
#!/bin/bash
# Generated by Installer V2
# Shared node bootstrap steps; the master and worker scripts source this file.
set -e

# Absolute path of the directory containing this file. BASH_SOURCE is used
# instead of $0 so the result is the same whether the file is executed or
# sourced. The bundle directories (debs, images, bin) live one level above it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
BUNDLE_ROOT=$(dirname "$SCRIPT_DIR")
DEBS_DIR="$BUNDLE_ROOT/debs"
IMAGES_DIR="$BUNDLE_ROOT/images"
BIN_DIR="$BUNDLE_ROOT/bin"
|
||||||
|
|
||||||
|
echo "[INFO] 1. 系统基础配置..."
swapoff -a
# Comment out active swap entries in fstab. Lines that are already commented
# are skipped so repeated runs do not stack '#', and any whitespace is
# accepted around the "swap" field because fstab may be tab-separated.
sed -i -E '/^[[:space:]]*#/! s/^(.*[[:space:]]swap[[:space:]].*)$/#\1/' /etc/fstab

# Kernel modules loaded at boot and right now.
cat <<'MOD' > /etc/modules-load.d/k8s.conf
overlay
br_netfilter
MOD
modprobe overlay
modprobe br_netfilter

# Bridged traffic must pass through iptables and IP forwarding must be on.
cat <<'SYS' > /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
SYS
sysctl --system
|
||||||
|
|
||||||
|
echo "[INFO] 2. 安装离线依赖包 (使用 APT 智能解析)..."
if [ ! -d "$DEBS_DIR" ]; then
  echo "[ERROR] 找不到 debs 目录: $DEBS_DIR"
  exit 1
fi

# Remove distribution packages that may conflict with the bundled ones.
apt-get remove -y containerd docker docker.io || true

# Collect the packages first: with an empty directory the glob stays literal
# and dpkg would fail with a confusing "cannot access ./*.deb" error.
deb_files=("$DEBS_DIR"/*.deb)
if [ ! -e "${deb_files[0]}" ]; then
  echo "[ERROR] debs 目录中没有 .deb 文件: $DEBS_DIR"
  exit 1
fi

# All packages are handed to one dpkg invocation so it can unpack everything
# before configuring.
dpkg -i "${deb_files[@]}"

# Later steps expect to run from the script directory.
cd "$SCRIPT_DIR"
|
||||||
|
|
||||||
|
echo "[INFO] 3. 配置 Containerd..."
mkdir -p /etc/containerd
containerd config default > /etc/containerd/config.toml

# Patch the default configuration in a single pass:
#   1. use the systemd cgroup driver;
#   2. pin sandbox_image to the locally imported pause image so containerd
#      does not try to pull it from registry.k8s.io;
#   3. enable per-registry host configuration under certs.d.
sed -i \
  -e 's/SystemdCgroup = false/SystemdCgroup = true/g' \
  -e 's|sandbox_image = .*|sandbox_image = "{{ cluster.pause_image }}"|g' \
  -e 's|config_path = ""|config_path = "/etc/containerd/certs.d"|g' \
  /etc/containerd/config.toml

# Host entry for the local registry (plain HTTP, pull/resolve only).
registry_certs_dir="/etc/containerd/certs.d/{{ registry.ip }}:{{ registry.port }}"
mkdir -p "$registry_certs_dir"
cat > "$registry_certs_dir/hosts.toml" <<REG
server = "http://{{ registry.ip }}:{{ registry.port }}"
[host."http://{{ registry.ip }}:{{ registry.port }}"]
capabilities = ["pull", "resolve"]
REG

systemctl restart containerd
systemctl enable containerd
|
||||||
|
|
||||||
|
echo "[INFO] 4. 安装 K8s 二进制与 CNI..."
# NOTE(review): this step only unpacks the CNI plugins; kubelet/kubeadm/kubectl
# are assumed to be installed already - confirm where that happens.
# Accept any bundled plugin version, so changing the CNI version in the
# download script does not silently skip this step.
cni_tarballs=("$BIN_DIR"/cni-plugins-linux-*.tgz)
if [ -f "${cni_tarballs[0]}" ]; then
  mkdir -p /opt/cni/bin
  tar -C /opt/cni/bin -zxvf "${cni_tarballs[0]}"
else
  echo "[WARN] 未找到 CNI 插件包: $BIN_DIR/cni-plugins-linux-*.tgz" >&2
fi
|
||||||
|
|
||||||
|
# Print the fully qualified reference of the first image stored in a
# `docker save` archive, read from the RepoTags entry of its manifest.json.
# The archive file name cannot be used for this: both '/' and ':' were
# replaced by '_' when it was created, which is not reversible.
# Arguments: $1 - path to the .tar archive
# Returns:   0 and the reference on stdout, 1 when no tag could be read
archive_image_ref() {
  local archive=$1
  local ref first
  ref=$(tar -xOf "$archive" manifest.json 2>/dev/null \
    | grep -o '"RepoTags":\["[^"]*"' \
    | head -n 1 \
    | cut -d '"' -f 4) || true
  [ -n "$ref" ] || return 1

  # Docker writes Docker Hub images in short form ("registry:2",
  # "calico/cni:v3.26.1"); expand them to the docker.io/... form that ctr
  # expects for a source reference.
  first=${ref%%/*}
  if [[ "$ref" != */* ]]; then
    ref="docker.io/library/$ref"
  elif [[ "$first" != *.* && "$first" != *:* && "$first" != "localhost" ]]; then
    ref="docker.io/$ref"
  fi
  printf '%s\n' "$ref"
}

echo "[INFO] 5. 导入离线镜像..."
LOCAL_REGISTRY="{{ registry.ip }}:{{ registry.port }}"
if [ -d "$IMAGES_DIR" ]; then
  for img in "$IMAGES_DIR"/*.tar; do
    [ -e "$img" ] || continue
    echo "Importing $img..."
    ctr -n k8s.io images import "$img"

    if ! p_img=$(archive_image_ref "$img"); then
      echo "[WARN] 无法从 $img 读取镜像名称,跳过本地仓库标记" >&2
      continue
    fi

    # Local name: registry address plus the last path component, for example
    # registry.k8s.io/coredns/coredns:v1.10.1 -> <registry>/coredns:v1.10.1
    base_pimg=${p_img##*/}
    limg="$LOCAL_REGISTRY/$base_pimg"
    echo "Importing $limg..."
    # --force lets a re-run overwrite a tag created by a previous run.
    ctr -n k8s.io images tag --force "$p_img" "$limg"
    # The registry may not be running yet (the master script deploys it after
    # sourcing this file), so a failed push is reported but must not abort.
    ctr -n k8s.io images push --plain-http "$limg" \
      || echo "[WARN] 推送失败 (本地仓库可能尚未启动): $limg" >&2
  done
fi

echo "[INFO] 6. 启动 Kubelet..."
systemctl enable --now kubelet
|
||||||
86
installer/templates/master.sh.j2
Normal file
86
installer/templates/master.sh.j2
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
#!/bin/bash
# Control-plane bootstrap: runs the shared node setup, then kubeadm init.
source ./common.sh

echo "[INFO] === 初始化 Master 节点 ==="

# kubeadm configuration; control-plane images are resolved against the local
# registry address.
cat <<CFG > kubeadm-config.yaml
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: v{{ cluster.kubernetes_version }}
controlPlaneEndpoint: "{{ cluster.api_server_ip }}:6443"
networking:
  podSubnet: "{{ cluster.pod_cidr }}"
  serviceSubnet: "{{ cluster.service_cidr }}"
imageRepository: {{ registry.ip }}:{{ registry.port }}
---
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
cgroupDriver: systemd
CFG

# Preflight checks
kubeadm init phase preflight --config kubeadm-config.yaml --ignore-preflight-errors=all

# Actual initialisation. Images were imported manually, so no kubeadm pull.
# pipefail: without it the pipeline returns tee's status and a failed
# `kubeadm init` would go unnoticed, letting the script continue.
set -o pipefail
kubeadm init --config kubeadm-config.yaml --upload-certs | tee kubeadm-init.log
|
||||||
|
|
||||||
|
# Install the admin kubeconfig for the current user.
mkdir -p "$HOME/.kube"
# -f instead of -i: an interactive overwrite prompt would stall an unattended
# run (or a re-run) of this script.
cp -f /etc/kubernetes/admin.conf "$HOME/.kube/config"
chown "$(id -u):$(id -g)" "$HOME/.kube/config"
|
||||||
|
|
||||||
|
echo "[INFO] 部署网络插件 (Calico)..."
kubectl apply -f "$BUNDLE_ROOT/manifests/calico.yaml"

echo "[INFO] 部署本地 Registry 容器..."
# The registry runs as a plain Pod on this control-plane node, using the
# already imported registry:2 image, so no docker/nerdctl CLI is needed.
# - nodeSelector + toleration: control-plane nodes carry a NoSchedule taint,
#   so without them the Pod could not be placed on this node.
# - REGISTRY_HTTP_ADDR / containerPort follow the configured registry port
#   instead of assuming 5000.
kubectl apply -f - <<REG_YAML
apiVersion: v1
kind: Pod
metadata:
  name: local-registry
  namespace: kube-system
  labels:
    app: registry
spec:
  hostNetwork: true
  nodeSelector:
    node-role.kubernetes.io/control-plane: ""
  tolerations:
    - key: node-role.kubernetes.io/control-plane
      operator: Exists
      effect: NoSchedule
  containers:
    - name: registry
      image: registry:2
      env:
        - name: REGISTRY_HTTP_ADDR
          value: "0.0.0.0:{{ registry.port }}"
      ports:
        - containerPort: {{ registry.port }}
      volumeMounts:
        - mountPath: /var/lib/registry
          name: registry-data
  volumes:
    - name: registry-data
      hostPath:
        path: /var/lib/registry
        type: DirectoryOrCreate
REG_YAML
|
||||||
|
|
||||||
|
echo "[INFO] 部署 Storage & Virtualization..."
# Install Helm
cp "$BUNDLE_ROOT/bin/helm" /usr/local/bin/

# NFS provisioner chart: use an unpacked chart directory when present,
# otherwise the packaged chart, whose file name carries the chart version
# (nfs-subdir-external-provisioner-<version>.tgz).
nfs_chart="$BUNDLE_ROOT/charts/nfs-subdir-external-provisioner"
if [ ! -d "$nfs_chart" ]; then
  for chart_pkg in "$BUNDLE_ROOT"/charts/nfs-subdir-external-provisioner-*.tgz; do
    if [ -f "$chart_pkg" ]; then
      nfs_chart="$chart_pkg"
    fi
  done
fi

"$BUNDLE_ROOT/bin/helm" install nfs-subdir-provisioner "$nfs_chart" \
  --set "nfs.server={{ storage.nfs_server }}" \
  --set "nfs.path={{ storage.nfs_path }}" \
  --set storageClass.defaultClass=true

# KubeVirt
kubectl apply -f "$BUNDLE_ROOT/manifests/kubevirt-operator.yaml"
kubectl apply -f "$BUNDLE_ROOT/manifests/kubevirt-cr.yaml"

# Multus
kubectl apply -f "$BUNDLE_ROOT/manifests/multus-daemonset.yaml"
|
||||||
|
|
||||||
|
echo "[INFO] 生成 Worker 加入脚本..."
# The redirection fails if the target directory is missing, so create it.
# The worker scripts look for ./join_cluster.sh in their own working
# directory; the generated file has to be copied there.
join_dir=../../output
mkdir -p "$join_dir"
kubeadm token create --print-join-command > "$join_dir/join_cluster.sh"
chmod +x "$join_dir/join_cluster.sh"
echo "Master 部署完成!请检查 kubectl get nodes"
|
||||||
18
installer/templates/worker_cpu.sh.j2
Normal file
18
installer/templates/worker_cpu.sh.j2
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
# CPU worker: shared node setup, virtualization check, then cluster join.
source ./common.sh

echo "[INFO] === 配置 CPU 工作节点 ==="

echo "[CHECK] 检查虚拟化支持..."
# vmx (Intel VT-x) or svm (AMD-V) must appear in the CPU flags.
if ! grep -E -q '(vmx|svm)' /proc/cpuinfo; then
  echo "[ERROR] CPU 不支持虚拟化 (VT-x/AMD-V),KubeVirt 将无法正常工作!"
  # exit 1 # enable to make the missing feature fatal
fi

echo "[INFO] 正在加入集群..."
if [ ! -f "./join_cluster.sh" ]; then
  echo "[ERROR] 未找到 join_cluster.sh。请从 Master 节点拷贝该文件到当前目录。"
  exit 1
fi
bash ./join_cluster.sh
|
||||||
55
installer/templates/worker_gpu.sh.j2
Normal file
55
installer/templates/worker_gpu.sh.j2
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/bash
# GPU worker: shared node setup, NVIDIA driver and runtime, then cluster join.
source ./common.sh

echo "[INFO] === 配置 GPU H100 工作节点 ==="

DRIVER_FILE="$BUNDLE_ROOT/drivers/{{ gpu.driver_filename }}"

echo "[STEP 1] 处理 Nouveau 驱动冲突..."
# If nouveau is loaded, blacklist it, rebuild the initramfs and stop: the
# machine must be rebooted before this script is run again.
if lsmod | grep -q nouveau; then
  echo "[WARN] 检测到 Nouveau 驱动已加载!"
  printf '%s\n' \
    'blacklist nouveau' \
    'options nouveau modeset=0' \
    > /etc/modprobe.d/blacklist-nouveau.conf
  update-initramfs -u
  echo "[ACTION REQUIRED] Nouveau 已禁用。请重启机器,然后再次运行此脚本!"
  exit 1
fi
|
||||||
|
|
||||||
|
echo "[STEP 2] 检查编译环境..."
# The .run driver installer compiles a kernel module, so a tool chain is
# required. Query the package's own status: grepping `dpkg -l` output would
# also match packages that were removed but not purged (state "rc").
if ! dpkg -s build-essential 2>/dev/null | grep -q '^Status: install ok installed'; then
  echo "[WARN] 未检测到 build-essential。如果在完全离线环境且没有 GCC,.run 安装将失败。"
  echo "尝试继续,但如果失败,请先安装 gcc, make 和 linux-headers-$(uname -r)。"
fi
|
||||||
|
|
||||||
|
echo "[STEP 3] 安装 NVIDIA 驱动..."
if [ ! -f "$DRIVER_FILE" ]; then
  echo "[ERROR] 驱动文件不存在: $DRIVER_FILE"
  exit 1
fi

# Installer flags passed below:
#   -s                silent installation
#   --no-questions    do not prompt
#   --accept-license  accept the licence
# NOTE(review): --no-dkms is not passed; add it if DKMS is unavailable in the
# offline environment.
chmod +x "$DRIVER_FILE"
bash "$DRIVER_FILE" -s --no-questions --accept-license
|
||||||
|
|
||||||
|
echo "[STEP 4] 配置 NVIDIA Container Toolkit..."
# The nvidia-container-toolkit package is expected to come from the bundled
# debs; register the NVIDIA runtime with containerd and restart it.
nvidia-ctk runtime configure --runtime=containerd
systemctl restart containerd

echo "[INFO] 正在加入集群..."
if [ ! -f "./join_cluster.sh" ]; then
  echo "[ERROR] 未找到 join_cluster.sh。"
  exit 1
fi
bash ./join_cluster.sh

echo "[INFO] 节点加入成功。请在 Master 执行以下命令启用 GPU Operator:"
echo "kubectl label node $(hostname) nvidia.com/gpu.present=true"
|
||||||
Loading…
x
Reference in New Issue
Block a user