From e3d1d04387ff343c1b2bc847a0647139a02df3f9 Mon Sep 17 00:00:00 2001
From: yumoqing
Date: Fri, 21 Nov 2025 15:21:45 +0800
Subject: [PATCH] bugfix

---
Review notes (this section is ignored by git am and is not part of the commit):
- cluster-config.yaml: removed the stray "i" from storage.nfs_server and added
  cni_version, which download_offline_packages.sh.j2 needs.
- download_offline_packages.sh.j2: defined the version/arch variables used by
  the download URLs, replaced the invalid apt invocation, fixed the relative
  cd into the bin directory, and populated the image list before looping.
- control-plane-install.sh.j2: images are imported before kubeadm init.
- Hunk lengths and the diffstat were updated by hand; the blob ids on the
  "index" lines of edited files were not recomputed.

 cluster-config.yaml              | 78 ++++++++++++++++++++++++++++++++
 control-plane-install.sh.j2      | 43 ++++++++++++++++++
 cpu-worker-install.sh.j2         | 24 ++++++++++
 download_offline_packages.sh.j2  | 60 +++++++++++++++++++++++++
 gpu-worker-install.sh.j2         | 30 ++++++++++++
 install_nvidia_driver.sh.j2      | 13 ++++++
 manifests/cdi-cr.yaml            |  6 +++
 manifests/cdi-operator.yaml      |  5 +++
 manifests/gpu-operator.yaml      |  9 ++++
 manifests/kubevirt-cr.yaml       |  8 ++++
 manifests/kubevirt-operator.yaml |  9 ++++
 manifests/nfs-csi.yaml           |  7 +++
 render.sh                        | 20 +++++++++
 rendered/README.md               |  0
 14 files changed, 312 insertions(+)
 create mode 100644 cluster-config.yaml
 create mode 100644 control-plane-install.sh.j2
 create mode 100644 cpu-worker-install.sh.j2
 create mode 100644 download_offline_packages.sh.j2
 create mode 100644 gpu-worker-install.sh.j2
 create mode 100644 install_nvidia_driver.sh.j2
 create mode 100644 manifests/cdi-cr.yaml
 create mode 100644 manifests/cdi-operator.yaml
 create mode 100644 manifests/gpu-operator.yaml
 create mode 100644 manifests/kubevirt-cr.yaml
 create mode 100644 manifests/kubevirt-operator.yaml
 create mode 100644 manifests/nfs-csi.yaml
 create mode 100755 render.sh
 create mode 100644 rendered/README.md

diff --git a/cluster-config.yaml b/cluster-config.yaml
new file mode 100644
index 0000000..8d53084
--- /dev/null
+++ b/cluster-config.yaml
@@ -0,0 +1,78 @@
+# cluster-config.yaml - edit to match your actual cluster configuration
+
+kubernetes:
+  version: "1.29.3"
+  pod_cidr: "10.244.0.0/16"
+  service_cidr: "10.96.0.0/12"
+  cluster_name: "offline-k8s-cluster"
+
+k8s_version: "1.29.3"
+containerd_version: "1.7.13"
+crictl_version: "1.29.0"
+cni_version: "1.4.0"
+
+kubevirt_version: "v1.28.0"
+cdi_version: "v1.65.0"
+
+nfs_server: "192.168.16.2"
+nfs_path: "/d/share/101206"
+
+
+registry: "registry.local:5000" # optional: only if a private registry is used
+
+
+control_plane_ip: "192.168.16.5"
+
+nodes:
+  control_plane:
+    hostname: "k8s-master"
+    ip: "192.168.16.5"
+
+join:
+  token: "abcdef.0123456789abcdef"
+  hash: "123456abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234"
+
+# =====================================================
+# GPU Operator / NVIDIA configuration
+# =====================================================
+gpu:
+  driver_version: "535"
+  cuda_version: "12.4"
+
+# =====================================================
+# KubeVirt configuration
+# =====================================================
+kubevirt:
+  version: "1.3.0"
+  namespace: "kubevirt"
+
+# =====================================================
+# CDI (Containerized Data Importer)
+# =====================================================
+cdi:
+  version: "1.58.0"
+  namespace: "cdi"
+
+# =====================================================
+# GPU Operator
+# =====================================================
+gpu_operator:
+  version: "v23.9.2"
+  driver_version: "535"
+  namespace: "gpu-operator"
+
+# =====================================================
+# NFS shared storage for VM disks
+# =====================================================
+storage:
+  nfs_server: "192.168.16.2"
+  nfs_path: "/d/share/11157"
+  storage_class_name: "nfs-kubevirt"
+
+# =====================================================
+# Offline bundle paths
+# =====================================================
+offline_bundle:
+  output_dir: "/opt/k8s-offline"
+  output_file: "k8s-offline.tgz"
+
diff --git a/control-plane-install.sh.j2 b/control-plane-install.sh.j2
new file mode 100644
index 0000000..f2cd7f4
--- /dev/null
+++ b/control-plane-install.sh.j2
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+OFFLINE=/opt/k8s-offline
+
+echo "[1] 解压离线包"
+mkdir -p $OFFLINE
+tar xf k8s-offline.tgz -C $OFFLINE
+
+echo "[2] 安装 kubeadm/kubelet/kubectl"
+install -m755 $OFFLINE/offline-cache/bin/* /usr/local/bin/
+
+# Import images before kubeadm init: with no registry reachable, init cannot pull them.
+echo "[3] 加载所有离线镜像"
+for img in $OFFLINE/offline-cache/images/*.tar; do
+  ctr -n=k8s.io images import "$img"
+done
+
+echo "[4] 初始化控制平面"
+kubeadm init \
+  --kubernetes-version={{ kubernetes.version }} \
+  --pod-network-cidr={{ kubernetes.pod_cidr }} \
+  --service-cidr={{ kubernetes.service_cidr }} \
+  --upload-certs
+
+mkdir -p ~/.kube
+cp /etc/kubernetes/admin.conf ~/.kube/config
+
+echo "[5] 部署 CNI(flannel)"
+# NOTE(review): this manifest is fetched from the internet; bundle it under manifests/ for a fully offline install.
+kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml
+
+echo "[6] 部署 KubeVirt 与 CDI"
+kubectl apply -f $OFFLINE/offline-cache/manifests/kubevirt-operator.yaml
+kubectl apply -f $OFFLINE/offline-cache/manifests/kubevirt-cr.yaml
+kubectl apply -f $OFFLINE/offline-cache/manifests/cdi-operator.yaml
+kubectl apply -f $OFFLINE/offline-cache/manifests/cdi-cr.yaml
+
+echo "[7] 部署 NFS-CSI"
+kubectl apply -f $OFFLINE/offline-cache/manifests/nfs-csi.yaml
+
+echo "控制平面安装完成。"
+
diff --git a/cpu-worker-install.sh.j2 b/cpu-worker-install.sh.j2
new file mode 100644
index 0000000..073c45d
--- /dev/null
+++ b/cpu-worker-install.sh.j2
@@ -0,0 +1,24 @@
+#!/bin/bash
+set -e
+
+OFFLINE=/opt/k8s-offline
+
+echo "[1] 解压离线包"
+mkdir -p $OFFLINE
+tar xf k8s-offline.tgz -C $OFFLINE
+
+echo "[2] 安装 kubeadm/kubelet/kubectl"
+install -m755 $OFFLINE/offline-cache/bin/* /usr/local/bin/
+
+echo "[3] 加载所有离线镜像"
+for img in $OFFLINE/offline-cache/images/*.tar; do
+  ctr -n=k8s.io images import "$img"
+done
+
+echo "[4] 加入集群"
+kubeadm join {{ nodes.control_plane.ip }}:6443 \
+  --token {{ join.token }} \
+  --discovery-token-ca-cert-hash sha256:{{ join.hash }}
+
+echo "CPU 工作节点已加入集群"
+
diff --git a/download_offline_packages.sh.j2 b/download_offline_packages.sh.j2
new file mode 100644
index 0000000..2bb3a04
--- /dev/null
+++ b/download_offline_packages.sh.j2
@@ -0,0 +1,60 @@
+#!/bin/bash
+# The download host needs docker (podman-docker is installed below).
+
+set -e
+
+curdir=$(pwd)
+OUT=./k8s-offline.tgz
+TMP=./offline-cache
+
+# Release coordinates used by the download URLs below.
+ARCH=amd64
+CRICTL_VERSION=v{{ crictl_version }}
+CNI_VERSION=v{{ cni_version }}
+CONTAINERD_VERSION={{ containerd_version }}
+
+apt-get install -y podman-docker
+mkdir -p $TMP/bin $TMP/manifests $TMP/images $TMP/deps
+
+echo "[1] 下载 依赖包"
+cd $TMP/deps
+# apt-get download saves the .deb files into the current directory.
+apt-get download nfs-common rpcbind
+echo "📥 下载 crictl"
+curl -fL https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-${ARCH}.tar.gz | tar xz -C .
+echo "📥 下载 CNI plugins"
+curl -fL https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-${ARCH}-${CNI_VERSION}.tgz -o cni-plugins.tgz
+
+echo "📥 下载 containerd"
+CONTAINERD_URL="https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-${ARCH}.tar.gz"
+curl -fL ${CONTAINERD_URL} -o containerd.tar.gz
+
+
+echo "[2] 下载 Kubernetes 二进制 {{ kubernetes.version }}"
+# TMP is relative, so anchor it at the start directory before entering it again.
+cd "$curdir/$TMP/bin"
+curl -fLO https://dl.k8s.io/release/v{{ kubernetes.version }}/bin/linux/${ARCH}/kubeadm
+curl -fLO https://dl.k8s.io/release/v{{ kubernetes.version }}/bin/linux/${ARCH}/kubelet
+curl -fLO https://dl.k8s.io/release/v{{ kubernetes.version }}/bin/linux/${ARCH}/kubectl
+chmod +x kubeadm kubelet kubectl
+
+
+cd "$curdir"
+echo "[3] 下载镜像(kubeadm config images)"
+$TMP/bin/kubeadm config images list --kubernetes-version {{ kubernetes.version }} > $TMP/images/images.txt
+# Read the image list written above into the array the loop iterates.
+mapfile -t images < $TMP/images/images.txt
+for img in "${images[@]}"; do
+  echo "Pull image: $img"
+  docker pull $img
+  docker save -o $TMP/images/$(echo $img | tr '/:' '_').tar $img
+done
+
+echo "[4] 复制 manifests"
+cp -r ../manifests/* $TMP/manifests/
+
+echo "[5] 打包离线资源"
+tar czf $OUT offline-cache
+
+echo "已生成离线包: $OUT"
+
diff --git a/gpu-worker-install.sh.j2 b/gpu-worker-install.sh.j2
new file mode 100644
index 0000000..eeeecd6
--- /dev/null
+++ b/gpu-worker-install.sh.j2
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+
+OFFLINE=/opt/k8s-offline
+
+mkdir -p $OFFLINE
+tar xf k8s-offline.tgz -C $OFFLINE
+
+echo "[1] 安装 nvidia driver(离线)"
+bash ./install_nvidia_driver.sh
+
+echo "[2] 安装 kubeadm/kubelet/kubectl"
+install -m755 $OFFLINE/offline-cache/bin/* /usr/local/bin/
+
+echo "[3] 导入镜像"
+for img in $OFFLINE/offline-cache/images/*.tar; do
+  ctr -n=k8s.io images import "$img"
+done
+
+echo "[4] 加入集群"
+kubeadm join {{ nodes.control_plane.ip }}:6443 \
+  --token {{ join.token }} \
+  --discovery-token-ca-cert-hash sha256:{{ join.hash }}
+
+echo "[5] 自动部署 GPU Operator"
+# NOTE(review): this script sets up no kubeconfig, so kubectl likely fails on a worker - confirm whether this belongs on the control plane.
+kubectl apply -f $OFFLINE/offline-cache/manifests/gpu-operator.yaml
+
+echo "GPU 工作节点初始化完成"
+
diff --git a/install_nvidia_driver.sh.j2 b/install_nvidia_driver.sh.j2
new file mode 100644
index 0000000..575309f
--- /dev/null
+++ b/install_nvidia_driver.sh.j2
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -e
+
+echo "安装 NVIDIA 驱动 {{ gpu.driver_version }}(离线方式)"
+
+bash NVIDIA-Linux-x86_64-{{ gpu.driver_version }}.run --silent
+
+echo "加载 nvidia 模块"
+modprobe nvidia
+modprobe nvidia_uvm
+
+echo "NVIDIA 驱动安装完成"
+
diff --git a/manifests/cdi-cr.yaml b/manifests/cdi-cr.yaml
new file mode 100644
index 0000000..fe731b2
--- /dev/null
+++ b/manifests/cdi-cr.yaml
@@ -0,0 +1,6 @@
+apiVersion: cdi.kubevirt.io/v1beta1
+kind: CDI
+metadata:
+  name: cdi
+  namespace: cdi
+
diff --git a/manifests/cdi-operator.yaml b/manifests/cdi-operator.yaml
new file mode 100644
index 0000000..dc88ba1
--- /dev/null
+++ b/manifests/cdi-operator.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: cdi
+
diff --git a/manifests/gpu-operator.yaml b/manifests/gpu-operator.yaml
new file mode 100644
index 0000000..1104ab6
--- /dev/null
+++ b/manifests/gpu-operator.yaml
@@ -0,0 +1,9 @@
+# GPU Operator example; replace with the latest release
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gpu-operator
+  namespace: gpu-operator
+spec:
+  replicas: 1
+
diff --git a/manifests/kubevirt-cr.yaml b/manifests/kubevirt-cr.yaml
new file mode 100644
index 0000000..cbd525a
--- /dev/null
+++ b/manifests/kubevirt-cr.yaml
@@ -0,0 +1,8 @@
+apiVersion: kubevirt.io/v1
+kind: KubeVirt
+metadata:
+  namespace: kubevirt
+  name: kubevirt
+spec:
+  workloadUpdateStrategy: LiveMigrate
+
diff --git a/manifests/kubevirt-operator.yaml b/manifests/kubevirt-operator.yaml
new file mode 100644
index 0000000..fb73747
--- /dev/null
+++ b/manifests/kubevirt-operator.yaml
@@ -0,0 +1,9 @@
+# Example (replace with the latest release)
+apiVersion: operator.kubevirt.io/v1
+kind: KubeVirt
+metadata:
+  name: kubevirt
+  namespace: kubevirt
+spec:
+  imagePullPolicy: IfNotPresent
+
diff --git a/manifests/nfs-csi.yaml b/manifests/nfs-csi.yaml
new file mode 100644
index 0000000..badba61
--- /dev/null
+++ b/manifests/nfs-csi.yaml
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: nfs-csi
+---
+# Put the nfs-csi manifest you need here...
+
diff --git a/render.sh b/render.sh
new file mode 100755
index 0000000..76bc0a7
--- /dev/null
+++ b/render.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+# Template and output directories
+TEMPLATE_DIR="."
+OUT_DIR="rendered"
+mkdir -p ${OUT_DIR}
+
+
+# Render each template
+jinja2 ${TEMPLATE_DIR}/download_offline_packages.sh.j2 cluster-config.yaml > ${OUT_DIR}/download_offline_packages.sh
+jinja2 ${TEMPLATE_DIR}/control-plane-install.sh.j2 cluster-config.yaml > ${OUT_DIR}/control-plane-install.sh
+jinja2 ${TEMPLATE_DIR}/cpu-worker-install.sh.j2 cluster-config.yaml > ${OUT_DIR}/cpu-worker-install.sh
+jinja2 ${TEMPLATE_DIR}/gpu-worker-install.sh.j2 cluster-config.yaml > ${OUT_DIR}/gpu-worker-install.sh
+jinja2 ${TEMPLATE_DIR}/install_nvidia_driver.sh.j2 cluster-config.yaml > ${OUT_DIR}/install_nvidia_driver.sh
+
+
+chmod +x ${OUT_DIR}/*.sh
+
+
+echo "渲染完成,生成脚本在 ${OUT_DIR} 目录。"
diff --git a/rendered/README.md b/rendered/README.md
new file mode 100644
index 0000000..e69de29