From 40087b4085def0989242e95fd43ea604b031383c Mon Sep 17 00:00:00 2001 From: yumoqing Date: Wed, 31 Dec 2025 14:08:24 +0800 Subject: [PATCH] bugfix --- deploy/README.md | 27 + deploy/dl.sh | 178 ++++ deploy/master-install.sh | 868 ++++++++++++++++++ deploy/master_remove_k8s.sh | 69 ++ deploy/tst.sh | 9 + script/ctrl_init.sh | 10 + script/download_pkgs.sh | 59 ++ script/install_offline.sh | 49 + script/k8s+kebuvirt/ctrl_install.sh | 89 ++ .../k8s+kebuvirt/deploy-kubevirt-and-gpu.sh | 33 + script/k8s+kebuvirt/dl-pkgs.sh | 111 +++ script/k8s+kebuvirt/gpuworker_install.sh | 94 ++ .../k8s+kebuvirt/nfs-client-provisioner.yaml | 47 + script/k8s+kebuvirt/worker_install.sh | 60 ++ 14 files changed, 1703 insertions(+) create mode 100644 deploy/README.md create mode 100644 deploy/dl.sh create mode 100644 deploy/master-install.sh create mode 100644 deploy/master_remove_k8s.sh create mode 100755 deploy/tst.sh create mode 100644 script/ctrl_init.sh create mode 100755 script/download_pkgs.sh create mode 100644 script/install_offline.sh create mode 100644 script/k8s+kebuvirt/ctrl_install.sh create mode 100644 script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh create mode 100644 script/k8s+kebuvirt/dl-pkgs.sh create mode 100644 script/k8s+kebuvirt/gpuworker_install.sh create mode 100644 script/k8s+kebuvirt/nfs-client-provisioner.yaml create mode 100644 script/k8s+kebuvirt/worker_install.sh diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 0000000..49510ce --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,27 @@ +# k8s + kubevirt +实现在k8s环境中分配虚拟机给客户,提高了售卖算力单元的隔离性和安全性,更好的资源管理和控制 + +## 环境说明 +* ubuntu 22.04 +* NFS共享存储提供虚拟机所需的存储 + +实现 + +* 按需分配的虚拟机算力,纯cpu算力和gpu算力 +* 算力节点全生命周期管理,创建,启动,关闭,改配,销毁 +* 提供本地镜像仓库 + +## 安装部署 +实现离线安装部署,所需安装包均在有网络的环境中下载,并传输到目标主机。 +实现控制节点和工作节点的安装部署自动化 + +安装时需要部分参数做出修改 + +### 文件说明 + +* dl.sh 环境所需软件的下载脚本,需要在有网络的环境中执行, 并且能无障碍的访问github + +* master-install.sh 控制节点一键安装脚本(需要按照实际环境修改参数) + +* worker-install.sh 工作节点一键安装脚本(需要根据实现环境修改参数) + diff --git a/deploy/dl.sh b/deploy/dl.sh new file mode 100644 index 0000000..bac5dd0 --- /dev/null +++ b/deploy/dl.sh @@ -0,0 +1,178 @@ +#!/bin/bash +set -e +# https://org.ngc.nvidia.com/setup/api-keys +# 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 +# =================配置区域================= +get_script_path(){ + # 获取脚本真实路径(解析软链接) + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" + echo "$SCRIPT_DIR" +} +MYPATH=$(get_script_path) +ARCH=amd64 +WORKDIR=${MYPATH}/k8s-offline-bundle +K8S_VERSION="1.28.2" +HELM_VERSION="v3.13.1" +CNI_VERSION="v1.3.0" +CALICO_VERSION="v3.26.1" +KUBEVIRT_VERSION="v1.1.0" # 升级到更稳定的版本 +NVIDIA_DRIVER_VERSION="535.129.03" +# ========================================= + +echo ">>> [0/6] 初始化目录..." 
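+# Added sketch: before downloading anything, make sure the basic tools this
+# script relies on are present on the (online) build host; this is only a
+# best-effort warning, nothing here is fatal.
+for tool in curl wget tar gzip; do
+    command -v "$tool" >/dev/null 2>&1 || echo "WARN: '$tool' not found, some download steps may fail"
+done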
+mkdir -p $WORKDIR/{bin,service,debs,images,drivers,charts,manifests,scripts}
+
+echo ">>> 下载 containerd.service ..."
+cd $WORKDIR/service
+sudo curl -L https://raw.githubusercontent.com/containerd/containerd/main/containerd.service -o containerd.service
+
+PKGS_TO_DOWNLOAD="nfs-common socat conntrack ipset ebtables lvm2 gnupg2 software-properties-common curl ca-certificates apt-transport-https"
+cd $WORKDIR/debs
+sudo apt-get update -q
+for pkg in $PKGS_TO_DOWNLOAD; do
+    echo "Processing package: $pkg"
+    # 使用 apt-rdepends 找出依赖并下载 (需要先安装: sudo apt install apt-rdepends)
+    # 如果没有 apt-rdepends,可以用简化的 apt-get download,但可能漏掉深层依赖
+    # 这里使用一种更通用的方法,尝试下载包本身
+    apt-get download "$pkg" 2>/dev/null || echo "Warning: Failed to download $pkg"
+done
+apt-get download build-essential linux-headers-$(uname -r) pkg-config 2>/dev/null
+# 然后使用 apt-get download 下载包及其所有依赖(需要本机已配置 NVIDIA 软件源)
+sudo apt-get download nvidia-container-toolkit libnvidia-container-tools libnvidia-container1 nvidia-container-runtime cuda-keyring
+ls -l $WORKDIR/debs
+
+# 检查 Docker 是否存在 (下载镜像必须)
+if ! command -v docker &> /dev/null; then
+    echo "正在安装 Docker (用于拉取镜像)..."
+    apt-get update && apt-get install -y docker.io
+fi
+
+# ================= 1. 二进制文件 =================
+echo ">>> [1/6] 下载二进制工具 (Helm, CNI)..."
+cd $WORKDIR/bin
+
+# 1. Kubernetes Binaries (kubelet, kubeadm, kubectl)
+curl -L --retry 3 https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/${ARCH}/kubeadm -o kubeadm
+curl -L --retry 3 https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/${ARCH}/kubelet -o kubelet
+curl -L --retry 3 https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/${ARCH}/kubectl -o kubectl
+chmod +x kubeadm kubelet kubectl
+
+# Helm
+if [ ! -f "helm" ]; then
+    echo "Downloading Helm..."
+    wget -q https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz
+    tar -zxvf helm-${HELM_VERSION}-linux-amd64.tar.gz
+    mv linux-amd64/helm .
+    rm -rf linux-amd64 helm-*.tar.gz
+fi
+
+# CNI Plugins
+if [ ! -f "cni-plugins-linux-amd64-${CNI_VERSION}.tgz" ]; then
+    echo "Downloading CNI Plugins..."
+    wget -q https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-amd64-${CNI_VERSION}.tgz
+fi
+
+echo "Binaries ready."
+
+# ================= 2. 容器镜像 =================
+echo ">>> [2/6] 拉取并打包容器镜像 (这需要较长时间)..."
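+# Added sketch: master-install.sh step 3 expects a containerd-*.tar.gz under bin/,
+# but step [1/6] above never downloads it. Fetch the official containerd release
+# here (the version below is an assumption, align it with your target hosts).
+# Note: this tarball does not include runc; prepare runc separately if the
+# target hosts do not already have it.
+CONTAINERD_VERSION="1.7.16"
+if [ ! -f "$WORKDIR/bin/containerd-${CONTAINERD_VERSION}-linux-${ARCH}.tar.gz" ]; then
+    wget -q -P "$WORKDIR/bin" https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-${ARCH}.tar.gz
+fi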
+# 确保 Docker 守护进程在运行 +service docker start || true + +# 定义镜像列表 +# 包含: K8s 核心, Calico, Multus, KubeVirt, NFS, Nvidia相关 +# 注意: Pause 镜像版本需与 kubeadm config 中一致 +NVIDIA_REPO="nvcr.io/nvidia" +IMAGES=( + "registry.k8s.io/kube-apiserver:v${K8S_VERSION}" + "registry.k8s.io/kube-controller-manager:v${K8S_VERSION}" + "registry.k8s.io/kube-scheduler:v${K8S_VERSION}" + "registry.k8s.io/kube-proxy:v${K8S_VERSION}" + "registry.k8s.io/pause:3.9" + "registry.k8s.io/etcd:3.5.12-0" + "registry.k8s.io/coredns/coredns:v1.10.1" + "docker.io/calico/cni:${CALICO_VERSION}" + "docker.io/calico/node:${CALICO_VERSION}" + "docker.io/calico/kube-controllers:${CALICO_VERSION}" + "docker.io/library/registry:2" + "ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2" + "quay.io/kubevirt/virt-operator:${KUBEVIRT_VERSION}" + "quay.io/kubevirt/virt-api:${KUBEVIRT_VERSION}" + "quay.io/kubevirt/virt-controller:${KUBEVIRT_VERSION}" + "quay.io/kubevirt/virt-handler:${KUBEVIRT_VERSION}" + "quay.io/kubevirt/virt-launcher:${KUBEVIRT_VERSION}" + "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2" + "nvcr.io/nvidia/k8s-device-plugin:v0.14.1" +) + +# ${NVIDIA_REPO}/container-toolkit:v1.13.5-ubuntu20.04 +# ${NVIDIA_REPO}/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04 +# ${NVIDIA_REPO}/gpu-feature-discovery:v0.8.1 +# ${NVIDIA_REPO}/driver:535.104.05-ubuntu22.04 + +cd $WORKDIR/images +for img in "${IMAGES[@]}"; do + # 将 / 和 : 替换为 _ 作为文件名 + FILENAME=$(echo $img | tr '/:' '__').tar + if [ -f "$FILENAME" ]; then + echo "跳过已存在: $FILENAME" + else + echo "Pulling $img ..." + docker pull $img + echo "Saving to $FILENAME ..." + docker save $img -o $FILENAME + # 节省空间,保存后删除本地 docker缓存 + docker rmi $img + fi +done + +# ================= 3. NVIDIA 驱动 ================= +echo ">>> [3/6] 下载 NVIDIA H100 驱动 (.run)..." +cd $WORKDIR/drivers +DRIVER_NAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run" +if [ ! -f "$DRIVER_NAME" ]; then + echo "Downloading NVIDIA Driver..." + wget -q https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/${DRIVER_NAME} +fi + +# ================= 4. YAML Manifests ================= +echo ">>> [4/6] 下载 K8s YAML 配置文件..." +cd $WORKDIR/manifests + +# Calico +curl -L -o calico.yaml https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/calico.yaml + +# KubeVirt +KUBEVIRT_REL="https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}" +curl -L -o kubevirt-operator.yaml ${KUBEVIRT_REL}/kubevirt-operator.yaml +curl -L -o kubevirt-cr.yaml ${KUBEVIRT_REL}/kubevirt-cr.yaml + +# Multus +curl -L -o multus-daemonset.yaml https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset.yml + +# ================= 5. Helm Charts ================= +echo ">>> [5/6] 下载 Helm Charts..." +cd $WORKDIR/charts + +# 添加 repo (如果 helm 命令可用) +if command -v helm &> /dev/null; then + helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/ + helm repo update + helm pull nfs-subdir-external-provisioner/nfs-subdir-external-provisioner --version 4.0.18 +else + echo "Helm not installed on host, downloading chart directly via wget..." + wget -q https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz +fi + +# ================= 6. 验证 ================= +echo "---------------------------------------------" +echo ">>> 下载工作全部完成!正在统计文件大小..." 
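+# Added sketch: write a checksum manifest for the whole bundle so the transfer
+# to the offline hosts can be verified with `sha256sum -c SHA256SUMS`.
+( cd "$WORKDIR" && find . -type f ! -name SHA256SUMS -exec sha256sum {} + > SHA256SUMS )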
+cd $WORKDIR +du -sh * +echo "---------------------------------------------" +echo "请检查 debs 目录是否依然有文件 (这是之前下载的)。" +echo "images 目录应该有几 GB 大小。" +echo "drivers 目录应该有 400MB+。" +cd ${MYPATH} +tar cvf - k8s-offline-bundle master-install.sh worker-install.sh | gzip > k8s-offline-bundle.tgz + diff --git a/deploy/master-install.sh b/deploy/master-install.sh new file mode 100644 index 0000000..37d77d4 --- /dev/null +++ b/deploy/master-install.sh @@ -0,0 +1,868 @@ +#!/bin/bash + +set -eo pipefail # 脚本遇到任何错误立即退出,未捕捉的管道错误也退出 + +get_script_path(){ + # 获取脚本真实路径(解析软链接) + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" + echo "$SCRIPT_DIR" +} +# ============================================================================== +# 配置区域 +# ============================================================================== +MYPATH=$(get_script_path) +OFFLINE_ASSETS_DIR="${MYPATH}/k8s-offline-bundle" + +K8S_VERSION="v1.28.2" +CALICO_VERSION="v3.26.1" +KUBEVIRT_VERSION="v1.1.0" +MULTUS_VERSION="v4.0.2" # Multus CNI 镜像版本 +NFS_PROVISIONER_VERSION="v4.0.2" # NFS Provisioner 镜像标签 +NFS_CHART_VERSION="4.0.18" # Helm Chart 版本 + +K8S_MASTER_IP="192.168.16.5" # 控制节点的IP,用于API Server绑定和广告 +LOCAL_REGISTRY_PORT="5000" +LOCAL_REGISTRY_ADDR="${K8S_MASTER_IP}:${LOCAL_REGISTRY_PORT}" # 本地镜像仓库地址 + +K8S_APISERVER_ADVERTISE_ADDRESS="${K8S_MASTER_IP}" # kubeadm init 使用的API Server广告地址 +POD_CIDR="10.244.0.0/16" +SERVICE_CIDR="10.96.0.0/12" + +NFS_SERVER="192.168.16.2" +NFS_PATH="/d/share/101206" +NFS_STORAGE_CLASS_NAME="nfs-client" + +TEMP_DIR="/tmp/k8s-master-setup" # 临时工作目录 +NAMESPACE="default" # 默认命名空间,用于 ctr 命令 +CONTAINERD_CONFIG="/etc/containerd/config.toml" +CERTS_D_PATH="/etc/containerd/certs.d" +# /etc/containerd/config.toml文件做以下修改 +# SystemdCgroup = false 在 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] 下。这个也需要改为 true。 +# ============================================================================== +# 启动前日志输出 +# ============================================================================== +echo "==================================================" +echo " Kubernetes 控制节点离线安装脚本 " +echo "==================================================" +echo "配置参数:" +echo " K8s 版本: ${K8S_VERSION}" +echo " 本地镜像仓库: ${LOCAL_REGISTRY_ADDR}" +echo " K8s API Server IP: ${K8S_APISERVER_ADVERTISE_ADDRESS}" +echo " Pod CIDR: ${POD_CIDR}" +echo " Service CIDR: ${SERVICE_CIDR}" +echo " NFS Server: ${NFS_SERVER}:${NFS_PATH}" +echo "--------------------------------------------------" + +# ============================================================================== +# 通用函数 +# ============================================================================== + +log_info() { + echo -e "\e[32m[INFO] $(date +'%Y-%m-%d %H:%M:%S') $1\e[0m" +} + +log_warn() { + echo -e "\e[33m[WARN] $(date +'%Y-%m-%d %H:%M:%S') $1\e[0m" >&2 +} + +log_error() { + echo -e "\e[31m[ERROR] $(date +'%Y-%m-%d %H:%M:%S') $1\e[0m" >&2 + exit 1 +} + +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本必须以 root 用户或使用 sudo 运行。" + fi +} + +configure_sysctl() { + log_info "配置系统内核参数..." + cat < /dev/null +overlay +br_netfilter +EOF + sudo modprobe overlay + sudo modprobe br_netfilter + + cat < /dev/null +net.bridge.bridge-nf-call-iptables = 1 +net.bridge.bridge-nf-call-ip6tables = 1 +net.ipv4.ip_forward = 1 +EOF + sudo sysctl --system > /dev/null + log_info "系统内核参数配置完成。" +} + +disable_swap() { + log_info "禁用 Swap 分区..." 
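+    # Note: kubelet runs with failSwapOn=true by default, so any active swap makes
+    # it refuse to start; besides commenting /etc/fstab below, systemd swap units
+    # can optionally be masked as well (e.g. `sudo systemctl mask swap.target`).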
+ if grep -q "swap" /etc/fstab; then + sudo swapoff -a + sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab + log_info "Swap 分区已禁用并从 fstab 中注释。" + else + log_info "未检测到 Swap 分区或已禁用。" + fi +} + +# ============================================================================== +# 0. 前置检查与环境初始化 +# ============================================================================== +check_root +configure_sysctl +disable_swap + +log_info "创建临时工作目录: ${TEMP_DIR}" +sudo mkdir -p "${TEMP_DIR}" +sudo rm -rf "${TEMP_DIR}/*" # 清理旧的临时文件 + +log_info "将离线资源目录添加到 PATH。" +export PATH="${OFFLINE_ASSETS_DIR}/bin:$PATH" +echo "export PATH=${OFFLINE_ASSETS_DIR}/bin:\$PATH" | sudo tee /etc/profile.d/offline-k8s.sh > /dev/null +# ============================================================================== +# 1. 安装操作系统依赖 (DEB 包) +# ============================================================================== +log_info "开始安装操作系统依赖 (DEB 包)..." +DEBS_DIR="${OFFLINE_ASSETS_DIR}/debs" +if [ ! -d "$DEBS_DIR" ]; then + log_error "DEB 包目录 ${DEBS_DIR} 不存在。请确保将所有 .deb 文件放在此目录中。" +fi + +cd "${DEBS_DIR}" || log_error "无法进入 DEB 包目录 ${DEBS_DIR}。" + +log_info "尝试安装所有 DEB 包。这可能需要一些时间,并会尝试多次以解决依赖顺序问题。" +# 尝试多次安装,以解决部分依赖顺序问题 +# for i in {1..3}; do +# log_info "第 ${i} 次尝试安装 DEB 包..." +# sudo dpkg -i *.deb &>/dev/null || true +# done + +# 最终检查是否有未满足的依赖,尝试修复 +log_info "检查并尝试解决任何未满足的 DEB 包依赖..." +if ! sudo apt-get install -f --assume-yes &>/dev/null; then + log_warn "部分 DEB 包依赖可能未完全满足。请手动检查并解决 (例如运行 'sudo apt-get install -f')。" +else + log_info "所有 DEB 包及其依赖已成功安装或已解决。" +fi + +cd - > /dev/null # 返回之前的工作目录 +log_info "操作系统依赖 (DEB 包) 安装完成。" + +# ============================================================================== +# 2. 安装 Docker (仅用于本地镜像仓库) +# ============================================================================== +log_info "安装 Docker daemon (仅用于本地镜像仓库) ..." +if ! command_exists docker; then + log_error "未检测到 Docker CLI。请确保已安装 Docker (或其他兼容的容器引擎如Podman)。" +fi + +log_info "配置 Docker daemon 信任本地仓库 ${LOCAL_REGISTRY_ADDR} (针对非 HTTPS)..." +sudo mkdir -p /etc/docker +cat < /dev/null +{ + "insecure-registries": ["${LOCAL_REGISTRY_ADDR}"], + "exec-opts": ["native.cgroupdriver=systemd"], + "log-driver": "json-file", + "log-opts": { + "max-size": "100m" + } +} +EOF +sudo groupadd docker &>/dev/null || true # 如果组已存在,忽略错误 +sudo systemctl daemon-reload +sudo systemctl enable docker.socket +sudo systemctl enable docker +sudo systemctl restart docker.socket +sudo systemctl restart docker +sudo systemctl status docker --no-pager || log_error "Docker daemon 启动失败。" +log_info "Docker daemon 已配置信任本地仓库并重启。" + +# ============================================================================== +# 3. 安装 Containerd 运行时 +# ============================================================================== +log_info "安装 Containerd 运行时..." +CONTAINERD_TAR_GZ=$(find "${OFFLINE_ASSETS_DIR}/bin" -name "containerd-*.tar.gz" | head -n 1) +if [ -z "$CONTAINERD_TAR_GZ" ]; then + log_error "未找到 Containerd 压缩包。" +fi + +sudo tar Cxzvf /usr/local "$CONTAINERD_TAR_GZ" || log_error "解压 Containerd 失败。" + +# 确保 containerd systemd 服务文件存在 +CONTAINERD_SERVICE_FILE="${OFFLINE_ASSETS_DIR}/service/containerd.service" +if [ ! -f "$CONTAINERD_SERVICE_FILE" ]; then + log_error "未找到 containerd.service 文件: ${CONTAINERD_SERVICE_FILE}" +fi +sudo cp "$CONTAINERD_SERVICE_FILE" /etc/systemd/system/containerd.service +sudo systemctl daemon-reload # 重新加载服务配置 + +log_info "生成并配置 Containerd 默认配置文件..." 
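+# Note: the registry handling below uses containerd's config_path mechanism,
+# i.e. one certs.d/<registry>/hosts.toml per upstream registry, so that the
+# local registry is reached over plain HTTP and the public registries are
+# proxied through it for offline pulls.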
+sudo mkdir -p /etc/containerd +sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null + +# --- 配置 containerd registry mirrors using config_path --- +log_info "配置 containerd 镜像仓库代理..." + +# 创建必要的目录 +for reg in "${LOCAL_REGISTRY_ADDR}" registry.k8s.io ghcr.io quay.io docker.io nvcr.io; do + sudo mkdir -p "${CERTS_D_PATH}/${reg}" +done + +# 为本地 Registry 配置 hosts.toml (http, skip_verify) +sudo tee "${CERTS_D_PATH}/${LOCAL_REGISTRY_ADDR}/hosts.toml" > /dev/null < /dev/null < /dev/null + else + sudo sed -i "/\[plugins.\"io.containerd.grpc.v1.cri\".registry\]/a \\\n config_path = \"${CERTS_D_PATH}\"" "$CONTAINERD_CONFIG" + fi +fi + +# 移除旧的 mirrors 和 configs (弃用警告相关的部分) +# 使用多行 sed 表达式删除整个块 +sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."registry\.k8s\.io"\]/,/^endpoint = \[/d' "$CONTAINERD_CONFIG" || true +sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.configs\."192\.168\.16\.5:5000"\.tls\]/,/^insecure_skip_verify = /d' "$CONTAINERD_CONFIG" || true +# 确保删除所有相关的空行或残留的块头 +sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\]/d' "$CONTAINERD_CONFIG" || true +sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.configs\]/d' "$CONTAINERD_CONFIG" || true + +log_info "重启 containerd 服务..." +sudo systemctl daemon-reload +sudo systemctl restart containerd || log_error "Containerd 服务启动失败。" +sudo systemctl status containerd --no-pager || log_error "Containerd 服务状态异常。" +log_info "Containerd 配置完成并已启动。" + +# 配置 crictl +log_info "配置 crictl..." +cat < /dev/null +runtime-endpoint: unix:///run/containerd/containerd.sock +image-endpoint: unix:///run/containerd/containerd.sock +EOF +log_info "crictl 配置完成。" + +# ============================================================================== +# 4. 安装 CNI 插件 +# ============================================================================== +log_info "安装 CNI 插件..." +CNI_PLUGINS_TAR_GZ=$(find "${OFFLINE_ASSETS_DIR}/bin" -name "cni-plugins-*.tgz" | head -n 1) +if [ -z "$CNI_PLUGINS_TAR_GZ" ]; then + log_error "未找到 CNI 插件压缩包。" +fi + +sudo mkdir -p /opt/cni/bin +sudo tar Cxzvf /opt/cni/bin "$CNI_PLUGINS_TAR_GZ" || log_error "解压 CNI 插件失败。" +log_info "CNI 插件安装完成。" + +# ============================================================================== +# 5. 安装 Kubernetes Binaries (kubelet, kubeadm, kubectl) +# ============================================================================== +log_info "安装 Kubernetes Binaries..." +BIN_DIR="${OFFLINE_ASSETS_DIR}/bin" +for bin in kubelet kubeadm kubectl helm; do + if [ ! -f "${BIN_DIR}/${bin}" ]; then + log_error "Kubernetes 二进制文件 ${bin} 未找到在 ${BIN_DIR}。" + fi + sudo cp "${BIN_DIR}/${bin}" /usr/local/bin/ + sudo chmod +x "/usr/local/bin/${bin}" +done + +# 配置 kubelet systemd 服务 (从模板生成) +log_info "配置 kubelet systemd 服务..." 
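+# Added fix-up (sketch): the config.toml generated above keeps SystemdCgroup = false,
+# while kubelet/kubeadm in this script use the systemd cgroup driver. Align the two
+# before wiring up kubelet, assuming the default runc runtime section.
+if grep -q "SystemdCgroup = false" "$CONTAINERD_CONFIG"; then
+    sudo sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' "$CONTAINERD_CONFIG"
+    sudo systemctl restart containerd
+fi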
+cat <<'EOF' | sudo tee /etc/systemd/system/kubelet.service +[Unit] +Description=kubelet: The Kubernetes Node Agent +Documentation=https://kubernetes.io/docs/ +After=containerd.service +Wants=containerd.service + +[Service] +ExecStart=/usr/local/bin/kubelet +Restart=always +StartLimitInterval=0 +RestartSec=10 + +[Install] +WantedBy=multi-user.target +EOF + +sudo mkdir -p /etc/systemd/system/kubelet.service.d +cat <<'EOF' | sudo tee /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +[Service] +Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf" +Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml" +EnvironmentFile=-/etc/default/kubelet +ExecStart= +ExecStart=/usr/local/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_EXTRA_ARGS +EOF + + +sudo systemctl daemon-reload +sudo systemctl enable kubelet || log_error "启用 kubelet 服务失败。" +log_info "Kubernetes Binaries 安装完成,kubelet 服务已启用但未启动。" + +# ============================================================================== +# 6. 启动本地镜像仓库 (仅在控制节点,192.168.16.5) +# ============================================================================== +log_info "启动本地镜像仓库 ${LOCAL_REGISTRY_ADDR} ..." + +# 加载 registry 镜像 +cd "${OFFLINE_ASSETS_DIR}/images" +REGISTRY_TAR=$(find . -name "registry_2.tar" | head -n 1) +if [ -z "$REGISTRY_TAR" ]; then + log_error "未找到本地镜像仓库 registry:2 的 tar 包。" +fi +sudo docker load -i "$REGISTRY_TAR" || log_error "加载 registry:2 镜像失败。" + +# 停止并删除旧的 registry 容器,确保干净启动 +sudo docker stop registry &>/dev/null || true +sudo docker rm -v registry &>/dev/null || true + +# 启动 registry 容器 +sudo docker run -d -p "${LOCAL_REGISTRY_PORT}:5000" --restart=always --name registry registry:2 || log_error "启动本地镜像仓库容器失败。" +log_info "本地镜像仓库已在 ${LOCAL_REGISTRY_ADDR} 启动。" +cd - > /dev/null + +# ============================================================================== +# 7. 导入并标记所有镜像到 containerd +# ============================================================================== +log_info "导入所有离线镜像到 containerd 仓库并标记..." + +IMAGE_DIR="${OFFLINE_ASSETS_DIR}/images" +if [ ! -d "$IMAGE_DIR" ]; then + log_error "镜像文件目录 ${IMAGE_DIR} 不存在。" +fi + +# 清理 containerd 本地存储中的所有镜像 (除registry:2外,避免误删) +log_info "清理 containerd 中已存在的镜像..." +# 使用 ctr images ls --quiet 获取所有镜像的 digest +# 然后过滤掉那些可能是本地 registry 相关的镜像,避免干扰 +ctr_images_to_delete=$(ctr -n "$NAMESPACE" images ls --quiet | while read -r digest; do + # 检查该 digest 对应的 REF 是否包含 LOCAL_REGISTRY_ADDR 或 registry:2 + # 这里有点复杂,因为一个 digest 可能有多个 REF + refs=$(ctr -n "$NAMESPACE" images ls --no-header | grep "$digest" | awk '{print $1}') + skip_delete=false + for ref in $refs; do + if [[ "$ref" == *"/registry:2"* ]]; then + log_info " 跳过删除 registry 镜像: $ref ($digest)" + skip_delete=true + break + fi + done + if [ "$skip_delete" = false ]; then + echo "$digest" # 输出需要删除的 digest + fi +done) + +if [ -n "$ctr_images_to_delete" ]; then + echo "$ctr_images_to_delete" | while read -r digest_to_delete; do + log_info " 删除 containerd 镜像 (digest): $digest_to_delete" + ctr -n "$NAMESPACE" images rm "$digest_to_delete" &>/dev/null || log_warn "删除镜像 $digest_to_delete 失败 (可能被使用或不存在)。" + done +fi +log_info "Containerd 镜像清理完成。" + +for tarfile in "$IMAGE_DIR"/*.tar; do + [ -e "$tarfile" ] || continue + + echo "" + echo ">>> Processing $tarfile" + + # 1️⃣ 获取导入前的镜像列表 + IMAGES_BEFORE=$(mktemp) + # ctr images ls 的第一列就是 REF (镜像名称),使用 awk 提取 + if ! 
ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_BEFORE"; then + log_info "❌ Failed to get images list before import." + continue + fi + + # Debug: + log_info "Images BEFORE import for $tarfile:" + cat "$IMAGES_BEFORE" + + # 2️⃣ 导入镜像 + if ! ctr -n "$NAMESPACE" images import "$tarfile"; then + log_info "❌ Failed to import image from $tarfile." + rm -f "$IMAGES_BEFORE" # 清理临时文件 + continue + fi + + # 3️⃣ 获取导入后的镜像列表 + IMAGES_AFTER=$(mktemp) + if ! ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_AFTER"; then + echo "❌ Failed to get images list after import." + rm -f "$IMAGES_BEFORE" # 清理临时文件 + continue + fi + + # Debug: + log_info "Images AFTER import for $tarfile:" + # cat "$IMAGES_AFTER" + # echo "Raw difference (comm -13):" + # comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER" + + # 4️⃣ 找出新增的镜像 (即原始镜像)。排除掉带有本地Registry前缀的镜像本身。 + # 过滤条件:排除本地 registry 已存在的镜像,以及 引用。 + # 因为导入的 tarfile 可能会包含多个 tag,我们只取第一个符合条件的 + ORIGIN_IMG=$(comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER" | grep -vE "${LOCAL_REGISTRY_ADDR}|" | head -n1|| true) + + if [ "$ORIGIN_IMG" = "" ]; then + continue + fi + log_info "JUST A TEST" + rm -f "$IMAGES_BEFORE" "$IMAGES_AFTER" # 清理临时文件 + + if [[ -z "$ORIGIN_IMG" ]]; then + echo "❌ Failed to detect original image name, skipping..." + continue + fi + echo "Original image: $ORIGIN_IMG" + + NEW_IMG="" + if [[ "$ORIGIN_IMG" == "registry.k8s.io/"* ]]; then + if [[ "$ORIGIN_IMG" == "registry.k8s.io/coredns/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/coredns/}" + else + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/}" + fi + elif [[ "$ORIGIN_IMG" == "ghcr.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#ghcr.io/}" + elif [[ "$ORIGIN_IMG" == "quay.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#quay.io/}" + elif [[ "$ORIGIN_IMG" == "nvcr.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#nvcr.io/}" + elif [[ "$ORIGIN_IMG" == "docker.io/"* ]]; then + if [[ "$ORIGIN_IMG" == "docker.io/library/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/library/}" + else + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/}" + fi + else + echo "Warning: Unknown original registry prefix for $ORIGIN_IMG. Directly prepending LOCAL_REGISTRY_ADDR." + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG}" + fi + + echo "Retag as: $NEW_IMG" + + # 4️⃣ 打 tag + ctr -n "$NAMESPACE" images tag "$ORIGIN_IMG" "$NEW_IMG" + + # 5️⃣ 推送到本地 registry + ctr -n "$NAMESPACE" images push --plain-http "$NEW_IMG" + echo "tarfile=$tarfile ORIGIN_IMG=$ORIGIN_IMG NEW_IMG=$NEW_IMG" + + echo "✅ Done: $NEW_IMG" +done + +log_info "所有镜像已导入 containerd 仓库并正确标记。" +log_info "当前 containerd 镜像列表 (前 20 条):" +ctr -n "$NAMESPACE" images ls | head -n 20 || true # 打印最终镜像列表以供检查 + +# ============================================================================== +# 8. 初始化 Kubernetes 控制平面 +# ============================================================================== +log_info "初始化 Kubernetes 控制平面..." + +# 确保 /etc/kubernetes 目录干净,防止 kubeadm init 失败 +log_info "清理 /etc/kubernetes 目录..." +sudo kubeadm reset --force &>/dev/null || true # 强制重置 kubeadm 配置 +sudo rm -rf /etc/kubernetes/* || log_warn "清理 /etc/kubernetes 目录失败,可能存在权限问题或文件被占用。" +sudo rm -rf "$HOME/.kube" # 清理用户 kubeconfig +log_info "已清理 /etc/kubernetes 目录和用户 .kube 配置。" + +# 生成 kubeadm 配置 +log_info "生成 kubeadm-config.yaml 配置..." 
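+# Note: imageRepository below points kubeadm at the local registry, so the
+# control-plane images (kube-apiserver, etcd, coredns, pause) must already have
+# been pushed there by step 7; coredns in particular must end up at
+# ${LOCAL_REGISTRY_ADDR}/coredns:<tag>, which is why the retag loop above strips
+# the "coredns/" sub-path.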
+cat <<EOF | sudo tee ${TEMP_DIR}/kubeadm-config.yaml > /dev/null
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: InitConfiguration
+localAPIEndpoint:
+  advertiseAddress: "${K8S_APISERVER_ADVERTISE_ADDRESS}"   # 替换为实际 IP,比如 192.168.16.10
+  bindPort: 6443
+---
+apiVersion: kubeadm.k8s.io/v1beta3
+kind: ClusterConfiguration
+kubernetesVersion: ${K8S_VERSION}
+imageRepository: ${LOCAL_REGISTRY_ADDR}   # 关键!指定本地镜像仓库
+networking:
+  podSubnet: ${POD_CIDR}
+  serviceSubnet: ${SERVICE_CIDR}
+---
+apiVersion: kubelet.config.k8s.io/v1beta1
+kind: KubeletConfiguration
+cgroupDriver: systemd # 根据你的环境选择 systemd 或 cgroupfs
+EOF
+
+log_info "kubeadm-config.yaml 已生成,内容如下:"
+cat ${TEMP_DIR}/kubeadm-config.yaml
+
+# 运行 kubeadm init
+log_info "运行 kubeadm init 命令..."
+# --upload-certs: 上传证书到集群以便工作节点获取
+# --config: 指定配置
+# --ignore-preflight-errors=all: 忽略所有预检错误,但在生产环境建议逐一排查。
+sudo kubeadm init --config=${TEMP_DIR}/kubeadm-config.yaml --upload-certs --ignore-preflight-errors=all || log_error "kubeadm init 失败。"
+
+log_info "Kubernetes 控制平面初始化完成。"
+
+# 配置 kubectl
+log_info "配置 kubectl 访问集群..."
+mkdir -p "$HOME/.kube"
+sudo cp /etc/kubernetes/admin.conf "$HOME/.kube/config"
+sudo chown $(id -u):$(id -g) "$HOME/.kube/config"
+export KUBECONFIG=$HOME/.kube/config # 确保当前会话可用
+log_info "kubectl 配置完成。"
+
+log_info "等待 Kubernetes 控制平面 Pod 启动 (最多 5 分钟)..."
+# 等待 kube-apiserver, kube-controller-manager, kube-scheduler Pod 启动
+sleep 1
+kubectl wait --for=condition=ready pod -l component=kube-apiserver -n kube-system --timeout=300s || log_error "kube-apiserver Pod 未能在指定时间内启动。"
+kubectl wait --for=condition=ready pod -l component=kube-controller-manager -n kube-system --timeout=300s || log_error "kube-controller-manager Pod 未能在指定时间内启动。"
+kubectl wait --for=condition=ready pod -l component=kube-scheduler -n kube-system --timeout=300s || log_error "kube-scheduler Pod 未能在指定时间内启动。"
+
+log_info "核心控制平面组件已就绪。"
+log_info "查看集群节点状态:"
+kubectl get nodes
+
+# ==============================================================================
+# 9. 安装 CNI 网络插件 (Calico)
+# ==============================================================================
+log_info "安装 CNI 网络插件 (Calico)..."
+
+CALICO_MANIFEST_ORIG="${OFFLINE_ASSETS_DIR}/manifests/calico.yaml"
+if [ ! -f "$CALICO_MANIFEST_ORIG" ]; then
+    log_error "Calico 原始 manifest 文件 ${CALICO_MANIFEST_ORIG} 不存在。"
+fi
+CALICO_MANIFEST_TEMP="${TEMP_DIR}/calico.yaml"
+cp "${CALICO_MANIFEST_ORIG}" "${CALICO_MANIFEST_TEMP}" || log_error "复制 Calico manifest 文件失败。"
+
+# 替换 Calico 镜像地址
+log_info "替换 Calico 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..."
+# 注意:Calico 的镜像通常在 docker.io 下,所以替换规则不同于 k8s.io
+sudo sed -i "s|docker.io/calico/cni:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/cni:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"
+sudo sed -i "s|docker.io/calico/node:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/node:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"
+sudo sed -i "s|docker.io/calico/kube-controllers:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/kube-controllers:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"
+
+# 设置 Pod CIDR
+log_info "配置 Calico Pod CIDR: ${POD_CIDR} ..."
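+# Note: CALICO_IPV4POOL_CIDR must match the podSubnet used at kubeadm init
+# (${POD_CIDR} here); otherwise Calico hands out addresses outside the cluster
+# Pod network.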
+# 确保 # - name: CALICO_IPV4POOL_CIDR 及其下面的 value 行被取消注释并设置 +sudo sed -i "s|# - name: CALICO_IPV4POOL_CIDR|- name: CALICO_IPV4POOL_CIDR|g" "${CALICO_MANIFEST_TEMP}" +sudo sed -i "s|# value: \"192.168.0.0/16\"| value: \"${POD_CIDR}\"|g" "${CALICO_MANIFEST_TEMP}" + +# 在 calico.yaml 文件末尾添加 IPPool 资源 (如果文件中没有,或者确保它存在且配置正确) +if ! grep -q "kind: IPPool" "${CALICO_MANIFEST_TEMP}"; then + log_info "在 Calico manifest 中添加 IPPool 资源定义..." + echo -e "\n---\napiVersion: crd.projectcalico.org/v1\nkind: IPPool\nmetadata:\n name: default-pool-ipv4\nspec:\n cidr: ${POD_CIDR}\n natOutgoing: true\n disabled: false\n ipipMode: Always" | sudo tee -a "${CALICO_MANIFEST_TEMP}" > /dev/null +else + log_info "Calico IPPool 定义已存在,跳过添加。" +fi + +log_info "应用 Calico manifest 文件..., 内容如下:" +cat ${CALICO_MANIFEST_TEMP} +kubectl apply -f "${CALICO_MANIFEST_TEMP}" || log_error "应用 Calico manifest 失败。" +log_info "Calico 网络插件安装完成。" + +log_info "等待 Calico Pod 启动 (最多 20 分钟)..." +sleep 10 +kubectl wait --for=condition=ready pod -l k8s-app=calico-node -n kube-system --timeout=1900s || log_error "Calico Node Pod 未能在指定时间内启动。" +log_info "Calico Pods 已就绪。" + +#============ + +# ============================================================================== +# 10. 安装 Multus CNI (用于 KubeVirt 虚拟机多网卡) +# ============================================================================== +log_info "安装 Multus CNI 插件..." +MULTUS_MANIFEST_ORIG="${OFFLINE_ASSETS_DIR}/manifests/multus-daemonset.yaml" +if [ ! -f "$MULTUS_MANIFEST_ORIG" ]; then + log_error "Multus 原始 manifest 文件 ${MULTUS_MANIFEST_ORIG} 不存在。" +fi +MULTUS_MANIFEST_TEMP="${TEMP_DIR}/multus-daemonset.yaml" +cp "${MULTUS_MANIFEST_ORIG}" "${MULTUS_MANIFEST_TEMP}" || log_error "复制 Multus manifest 文件失败。" + +log_info "替换 Multus CNI 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..." +# Multus CNI 的镜像通常在 ghcr.io/k8snetworkplumbingwg/ 或 docker.io 下 +sudo sed -i "s|ghcr.io/k8snetworkplumbingwg/multus-cni:snapshot|${LOCAL_REGISTRY_ADDR}/k8snetworkplumbingwg/multus-cni:${MULTUS_VERSION}|g" "${MULTUS_MANIFEST_TEMP}" +sudo sed -i "s|docker.io/k8snetworkplumbingwg/multus-cni:snapshot|${LOCAL_REGISTRY_ADDR}/k8snetworkplumbingwg/multus-cni:${MULTUS_VERSION}|g" "${MULTUS_MANIFEST_TEMP}" + +log_info "应用 Multus CNI manifest 文件..." +kubectl apply -f "${MULTUS_MANIFEST_TEMP}" || log_error "应用 Multus CNI manifest 失败。" +log_info "Multus CNI 插件安装完成。" + +log_info "等待 Multus Pod 启动 (最多 5 分钟)..." +sleep 1 +kubectl wait --for=condition=ready pod -l app=multus -n kube-system --timeout=300s || log_error "Multus Pod 未能在指定时间内启动。" +log_info "Multus Pods 已就绪。" + +# ============================================================================== +# 11. 安装 KubeVirt (用于虚拟机管理) +# ============================================================================== +log_info "安装 KubeVirt..." + +KUBEVIRT_OPERATOR_ORIG="${OFFLINE_ASSETS_DIR}/manifests/kubevirt-operator.yaml" + +if [ ! -f "$KUBEVIRT_OPERATOR_ORIG" ]; then + log_error "KubeVirt Operator 文件 ${KUBEVIRT_OPERATOR_ORIG} 不存在。" +fi + +KUBEVIRT_OPERATOR_TEMP="${TEMP_DIR}/kubevirt-operator.yaml" +cp "${KUBEVIRT_OPERATOR_ORIG}" "${KUBEVIRT_OPERATOR_TEMP}" || log_error "复制 KubeVirt Operator 文件失败。" + +log_info "替换 KubeVirt Operator 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..." 
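+# Note: only the virt-operator image reference is rewritten here; virt-api,
+# virt-controller, virt-handler and virt-launcher keep their quay.io names and
+# rely on the certs.d mirror for quay.io configured earlier to be pulled from
+# the local registry while offline.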
+# KubeVirt 镜像通常在 quay.io/kubevirt +# 这里需要替换 operator 和所有由 operator 部署的组件的镜像 +sudo sed -i "s|quay.io/kubevirt/virt-operator:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-operator:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/virt-controller:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-controller:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/virt-handler:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-handler:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/virt-launcher:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-launcher:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/virt-api:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-api:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/libguestfs-tools:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/libguestfs-tools:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/bridge-marker:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/bridge-marker:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/sidecar-shim:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/sidecar-shim:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +# sudo sed -i "s|quay.io/kubevirt/qemu-bridge-helper:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/qemu-bridge-helper:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}" +awk ' +/^kind: Deployment/ {inDeployment=1} +inDeployment && /^ template:/ {inTemplate=1} +inTemplate && /^ spec:/ {inSpec=1} +inSpec && /^ tolerations:/ { + print + # 插入控制平面 toleration + indent = match($0,/[^ ]/) - 1 + spaces = " " + printf("%s- key: \"node-role.kubernetes.io/control-plane\"\n", substr(spaces, 1, indent)) + printf("%s operator: \"Exists\"\n", substr(spaces, 1, indent)) + printf("%s effect: \"NoSchedule\"\n", substr(spaces, 1, indent)) + # 标记已经插入,防止重复插入 + inserted=1 + next +} +# 如果已经插入,就不再修改其他 tolerations +{print} +' "${KUBEVIRT_OPERATOR_TEMP}" > ${TEMP_DIR}/kubevirt-operator-mod.yaml + +cp ${TEMP_DIR}/kubevirt-operator-mod.yaml ${KUBEVIRT_OPERATOR_TEMP} +log_info "应用 KubeVirt Operator manifest 文件..." +kubectl apply -f "${KUBEVIRT_OPERATOR_TEMP}" || log_error "应用 KubeVirt Operator 失败。" +log_info "KubeVirt Operator 应用完成。" + +log_info "等待 KubeVirt Operator 启动 (最多 15 分钟)..." +sleep 1 +kubectl wait --for=condition=ready pod -l kubevirt.io=virt-operator -n kubevirt --timeout=900s || log_error "KubeVirt Operator Pod 未能在指定时间内启动。" +log_info "KubeVirt Operator Pods 已就绪。" + +# ============================================================================== +# 12. 安装 NFS Client Provisioner (用于动态 PV/PVC) +# ============================================================================== +log_info "安装 NFS Client Provisioner..." + +# 12.1 添加 Helm 仓库 (通常在线操作,离线场景下需要手动解压 chart) +log_info "加载 NFS Client Provisioner Helm Chart..." +NFS_CHART_TGZ="${OFFLINE_ASSETS_DIR}/charts/nfs-subdir-external-provisioner-${NFS_CHART_VERSION}.tgz" +if [ ! -f "$NFS_CHART_TGZ" ]; then + log_error "NFS Client Provisioner Helm Chart 文件 ${NFS_CHART_TGZ} 不存在。" +fi + +# 解压 chart 到临时目录 +log_info "解压 Helm Chart 到临时目录..." 
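+# Added check (sketch): make sure the chart archive is readable by helm before
+# unpacking it, to fail early on a corrupted transfer.
+helm show chart "$NFS_CHART_TGZ" > /dev/null || log_error "NFS chart archive is not readable: ${NFS_CHART_TGZ}"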
+sudo mkdir -p "${TEMP_DIR}/nfs-client-provisioner" +sudo tar -xzf "$NFS_CHART_TGZ" -C "${TEMP_DIR}/nfs-client-provisioner" || log_error "解压 NFS Chart 失败。" +NFS_CHART_PATH="${TEMP_DIR}/nfs-client-provisioner/nfs-subdir-external-provisioner" # 解压后的实际目录 + +# 12.2 创建 NFS provisioner 的 values.yaml +log_info "创建 NFS Client Provisioner 的 values.yaml..." +cat < /dev/null +replicaCount: 1 + +strategy: + type: Recreate + +image: + repository: ${LOCAL_REGISTRY_ADDR}/sig-storage/nfs-subdir-external-provisioner + tag: ${NFS_PROVISIONER_VERSION} + pullPolicy: IfNotPresent + +nfs: + server: ${NFS_SERVER} + path: ${NFS_PATH} + +storageClass: + create: true + name: ${NFS_STORAGE_CLASS_NAME} + defaultClass: true + provisionerName: ${NFS_STORAGE_CLASS_NAME} + reclaimPolicy: Delete + archiveOnDelete: true + +# 允许 Pod 调度到 control-plane 节点 +tolerations: + - key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" + +# 如果你想强制跑在控制节点(通常单节点集群推荐) +# 控制节点通常带有 label:node-role.kubernetes.io/control-plane="" +nodeSelector: + node-role.kubernetes.io/control-plane: "" + +# 也可以留空不写,K8s 会随机选择节点 +# nodeSelector: {} + +EOF + +log_info "NFS Client Provisioner values.yaml 已生成,内容如下:" +cat "${TEMP_DIR}/nfs-provisioner-values.yaml" + +# 12.3 部署 NFS Client Provisioner (使用 Helm) +log_info "使用 Helm 部署 NFS Client Provisioner..." + +# 检查是否已安装,如果已安装则升级,否则安装 +if helm status nfs-client-provisioner -n kube-system &>/dev/null; then + log_info "NFS Client Provisioner 已存在,进行升级..." + helm upgrade nfs-client-provisioner "${NFS_CHART_PATH}" \ + --install \ + --namespace kube-system \ + --values "${TEMP_DIR}/nfs-provisioner-values.yaml" \ + --version "${NFS_CHART_VERSION}" || log_error "升级 NFS Client Provisioner 失败。" +else + log_info "NFS Client Provisioner 未安装,进行安装..." + helm install nfs-client-provisioner "${NFS_CHART_PATH}" \ + --namespace kube-system \ + --values "${TEMP_DIR}/nfs-provisioner-values.yaml" \ + --version "${NFS_CHART_VERSION}" || log_error "安装 NFS Client Provisioner 失败。" +fi + +log_info "NFS Client Provisioner Helm Chart 应用完成。" + +log_info "等待 NFS Client Provisioner Pod 启动 (最多 5 分钟)..." +sleep 1 +kubectl wait --for=condition=ready pod -l app=nfs-subdir-external-provisioner -n kube-system --timeout=300s || log_error "NFS Client Provisioner Pod 未能在指定时间内启动。" +log_info "NFS Client Provisioner Pods 已就绪。" + +log_info "设置默认 StorageClass 为 ${NFS_STORAGE_CLASS_NAME}..." +# 确保旧的默认 StorageClass 被取消默认 +kubectl patch storageclass $(kubectl get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}') -p '{"metadata":{"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' &>/dev/null || true +# 设置新的默认 StorageClass +kubectl patch storageclass "${NFS_STORAGE_CLASS_NAME}" -p '{"metadata":{"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' || log_error "设置 ${NFS_STORAGE_CLASS_NAME} 为默认 StorageClass 失败。" +log_info "${NFS_STORAGE_CLASS_NAME} 已设置为默认 StorageClass。" + +# ============================================================================== +# 13. KubeVirt 额外配置 (如 NetworkAttachmentDefinition 示例) +# ============================================================================== +log_info "应用 KubeVirt 额外配置 (示例 NetworkAttachmentDefinition)..." 
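+# Example sketch: a minimal bridge-based NetworkAttachmentDefinition for giving
+# KubeVirt VMs a secondary NIC. The bridge name and subnet below are placeholders;
+# adjust them to the actual L2 network before relying on this.
+cat <<EOF | kubectl apply -f - || log_warn "Failed to apply the example NetworkAttachmentDefinition; it can be created manually later."
+apiVersion: "k8s.cni.cncf.io/v1"
+kind: NetworkAttachmentDefinition
+metadata:
+  name: br0-example
+  namespace: default
+spec:
+  config: '{
+    "cniVersion": "0.3.1",
+    "type": "bridge",
+    "bridge": "br0",
+    "ipam": { "type": "host-local", "subnet": "192.168.100.0/24" }
+  }'
+EOF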
+ +# 如果需要,这里可以添加其他 NetworkAttachmentDefinition +# 例如,一个 vlan 接口 +cat < /dev/null; then + sudo ipvsadm --clear || { echo "❌ Failed to clear ipvsadm rules, but continuing..."; } +fi + +# 确保删除所有由 Docker daemon 自身创建的 K8s 相关网络(如果 Registry 运行在 Docker 上,并且 Docker daemon 也被 K8s 使用过) +# 再次注意:这一步通常在 K8s 节点上执行时安全,但需谨慎 +sudo docker network ls -q | grep -E 'k8s|cni' | xargs -r sudo docker network rm || true + +echo "" +echo "=== Kubernetes Master Node Cleanup COMPLETED ===" +echo "It is HIGHLY RECOMMENDED to reboot this node now to ensure a completely clean state." +echo "You can do this by running: sudo reboot" +echo "" + +sudo systemctl daemon-reload diff --git a/deploy/tst.sh b/deploy/tst.sh new file mode 100755 index 0000000..650d2eb --- /dev/null +++ b/deploy/tst.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +get_script_path(){ + # 获取脚本真实路径(解析软链接) + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)" + echo "$SCRIPT_DIR" +} + +echo "$(get_script_path)" diff --git a/script/ctrl_init.sh b/script/ctrl_init.sh new file mode 100644 index 0000000..059aef0 --- /dev/null +++ b/script/ctrl_init.sh @@ -0,0 +1,10 @@ +sudo kubeadm init --kubernetes-version=v1.29.0 --pod-network-cidr=10.244.0.0/16 +# 保存 kubeconfig +mkdir -p $HOME/.kube +sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config +sudo chown $(id -u):$(id -g) $HOME/.kube/config + +kubectl apply -f /opt/offline/kubevirt/kubevirt-operator.yaml +kubectl apply -f /opt/offline/kubevirt/kubevirt-cr.yaml + +kubeadm token create --print-join-command diff --git a/script/download_pkgs.sh b/script/download_pkgs.sh new file mode 100755 index 0000000..d36f513 --- /dev/null +++ b/script/download_pkgs.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -e +mkdir -p /opt/offline/{k8s,containerd,kubevirt,nvidia,dependencies} + +# ------------------------------- +# 1. Ubuntu 22.04 系统依赖 +# ------------------------------- +sudo apt update +DEBS="curl conntrack socat ipvsadm iptables bridge-utils ethtool git wget tar" +mkdir -p /opt/offline/dependencies +for pkg in $DEBS; do + apt download $pkg + mv *.deb /opt/offline/dependencies/ +done + +# ------------------------------- +# 2. Kubernetes 组件 +# ------------------------------- +K8S_VERSION="1.29.0" +mkdir -p /opt/offline/k8s +cd /opt/offline/k8s +curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubeadm +curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubelet +curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubectl +chmod +x kubeadm kubelet kubectl + +# ------------------------------- +# 3. Containerd +# ------------------------------- +CONTAINERD_VERSION="1.9.12" +cd /opt/offline/containerd +wget https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-amd64.tar.gz + +# ------------------------------- +# 4. 
NVIDIA Container Toolkit & Drivers +# ------------------------------- +mkdir -p /opt/offline/nvidia +# 下载 NVIDIA driver (根据 GPU 型号自行选择) +# 示例: NVIDIA-Linux-x86_64-525.85.12.run +wget -O /opt/offline/nvidia/NVIDIA-DRIVER.run http://us.download.nvidia.com/XFree86/Linux-x86_64/525.85.12/NVIDIA-Linux-x86_64-525.85.12.run +# 下载 NVIDIA container toolkit +wget -O /opt/offline/nvidia/nvidia-container-toolkit.deb https://github.com/NVIDIA/nvidia-docker/releases/download/v2.13.0/nvidia-container-toolkit_2.13.0-1_all.deb +wget -O /opt/offline/nvidia/nvidia-container-runtime.deb https://github.com/NVIDIA/nvidia-docker/releases/download/v2.13.0/nvidia-container-runtime_2.13.0-1_amd64.deb + +# ------------------------------- +# 5. KubeVirt Operator + CR +# ------------------------------- +mkdir -p /opt/offline/kubevirt +curl -L https://github.com/kubevirt/kubevirt/releases/download/v1.28.0/kubevirt-operator.yaml -o /opt/offline/kubevirt/kubevirt-operator.yaml +curl -L https://github.com/kubevirt/kubevirt/releases/download/v1.28.0/kubevirt-cr.yaml -o /opt/offline/kubevirt/kubevirt-cr.yaml + +# ------------------------------- +# 6. GPU Operator +# ------------------------------- +mkdir -p /opt/offline/nvidia/gpu-operator +curl -L https://github.com/NVIDIA/gpu-operator/archive/refs/heads/main.tar.gz -o /opt/offline/nvidia/gpu-operator/gpu-operator.tar.gz + +echo "Offline package download completed. All packages are in /opt/offline/" + diff --git a/script/install_offline.sh b/script/install_offline.sh new file mode 100644 index 0000000..df6a47f --- /dev/null +++ b/script/install_offline.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# GPU节点 +# sudo bash install_offline.sh gpu +# 控制节点或普通工作节点 +# sudo bash install_offline.sh +set -e + +OFFLINE_DIR="/opt/offline" + +# ------------------------------- +# 1. 安装依赖 +# ------------------------------- +dpkg -i $OFFLINE_DIR/dependencies/*.deb || apt-get -f install -y + +# ------------------------------- +# 2. 安装 containerd +# ------------------------------- +tar -C /usr/local -xzf $OFFLINE_DIR/containerd/containerd-*.tar.gz +ln -s /usr/local/bin/containerd /usr/bin/containerd +ln -s /usr/local/bin/containerd-shim /usr/bin/containerd-shim +ln -s /usr/local/bin/ctr /usr/bin/ctr +containerd --version + +# ------------------------------- +# 3. 安装 Kubernetes +# ------------------------------- +cp $OFFLINE_DIR/k8s/kubeadm /usr/bin/ +cp $OFFLINE_DIR/k8s/kubelet /usr/bin/ +cp $OFFLINE_DIR/k8s/kubectl /usr/bin/ +chmod +x /usr/bin/kubeadm /usr/bin/kubelet /usr/bin/kubectl + +# ------------------------------- +# 4. GPU 节点额外安装 NVIDIA 驱动与 runtime +# ------------------------------- +if [ "$1" == "gpu" ]; then + chmod +x $OFFLINE_DIR/nvidia/NVIDIA-DRIVER.run + $OFFLINE_DIR/nvidia/NVIDIA-DRIVER.run --silent + dpkg -i $OFFLINE_DIR/nvidia/nvidia-container-toolkit.deb + dpkg -i $OFFLINE_DIR/nvidia/nvidia-container-runtime.deb +fi + +# ------------------------------- +# 5. 
启动 containerd & kubelet +# ------------------------------- +systemctl enable containerd --now +systemctl enable kubelet --now + +echo "Offline install completed on $(hostname)" + diff --git a/script/k8s+kebuvirt/ctrl_install.sh b/script/k8s+kebuvirt/ctrl_install.sh new file mode 100644 index 0000000..151d16f --- /dev/null +++ b/script/k8s+kebuvirt/ctrl_install.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# control-plane-node-install.sh +# 运行在主控节点(假设 IP: 192.168.10.10) + +set -e + +OFFLINE_DIR=/opt/offline +K8S_VERSION=v1.29.6 +CONTROL_PLANE_IP=192.168.10.10 +API_SERVER_NAME=k8s-api.internal + +echo "=== 解压离线包 ===" +tar -xzf ${OFFLINE_DIR}/k8s-offline-all.tar.gz -C /tmp/ + +# 安装基础依赖 +dpkg -i ${OFFLINE_DIR}/debs/*.deb || apt-get -f install -y + +echo "=== 安装 containerd ===" +mkdir -p /usr/local/bin +tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz + +# 写入 systemd 服务 +cat > /etc/systemd/system/containerd.service << 'EOF' +[Unit] +Description=containerd daemon +After=network.target + +[Service] +ExecStartPre=/sbin/modprobe overlay +ExecStart=/usr/local/bin/containerd +Restart=always +Type=notify +Delegate=yes +KillMode=process + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable containerd +systemctl start containerd + +# 安装 CNI 插件 +mkdir -p /opt/cni/bin +tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/ + +# 安装 k8s 二进制 +cp ${OFFLINE_DIR}/k8s-binaries/* /usr/bin/ +chmod +x /usr/bin/kubeadm /usr/bin/kubelet /usr/bin/kubectl + +# kubelet systemd 设置 +cat > /etc/systemd/system/kubelet.service << 'EOF' +[Unit] +Description=kubelet +After=containerd.service +Requires=containerd.service + +[Service] +ExecStart=/usr/bin/kubelet +Restart=always +StartLimitInterval=0 +VolumeMountPropagation=private +Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock" + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable kubelet + +echo "=== 初始化集群 ===" +kubeadm init \ + --pod-network-cidr=10.244.0.0/16 \ + --apiserver-advertise-address=${CONTROL_PLANE_IP} \ + --kubernetes-version=${K8S_VERSION} \ + --ignore-preflight-errors=all + +mkdir -p $HOME/.kube +cp /etc/kubernetes/admin.conf $HOME/.kube/config +chown $(id -u):$(id -g) $HOME/.kube/config + +echo "=== 安装 Flannel CNI ===" +kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml + +# 标记主节点不调度 Pod(可选) +kubectl taint nodes $(hostname) node-role.kubernetes.io/control-plane:NoSchedule + +echo "✅ 控制节点安装完成" +echo "请将 ~/.kube/config 复制到其他节点或管理机" diff --git a/script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh b/script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh new file mode 100644 index 0000000..831f3a0 --- /dev/null +++ b/script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh @@ -0,0 +1,33 @@ +# deploy-kubevirt-and-gpu.sh +# 在主控节点运行 +# 加载镜像 +docker load -i /tmp/images/gpu-operator-images.tar +docker load -i /tmp/images/kubevirt-images.tar + +# 安装 Helm +tar -xzf /tmp/helm/helm.tar.gz -C /tmp/ +cp /tmp/linux-amd64/helm /usr/local/bin/helm + +# 添加仓库(离线无需 add) +helm install gpu-operator nvidia/gpu-operator \ + --version=v24.9.0 \ + --set driver.enabled=false \ # 已手动安装驱动 + --set toolkit.enabled=true \ + --set devicePlugin.enabled=true \ + --set dcgmExporter.enabled=true \ + --set migManager.enabled=true \ + --set operator.defaultRuntime=containerd + +# 等待 GPU 就绪 +watch kubectl get pods -n gpu-operator-resources + +# 安装 KubeVirt +kubectl create namespace kubevirt +kubectl apply -f 
https://github.com/kubevirt/kubevirt/releases/download/v1.1.0/kubevirt-operator.yaml +kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/v1.1.0/kubevirt-cr.yaml + +# 安装 CDI(用于导入镜像) +helm install cdi kubevirt/cdi --namespace kubevirt --version=v1.50.0 + +# 配置 NFS 动态供给(可选) +kubectl apply -f nfs-client-provisioner.yaml # 自定义配置指向你的 100T NFS diff --git a/script/k8s+kebuvirt/dl-pkgs.sh b/script/k8s+kebuvirt/dl-pkgs.sh new file mode 100644 index 0000000..a074fae --- /dev/null +++ b/script/k8s+kebuvirt/dl-pkgs.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# offline-download.sh +# 在有互联网的机器上执行,将所有依赖打包供离线部署使用 + +set -e +apt install podman-docker +export WORKDIR=/tmp/k8s-offline +for d in packages,images,k8s-binaries,helm,nvidia,gpu-operator,kubevirt +do + mkdir -p $WORKDIR/$d +done + +cd $WORKDIR + +echo "=== 下载 Kubernetes 二进制文件 ===" +K8S_VERSION=v1.29.6 +ARCH=amd64 + +curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubeadm -o k8s-binaries/kubeadm +curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubelet -o k8s-binaries/kubelet +curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubectl -o k8s-binaries/kubectl + +chmod +x k8s-binaries/* + +echo "=== 下载 containerd ===" +CONTAINERD_VERSION=1.7.16 +curl -L --retry 3 https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-amd64.tar.gz -o packages/containerd.tar.gz + +echo "=== 下载 runc ===" +RUNC_VERSION=v1.1.13 +curl -L --retry 3 https://github.com/opencontainers/runc/releases/download/${RUNC_VERSION}/runc.amd64 -o packages/runc && chmod +x packages/runc + +echo "=== 下载 CNI 插件 ===" +CNI_VERSION=v1.4.1 +curl -L --retry 3 https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-amd64-${CNI_VERSION}.tgz -o packages/cni-plugins.tgz + +echo "=== 下载 Helm ===" +HELM_VERSION=v3.13.3 +curl -L --retry 3 https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz -o helm/helm.tar.gz + +echo "=== 下载 NVIDIA Driver(仅元信息,实际需手动获取)===" +echo "注意:NVIDIA 驱动无法直接 wget,请从官网下载:" +echo "https://www.nvidia.com/Download/index.aspx?lang=en-us" +echo "选择 A100-SXM4 / Data Center Driver for Linux x86_64" +echo "保存为: nvidia/NVIDIA-Linux-x86_64-535.161.08.run" + +echo "=== 下载 NVIDIA Container Toolkit 依赖(通过 apt 离线包)===" +# 使用 docker pull + save 方式更可靠 +echo "准备构建本地 apt repo 或使用 .deb 包方式" + +# 推荐方法:在一台联网 Ubuntu 22.04 上执行: +cat > prepare-debs.sh << 'EOF' +#!/bin/bash +mkdir -p /tmp/debs +apt update +apt install -y --download-only curl conntrack socat ipvsadm iptables bridge-utils ethtool git wget tar +apt install -y --download-only nfs-utils nfs-common +apt install -y --download-only nvidia-driver-535 nvidia-utils-535 nvidia-dkms-535 +apt install -y --download-only nvidia-container-toolkit +cp /var/cache/apt/archives/*.deb /path/to/offline/nvidia/ +EOF + +echo "请运行 prepare-debs.sh 获取 .deb 包" + +echo "=== 拉取 GPU Operator 所需镜像 ===" +# GPU Operator 会拉取多个镜像,我们预先列出并导出 +cat > gpu-operator-images.txt << 'EOF' +nvcr.io/nvidia/gpu-operator:v24.9.0 +nvcr.io/nvidia/gpu-feature-discovery:v0.8.0 +nvcr.io/nvidia/driver:535.161.08-ubuntu22.04 +nvcr.io/nvidia/container-toolkit:1.14.2-ubuntu22.04 +nvcr.io/nvidia/dcgm:3.1.7-3-ubuntu22.04 +nvcr.io/nvidia/k8s-device-plugin:0.14.2-ubi8 +nvcr.io/nvidia/k8s-operator-validator:v1.2.0 +EOF + +while read img; do + echo "Pulling $img" + docker pull $img || echo "Failed: $img" +done < gpu-operator-images.txt + +# 保存镜像为 tar 文件 +docker save $(cat gpu-operator-images.txt | tr '\n' ' ') -o 
images/gpu-operator-images.tar + +echo "=== 拉取 KubeVirt 组件镜像 ===" +KV_VERSION=v1.1.0 +cat > kubevirt-images.txt << EOF +quay.io/kubevirt/virt-operator:${KV_VERSION} +quay.io/kubevirt/virt-api:${KV_VERSION} +quay.io/kubevirt/virt-controller:${KV_VERSION} +quay.io/kubevirt/virt-handler:${KV_VERSION} +quay.io/kubevirt/virt-launcher:${KV_VERSION} +quay.io/kubevirt/cdi-operator:v1.50.0 +quay.io/kubevirt/cdi-apiserver:v1.50.0 +quay.io/kubevirt/cdi-uploadproxy:v1.50.0 +quay.io/kubevirt/cdi-cloner:v1.50.0 +quay.io/kubevirt/cdi-importer:v1.50.0 +quay.io/kubevirt/cdi-uploadserver:v1.50.0 +EOF + +while read img; do + docker pull $img || echo "Failed: $img" +done < kubevirt-images.txt + +docker save $(cat kubevirt-images.txt | tr '\n' ' ') -o images/kubevirt-images.tar + +echo "=== 创建最终离线包 ===" +tar -czf k8s-offline-all.tar.gz . + +echo "✅ 所有离线资源已生成:k8s-offline-all.tar.gz" +echo "请将其复制到目标环境并解压" diff --git a/script/k8s+kebuvirt/gpuworker_install.sh b/script/k8s+kebuvirt/gpuworker_install.sh new file mode 100644 index 0000000..aad9fa7 --- /dev/null +++ b/script/k8s+kebuvirt/gpuworker_install.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# worker-gpu-install.sh +# 在每个有 A100 的 GPU 节点上运行 + +set -e + +OFFLINE_DIR=/opt/offline + +# 安装 containerd、k8s 二进制(同上) +tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz +mkdir -p /opt/cni/bin +tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/ + +cp ${OFFLINE_DIR}/k8s-binaries/kubeadm /usr/bin/ +cp ${OFFLINE_DIR}/k8s-binaries/kubelet /usr/bin/ +chmod +x /usr/bin/kubeadm /usr/bin/kubelet + +# 配置 containerd 和 kubelet(同上) +cat > /etc/systemd/system/containerd.service << 'EOF' +[Unit] +Description=containerd daemon +After=network.target + +[Service] +ExecStartPre=/sbin/modprobe overlay +ExecStart=/usr/local/bin/containerd +Restart=always +Type=notify +Delegate=yes +KillMode=process + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable containerd +systemctl start containerd + +cat > /etc/systemd/system/kubelet.service << 'EOF' +[Unit] +Description=kubelet +After=containerd.service +Requires=containerd.service + +[Service] +ExecStart=/usr/bin/kubelet +Restart=always +StartLimitInterval=0 +VolumeMountPropagation=private +Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock" + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable kubelet + +# 安装 NVIDIA 驱动 +echo "=== 安装 NVIDIA 驱动 ===" +chmod +x ${OFFLINE_DIR}/nvidia/NVIDIA-Linux-x86_64-*.run +${OFFLINE_DIR}/nvidia/NVIDIA-Linux-x86_64-535.161.08.run -s --dkms --no-opengl-files + +# 加载内核模块 +modprobe nvidia +modprobe nvidia-uvm + +# 安装 NVIDIA Container Toolkit +dpkg -i ${OFFLINE_DIR}/nvidia/nvidia-container-toolkit*.deb +systemctl restart containerd + +# 开启 MIG 模式(A100 必须) +echo "=== 配置 MIG 模式 ===" +# 示例:每张卡切分为 2 个 MIG 实例(可根据需求调整) +nvidia-smi -i 0 -mig 1 +sleep 5 +# 创建实例(示例:创建两个 3g.20gb 实例) +nvidia-smi mig -i 0 -cgi 3g.20gb,3g.20gb -C +nvidia-smi mig -i 1 -cgi 3g.20gb,3g.20gb -C +# ... 
对所有卡重复 + +# 标记节点为 GPU 节点 +cat > /tmp/gpu-label.yaml << 'EOF' +apiVersion: v1 +kind: Node +metadata: + name: $(hostname) + labels: + node-type: gpu-worker + nvidia.com/gpu.present: "true" +EOF + +# 注意:join 后再应用 label +echo "✅ 安装完成,请先加入集群" +echo "然后在 master 上运行:kubectl label node $(hostname) node-type=gpu-worker nvidia.com/gpu.present=true" diff --git a/script/k8s+kebuvirt/nfs-client-provisioner.yaml b/script/k8s+kebuvirt/nfs-client-provisioner.yaml new file mode 100644 index 0000000..48926a2 --- /dev/null +++ b/script/k8s+kebuvirt/nfs-client-provisioner.yaml @@ -0,0 +1,47 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: nfs-client +provisioner: k8s-sigs.io/nfs-subdir-external-provisioner +parameters: + archiveOnDelete: "false" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nfs-client-provisioner + labels: + app: nfs-client-provisioner + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: nfs-client-provisioner + strategy: + type: Recreate + template: + metadata: + labels: + app: nfs-client-provisioner + spec: + serviceAccountName: nfs-client-provisioner + containers: + - name: nfs-client-provisioner + image: registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 + volumeMounts: + - name: nfs-client-root + mountPath: /persistentvolumes + env: + - name: PROVISIONER_NAME + value: k8s-sigs.io/nfs-subdir-external-provisioner + - name: NFS_SERVER + value: 192.168.10.1 # 替换为你的 NFS 服务器 IP + - name: NFS_PATH + value: /export/k8s + volumes: + - name: nfs-client-root + nfs: + server: 192.168.10.1 + path: /export/k8s + diff --git a/script/k8s+kebuvirt/worker_install.sh b/script/k8s+kebuvirt/worker_install.sh new file mode 100644 index 0000000..bebc9b6 --- /dev/null +++ b/script/k8s+kebuvirt/worker_install.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# worker-cpu-install.sh +# 所有无 GPU 的工作节点运行此脚本 + +set -e + +OFFLINE_DIR=/opt/offline + +# 安装 containerd、CNI、k8s 二进制(同 control plane) +tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz +mkdir -p /opt/cni/bin +tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/ + +cp ${OFFLINE_DIR}/k8s-binaries/kubeadm /usr/bin/ +cp ${OFFLINE_DIR}/k8s-binaries/kubelet /usr/bin/ +chmod +x /usr/bin/kubeadm /usr/bin/kubelet + +# 同样配置 containerd 和 kubelet +cat > /etc/systemd/system/containerd.service << 'EOF' +[Unit] +Description=containerd daemon +After=network.target + +[Service] +ExecStartPre=/sbin/modprobe overlay +ExecStart=/usr/local/bin/containerd +Restart=always +Type=notify +Delegate=yes +KillMode=process + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable containerd +systemctl start containerd + +cat > /etc/systemd/system/kubelet.service << 'EOF' +[Unit] +Description=kubelet +After=containerd.service +Requires=containerd.service + +[Service] +ExecStart=/usr/bin/kubelet +Restart=always +StartLimitInterval=0 +VolumeMountPropagation=private +Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock" + +[Install] +WantedBy=multi-user.target +EOF + +systemctl enable kubelet + +echo "✅ 准备加入集群,请在主控节点获取 join 命令:" +echo "kubeadm token create --print-join-command" +echo "然后在此节点执行输出的命令"
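+# Example of what the join command printed on the control node looks like
+# (placeholders only, use the real output of `kubeadm token create --print-join-command`):
+#   kubeadm join <CONTROL_PLANE_IP>:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>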