From f6916a9a77bd489df4b5de8bca834b04361c4b24 Mon Sep 17 00:00:00 2001 From: yumoqing Date: Wed, 26 Nov 2025 21:34:17 +0800 Subject: [PATCH] buggfix --- cluster-config.yaml | 10 +- downloader/dl.sh | 14 +- installer/templates/master.sh.j2 | 746 +++++++++++++++++++++++++++---- 3 files changed, 674 insertions(+), 96 deletions(-) diff --git a/cluster-config.yaml b/cluster-config.yaml index fca9eab..9e6fd04 100644 --- a/cluster-config.yaml +++ b/cluster-config.yaml @@ -1,25 +1,20 @@ # cluster-config.yaml - 修改为你的实际集群配置 kubernetes: - version: "1.29.3" + version: "1.28.2" pod_cidr: "10.244.0.0/16" service_cidr: "10.96.0.0/12" cluster_name: "offline-k8s-cluster" -k8s_version: "1.29.3" -containerd_version: "1.7.13" +containerd_version: "v1.6.37" crictl_version: "1.29.0" - nfs_server: "192.168.16.2" nfs_path: "/d/share/101206" - registry: "registry.local:5000" # 可选:若使用私有 registry -control_plane_ip: "192.168.16.5" - nodes: control_plane: hostname: "k8s-master" @@ -29,7 +24,6 @@ join: token: "abcdef.0123456789abcdef" hash: "123456abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234" -# ===================================================== # GPU Operator / NVIDIA 配置 # ===================================================== gpu: diff --git a/downloader/dl.sh b/downloader/dl.sh index 96d8268..04c97ae 100644 --- a/downloader/dl.sh +++ b/downloader/dl.sh @@ -14,8 +14,13 @@ NVIDIA_DRIVER_VERSION="535.129.03" # ========================================= echo ">>> [0/6] 初始化目录..." -mkdir -p $WORKDIR/{bin,debs,images,drivers,charts,manifests,scripts} -PKGS_TO_DOWNLOAD="nfs-common socat conntrack ipset ebtables lvm2 gnupg2 software-properties-common curl ca-certificates apt-transport-https redis-server" +mkdir -p $WORKDIR/{bin,service, debs,images,drivers,charts,manifests,scripts} + +echo ">>>[x] 下载containerd.service" +cd $WORKDIR/service +sudo curl -L https://raw.githubusercontent.com/containerd/containerd/main/containerd.service -o containerd.service + +PKGS_TO_DOWNLOAD="docker.io nfs-common socat conntrack ipset ebtables lvm2 gnupg2 software-properties-common curl ca-certificates apt-transport-https redis-server" cd $WORKDIR/debs sudo apt-get update -q for pkg in $PKGS_TO_DOWNLOAD; do @@ -61,7 +66,8 @@ if [ ! -f "cni-plugins-linux-amd64-${CNI_VERSION}.tgz" ]; then echo "Downloading CNI Plugins..." wget -q https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-amd64-${CNI_VERSION}.tgz fi - +# containerd +curl -L --retry 3 https://github.com/containerd/containerd/releases/download/v1.6.37/containerd-1.6.37-linux-amd64.tar.gz -o containerd-1.6.37-linux-amd64.tar.gz echo "Binaries ready." # ================= 2. 容器镜像 ================= @@ -79,7 +85,7 @@ IMAGES=( "registry.k8s.io/kube-scheduler:v${K8S_VERSION}" "registry.k8s.io/kube-proxy:v${K8S_VERSION}" "registry.k8s.io/pause:3.9" - "registry.k8s.io/etcd:3.5.12-0" + "registry.k8s.io/etcd:3.5.9-0" "registry.k8s.io/coredns/coredns:v1.10.1" "docker.io/calico/cni:${CALICO_VERSION}" "docker.io/calico/node:${CALICO_VERSION}" diff --git a/installer/templates/master.sh.j2 b/installer/templates/master.sh.j2 index b672b03..ab83191 100644 --- a/installer/templates/master.sh.j2 +++ b/installer/templates/master.sh.j2 @@ -1,107 +1,685 @@ #!/bin/bash -source ./common.sh +set -eo pipefail # 脚本遇到任何错误立即退出,未捕捉的管道错误也退出 -echo "[INFO] === 初始化 Master 节点 ===" +# ============================================================================== +# 配置区域 +# ============================================================================== +OFFLINE_ASSETS_DIR="/root/k8s-offline-bundle" -cat < kubeadm-config.yaml -apiVersion: kubeadm.k8s.io/v1beta3 -kind: ClusterConfiguration -kubernetesVersion: v{{ cluster.kubernetes_version }} -controlPlaneEndpoint: "{{ cluster.api_server_ip }}:6443" -networking: - podSubnet: "{{ cluster.pod_cidr }}" - serviceSubnet: "{{ cluster.service_cidr }}" -imageRepository: {{ registry.ip }}:{{ registry.port }} ---- -apiVersion: kubelet.config.k8s.io/v1beta1 -kind: KubeletConfiguration -cgroupDriver: systemd -CFG +K8S_VERSION="v1.28.2" +CALICO_VERSION="v3.26.1" +KUBEVIRT_VERSION="v1.1.0" +NFS_PROVISIONER_VERSION="v4.0.2" # 镜像标签 +NFS_CHART_VERSION="4.0.18" # Helm Chart 版本 -# 预先检查 -kubeadm init phase preflight --config kubeadm-config.yaml --ignore-preflight-errors=all +LOCAL_REGISTRY_IP="192.168.16.5" +LOCAL_REGISTRY_PORT="5000" +LOCAL_REGISTRY="${LOCAL_REGISTRY_IP}:${LOCAL_REGISTRY_PORT}" -# 正式初始化 -# 注意:因为我们已经手动导入了镜像,不需要 kubeadm pull -kubeadm init --config kubeadm-config.yaml --upload-certs | tee kubeadm-init.log +K8S_APISERVER_ADVERTISE_ADDRESS="${LOCAL_REGISTRY_IP}" +POD_CIDR="192.168.0.0/16" +SERVICE_CIDR="10.96.0.0/12" -mkdir -p $HOME/.kube -cp -i /etc/kubernetes/admin.conf $HOME/.kube/config -chown $(id -u):$(id -g) $HOME/.kube/config - -echo "[INFO] 部署网络插件 (Calico)..." -kubectl apply -f "$BUNDLE_ROOT/manifests/calico.yaml" +NFS_SERVER="192.168.16.2" +NFS_PATH="/d/share/101206" +NFS_STORAGE_CLASS_NAME="nfs-client" +TEMP_DIR="/tmp/k8s-master-setup" NAMESPACE="default" -LOCAL_REGISTRY="{{ registry.ip }}:{{ registry.port }}" -echo "[INFO] 5. 导入离线镜像..." -if [ -d "$IMAGES_DIR" ]; then - for tarfile in "$IMAGE_DIR"/*.tar; do - [ -e "$tarfile" ] || continue +LOCAL_REGISTRY_IP="192.168.16.5" +LOCAL_REGISTRY_PORT="5000" +LOCAL_REGISTRY_ADDR="${LOCAL_REGISTRY_IP}:${LOCAL_REGISTRY_PORT}" +CONTAINERD_CONFIG="/etc/containerd/config.toml" +CERTS_D_PATH="/etc/containerd/certs.d" +CALICO_YAML_PATH="$OFFLINE_ASSETS_DIR/manifests/calico.yaml" # 请确认这个路径 +CALICO_VERSION="v3.26.1" +mkdir -p ${TEMP_DIR} - echo "" - echo ">>> Processing $tarfile" +echo "==================================================" +echo " Kubernetes 控制节点离线安装脚本 " +echo "==================================================" +echo "配置参数:" +echo " K8s 版本: ${K8S_VERSION}" +echo " 本地镜像仓库: ${LOCAL_REGISTRY_ADDR}" +echo " K8s API Server IP: ${K8S_APISERVER_ADVERTISE_ADDRESS}" +echo " Pod CIDR: ${POD_CIDR}" +echo " NFS Server: ${NFS_SERVER}:${NFS_PATH}" +echo "--------------------------------------------------" - # 1️⃣ 导入镜像 - ctr -n "$NAMESPACE" images import "$tarfile" +# ============================================================================== +# 通用函数 (common.sh 中的内容,为简化,直接内联到这里) +# ============================================================================== - # 2️⃣ 获取最新导入镜像(兼容老版本 ctr) - ORIGIN_IMG=$(ctr -n "$NAMESPACE" images ls -q | head -n1) - if [[ -z "$ORIGIN_IMG" ]]; then - echo "❌ Failed to detect original image name, skipping..." - continue - fi - echo "Original image: $ORIGIN_IMG" +log_info() { + echo -e "\e[32m[INFO] $(date +'%Y-%m-%d %H:%M:%S') $1\e[0m" +} - # 3️⃣ 根据 tar 文件名生成本地 registry 镜像名 - # 文件名示例:docker.io_calico_cni_v3.26.1.tar - BASENAME=$(basename "$tarfile" .tar) - BASENAME=${BASENAME#*_} # 去掉 registry 前缀: calico_cni_v3.26.1 - NAME_TAG=${BASENAME} - NAME=${NAME_TAG%_*} # calico_cni - TAG=${NAME_TAG##*_} # v3.26.1 - NEW_IMG="${LOCAL_REGISTRY}/${NAME}:${TAG}" +log_error() { + echo -e "\e[31m[ERROR] $(date +'%Y-%m-%d %H:%M:%S') $1\e[0m" >&2 + exit 1 +} - echo "Retag as: $NEW_IMG" +command_exists() { + command -v "$1" >/dev/null 2>&1 +} - # 4️⃣ 打 tag - ctr -n "$NAMESPACE" images tag "$ORIGIN_IMG" "$NEW_IMG" +check_root() { + if [[ $EUID -ne 0 ]]; then + log_error "此脚本必须以 root 用户或使用 sudo 运行。" + fi +} - # 5️⃣ 推送到本地 registry - ctr -n "$NAMESPACE" images push --plain-http "$NEW_IMG" +configure_sysctl() { + log_info "配置系统内核参数..." + cat < | grep "^\w" | sort -u) +# 但目前我们只有你提供的列表,直接全部安装 +sudo dpkg -i *.deb || true # 第一次安装可能因为依赖失败,忽略错误 + +# 再次尝试,确保所有包尽可能安装 +sudo dpkg -i *.deb || true + +# 离线环境apt install -f 无法工作,这里假设所有必要依赖都在debs目录中 +# 如果有未满足的依赖,这里会显示错误 +log_info "已尝试安装所有DEB包。请检查上述输出是否有未满足的依赖。" +cd - > /dev/null + +# ============================================================================== +# 2. 安装 Docker (仅用于本地镜像仓库) +# ============================================================================== +log_info "安装 Docker daemon (仅用于本地镜像仓库) ..." +# 由于你的DEB包列表中没有docker-ce或docker.io的deb包 +# 假设docker已经通过其他方式安装,或者这里需要补充下载docker的deb包 +# 暂时跳过docker的deb包安装,直接检查docker命令是否存在 + +if ! command_exists docker; then + log_error "未检测到 Docker CLI。请确保已安装 Docker (或其他兼容的容器引擎如Podman)。" + log_info "如果在离线环境中,请将 docker-ce 及其依赖的 .deb 包下载并放在 debs 目录中进行安装。" fi -echo "[INFO] 部署本地 Registry 容器..." -mkdir -p /opt/registry-data -ctr images import $IMAGES_DIR/registry_2.tar -ctr container create \ - --net-host \ - --mount type=bind,src=/opt/registry-data,dst=/var/lib/registry,options=rbind:rw \ - docker.io/library/registry:2 \ - registry-local -nohup ctr task start registry-local & +# 配置 Docker daemon 以信任本地仓库 (针对非 HTTPS) +log_info "配置 Docker daemon 信任本地仓库 ${LOCAL_REGISTRY_ADDR} (针对非 HTTPS)..." +sudo mkdir -p /etc/docker +cat < /dev/null +fi + +# --- 2. 创建必要的目录结构 --- +echo "Creating necessary directory structure under ${CERTS_D_PATH}" +sudo mkdir -p "${CERTS_D_PATH}/${LOCAL_REGISTRY_ADDR}" +sudo mkdir -p "${CERTS_D_PATH}/registry.k8s.io" + +# --- 3. 生成 hosts.toml 文件 --- + +# 为本地 Registry 配置 hosts.toml (http, skip_verify) +echo "Creating ${CERTS_D_PATH}/${LOCAL_REGISTRY_ADDR}/hosts.toml" +sudo tee "${CERTS_D_PATH}/${LOCAL_REGISTRY_ADDR}/hosts.toml" > /dev/null < /dev/null < /dev/null + +# ============================================================================== +# 7. 导入并推送到本地镜像仓库 (使用 Docker CLI,因为目标是 Docker Registry) +# ============================================================================== +log_info "导入并推送到本地镜像仓库 (使用 Docker CLI)..." +IMAGE_TAR_FILES=$(find "${OFFLINE_ASSETS_DIR}/images" -name "*.tar") + +echo "### Cleaning up local Docker Registry and containerd storage ###" + +# 1. 清理本地 Docker Registry (停止并删除容器及数据卷) +echo " Stopping and removing local Docker Registry container: ${LOCAL_REGISTRY_ADDR}" +# 假设 Registry 容器名为 'registry' +sudo docker stop registry || true # 停止容器,如果不存在则忽略错误 +sudo docker rm -v registry || true # 删除容器及其匿名数据卷,如果不存在则忽略错误 +# 如果你的 Registry 使用了具名数据卷 (named volume),需要额外删除它 +# 例如:sudo docker volume rm my-registry-volume || true + +echo " Restarting a fresh local Docker Registry container." +# 重新启动一个干净的 Registry 容器 +# 确保你的 Registry 容器名是 'registry',如果不是请修改 +sudo docker run -d -p 5000:5000 --restart=always --name registry registry:2 +# 稍等片刻,确保 Registry 完全启动 +sleep 5 +echo " Local Docker Registry is ready." + +# 2. 清理 containerd 本地存储中的所有镜像 +echo " Cleaning up existing images from containerd local storage..." +# 获取所有非 且不带 LOCAL_REGISTRY_ADDR 前缀的镜像,以及带 LOCAL_REGISTRY_ADDR 前缀的镜像 +# 这里的目的是删除所有可能由之前操作留下的镜像,包括原始的和打了本地标签的。 +# 排除掉正在被 kubelet 启动的 pause 镜像等,以防万一 +ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | while read -r image_ref; do + # 避免删除 LOCAL_REGISTRY_ADDR 容器本身(如果它也被导入了) + if [[ "$image_ref" == "${LOCAL_REGISTRY_ADDR}/registry:2" || "$image_ref" == "docker.io/library/registry:2" ]]; then + echo " Skipping deletion of registry image: $image_ref" + continue + fi + if [[ "$image_ref" == "" ]]; then + continue # 跳过 镜像,它们通常是悬空层 + fi + echo " Deleting containerd image: $image_ref" + ctr -n "$NAMESPACE" images rm "$image_ref" || true # || true 避免因为镜像正在被使用而中断 +done +echo "### Finished cleaning up local environment. ###" + +IMAGE_DIR=$OFFLINE_ASSETS_DIR/images + +echo "=== Importing images from $IMAGE_DIR to local registry $LOCAL_REGISTRY_ADDR ===" + +for tarfile in "$IMAGE_DIR"/*.tar; do + [ -e "$tarfile" ] || continue + + echo "" + echo ">>> Processing $tarfile" + + # 1️⃣ 获取导入前的镜像列表 + IMAGES_BEFORE=$(mktemp) + # ctr images ls 的第一列就是 REF (镜像名称),使用 awk 提取 + if ! ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_BEFORE"; then + echo "❌ Failed to get images list before import." + continue + fi + + # Debug: + # echo "Images BEFORE import for $tarfile:" + # cat "$IMAGES_BEFORE" + + # 2️⃣ 导入镜像 + if ! ctr -n "$NAMESPACE" images import "$tarfile"; then + echo "❌ Failed to import image from $tarfile." + rm -f "$IMAGES_BEFORE" # 清理临时文件 + continue + fi + + # 3️⃣ 获取导入后的镜像列表 + IMAGES_AFTER=$(mktemp) + if ! ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_AFTER"; then + echo "❌ Failed to get images list after import." + rm -f "$IMAGES_BEFORE" # 清理临时文件 + continue + fi + + # Debug: + # echo "Images AFTER import for $tarfile:" + # cat "$IMAGES_AFTER" + # echo "Raw difference (comm -13):" + # comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER" + + # 4️⃣ 找出新增的镜像 (即原始镜像)。排除掉带有本地Registry前缀的镜像本身。 + # 过滤条件:排除本地 registry 已存在的镜像,以及 引用。 + # 因为导入的 tarfile 可能会包含多个 tag,我们只取第一个符合条件的 + ORIGIN_IMG=$(comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER" | grep -vE "${LOCAL_REGISTRY_ADDR}|" | head -n1) + + rm -f "$IMAGES_BEFORE" "$IMAGES_AFTER" # 清理临时文件 + + if [[ -z "$ORIGIN_IMG" ]]; then + echo "❌ Failed to detect original image name, skipping..." + continue + fi + echo "Original image: $ORIGIN_IMG" + + NEW_IMG="" + if [[ "$ORIGIN_IMG" == "registry.k8s.io/"* ]]; then + if [[ "$ORIGIN_IMG" == "registry.k8s.io/coredns/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/coredns/}" + else + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/}" + fi + elif [[ "$ORIGIN_IMG" == "ghcr.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#ghcr.io/}" + elif [[ "$ORIGIN_IMG" == "quay.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#quay.io/}" + elif [[ "$ORIGIN_IMG" == "nvcr.io/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#nvcr.io/}" + elif [[ "$ORIGIN_IMG" == "docker.io/"* ]]; then + if [[ "$ORIGIN_IMG" == "docker.io/library/"* ]]; then + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/library/}" + else + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/}" + fi + else + echo "Warning: Unknown original registry prefix for $ORIGIN_IMG. Directly prepending LOCAL_REGISTRY_ADDR." + NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG}" + fi + + echo "Retag as: $NEW_IMG" + + # 4️⃣ 打 tag + ctr -n "$NAMESPACE" images tag "$ORIGIN_IMG" "$NEW_IMG" + + # 5️⃣ 推送到本地 registry + ctr -n "$NAMESPACE" images push --plain-http "$NEW_IMG" + echo "tarfile=$tarfile ORIGIN_IMG=$ORIGIN_IMG NEW_IMG=$NEW_IMG" + + echo "✅ Done: $NEW_IMG" +done + +log_info "所有镜像已导入并推送到本地镜像仓库。" +cd - > /dev/null + +# ============================================================================== +# 8. 初始化 Kubernetes 控制平面 +# ============================================================================== +log_info "初始化 Kubernetes 控制平面..." + +# 生成 kubeadm 配置 +cat < ../../output/join_cluster.sh -chmod +x ../../output/join_cluster.sh -echo "Master 部署完成!请检查 kubectl get nodes"