bugfix
This commit is contained in:
parent
8084dbabd4
commit
40087b4085
27
deploy/README.md
Normal file
27
deploy/README.md
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# k8s + kubevirt

实现在k8s环境中分配虚拟机给客户,提高了售卖算力单元的隔离性和安全性,更好的资源管理和控制

## 环境说明

* ubuntu 22.04
* NFS共享存储提供虚拟机所需的存储

## 实现功能

* 按需分配的虚拟机算力,纯cpu算力和gpu算力
* 算力节点全生命周期管理:创建,启动,关闭,改配,销毁
* 提供本地镜像仓库

## 安装部署

实现离线安装部署,所需安装包均在有网络的环境中下载,并传输到目标主机。

实现控制节点和工作节点的安装部署自动化。

安装前需要根据实际环境修改部分参数。

### 文件说明

* dl.sh 环境所需软件的下载脚本,需要在有网络的环境中执行,并且能无障碍地访问github
* master-install.sh 控制节点一键安装脚本(需要按照实际环境修改参数)
* worker-install.sh 工作节点一键安装脚本(需要根据实际环境修改参数)
178
deploy/dl.sh
Normal file
178
deploy/dl.sh
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
# NGC API key: create one at https://org.ngc.nvidia.com/setup/api-keys
# SECURITY: never commit an API key to the repository. The key that was
# previously pasted here has been removed and must be revoked; read it from
# the NGC_API_KEY environment variable instead.
|
||||||
|
# =================配置区域=================
|
||||||
|
get_script_path(){
    # Resolve the directory that contains this script, following any
    # symlinks via `pwd -P`, and print the absolute path on stdout.
    local resolved
    resolved="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
    printf '%s\n' "$resolved"
}
|
||||||
|
MYPATH=$(get_script_path)
|
||||||
|
ARCH=amd64
|
||||||
|
WORKDIR=${MYPATH}/k8s-offline-bundle
|
||||||
|
K8S_VERSION="1.28.2"
|
||||||
|
HELM_VERSION="v3.13.1"
|
||||||
|
CNI_VERSION="v1.3.0"
|
||||||
|
CALICO_VERSION="v3.26.1"
|
||||||
|
KUBEVIRT_VERSION="v1.1.0" # 升级到更稳定的版本
|
||||||
|
NVIDIA_DRIVER_VERSION="535.129.03"
|
||||||
|
# =========================================
|
||||||
|
|
||||||
|
echo ">>> [0/6] 初始化目录..."
# Fix: the original brace list contained a stray space ("service, debs"),
# which stops brace expansion and splits the argument into two invalid
# words, so the bundle sub-directories were never created. The braces must
# stay outside the quotes for expansion to occur.
mkdir -p "$WORKDIR"/{bin,service,debs,images,drivers,charts,manifests,scripts}
|
||||||
|
|
||||||
|
echo ">>>[x] 下载containerd.service"
|
||||||
|
cd $WORKDIR/service
|
||||||
|
sudo curl -L https://raw.githubusercontent.com/containerd/containerd/main/containerd.service -o containerd.service
|
||||||
|
|
||||||
|
PKGS_TO_DOWNLOAD="nfs-common socat conntrack ipset ebtables lvm2 gnupg2 software-properties-common curl ca-certificates apt-transport-https"
|
||||||
|
cd $WORKDIR/debs
|
||||||
|
sudo apt-get update -q
|
||||||
|
for pkg in $PKGS_TO_DOWNLOAD; do
|
||||||
|
echo "Processing package: $pkg"
|
||||||
|
# 使用 apt-rdepends 找出依赖并下载 (需要先安装: sudo apt install apt-rdepends)
|
||||||
|
# 如果没有 apt-rdepends,可以用简化的 apt-get download,但可能漏掉深层依赖
|
||||||
|
# 这里使用一种更通用的方法,尝试下载包本身
|
||||||
|
apt-get download "$pkg" 2>/dev/null || echo "Warning: Failed to download $pkg"
|
||||||
|
done
|
||||||
|
apt-get download build-essential linux-headers-$(uname -r) pkg-config 2>/dev/null
|
||||||
|
# 然后使用 apt-get download 下载包及其所有依赖
|
||||||
|
sudo apt-get download nvidia-container-toolkit libnvidia-container-tools libnvidia-container1 nvidia-container-runtime cuda-keyring
|
||||||
|
ls -l $WORKDIR/debs
|
||||||
|
|
||||||
|
# 检查 Docker 是否存在 (下载镜像必须)
|
||||||
|
if ! command -v docker &> /dev/null; then
|
||||||
|
echo "正在安装 Docker (用于拉取镜像)..."
|
||||||
|
apt-get update && apt-get install -y docker.io
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ================= 1. 二进制文件 =================
|
||||||
|
echo ">>> [1/6] 下载二进制工具 (Helm, CNI)..."
|
||||||
|
cd $WORKDIR/bin
|
||||||
|
|
||||||
|
# 1. Kubernetes Binaries (kubelet, kubeadm, kubectl)
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/v${K8S_VERSION}/bin/linux/${ARCH}/kubeadm -o kubeadm
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/v${K8S_VERSION}/bin/linux/${ARCH}/kubelet -o kubelet
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/v${K8S_VERSION}/bin/linux/${ARCH}/kubectl -o kubectl
|
||||||
|
chmod +x kubeadm kubelet kubectl
|
||||||
|
|
||||||
|
# Helm
|
||||||
|
if [ ! -f "helm" ]; then
|
||||||
|
echo "Downloading Helm..."
|
||||||
|
wget -q https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz
|
||||||
|
tar -zxvf helm-${HELM_VERSION}-linux-amd64.tar.gz
|
||||||
|
mv linux-amd64/helm .
|
||||||
|
rm -rf linux-amd64 helm-*.tar.gz
|
||||||
|
fi
|
||||||
|
|
||||||
|
# CNI Plugins
|
||||||
|
if [ ! -f "cni-plugins-linux-amd64-${CNI_VERSION}.tgz" ]; then
|
||||||
|
echo "Downloading CNI Plugins..."
|
||||||
|
wget -q https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-amd64-${CNI_VERSION}.tgz
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Binaries ready."
|
||||||
|
|
||||||
|
# ================= 2. 容器镜像 =================
|
||||||
|
echo ">>> [2/6] 拉取并打包容器镜像 (这需要较长时间)..."
|
||||||
|
# 确保 Docker 守护进程在运行
|
||||||
|
service docker start || true
|
||||||
|
|
||||||
|
# 定义镜像列表
|
||||||
|
# 包含: K8s 核心, Calico, Multus, KubeVirt, NFS, Nvidia相关
|
||||||
|
# 注意: Pause 镜像版本需与 kubeadm config 中一致
|
||||||
|
NVIDIA_REPO="nvcr.io/nvidia"
|
||||||
|
IMAGES=(
|
||||||
|
"registry.k8s.io/kube-apiserver:v${K8S_VERSION}"
|
||||||
|
"registry.k8s.io/kube-controller-manager:v${K8S_VERSION}"
|
||||||
|
"registry.k8s.io/kube-scheduler:v${K8S_VERSION}"
|
||||||
|
"registry.k8s.io/kube-proxy:v${K8S_VERSION}"
|
||||||
|
"registry.k8s.io/pause:3.9"
|
||||||
|
"registry.k8s.io/etcd:3.5.12-0"
|
||||||
|
"registry.k8s.io/coredns/coredns:v1.10.1"
|
||||||
|
"docker.io/calico/cni:${CALICO_VERSION}"
|
||||||
|
"docker.io/calico/node:${CALICO_VERSION}"
|
||||||
|
"docker.io/calico/kube-controllers:${CALICO_VERSION}"
|
||||||
|
"docker.io/library/registry:2"
|
||||||
|
"ghcr.io/k8snetworkplumbingwg/multus-cni:v4.0.2"
|
||||||
|
"quay.io/kubevirt/virt-operator:${KUBEVIRT_VERSION}"
|
||||||
|
"quay.io/kubevirt/virt-api:${KUBEVIRT_VERSION}"
|
||||||
|
"quay.io/kubevirt/virt-controller:${KUBEVIRT_VERSION}"
|
||||||
|
"quay.io/kubevirt/virt-handler:${KUBEVIRT_VERSION}"
|
||||||
|
"quay.io/kubevirt/virt-launcher:${KUBEVIRT_VERSION}"
|
||||||
|
"registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2"
|
||||||
|
"nvcr.io/nvidia/k8s-device-plugin:v0.14.1"
|
||||||
|
)
|
||||||
|
|
||||||
|
# ${NVIDIA_REPO}/container-toolkit:v1.13.5-ubuntu20.04
|
||||||
|
# ${NVIDIA_REPO}/dcgm-exporter:3.2.5-3.1.7-ubuntu20.04
|
||||||
|
# ${NVIDIA_REPO}/gpu-feature-discovery:v0.8.1
|
||||||
|
# ${NVIDIA_REPO}/driver:535.104.05-ubuntu22.04
|
||||||
|
|
||||||
|
cd "$WORKDIR/images"
for img in "${IMAGES[@]}"; do
    # Build a filesystem-safe archive name by replacing '/' and ':' with '_'.
    # Fix: quote every expansion ($img, $FILENAME) so image references can
    # never be word-split or glob-expanded.
    FILENAME=$(echo "$img" | tr '/:' '__').tar
    if [ -f "$FILENAME" ]; then
        echo "跳过已存在: $FILENAME"
    else
        echo "Pulling $img ..."
        docker pull "$img"
        echo "Saving to $FILENAME ..."
        docker save "$img" -o "$FILENAME"
        # Free disk space: drop the local docker cache once archived.
        docker rmi "$img"
    fi
done
|
||||||
|
|
||||||
|
# ================= 3. NVIDIA 驱动 =================
|
||||||
|
echo ">>> [3/6] 下载 NVIDIA H100 驱动 (.run)..."
|
||||||
|
cd $WORKDIR/drivers
|
||||||
|
DRIVER_NAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}.run"
|
||||||
|
if [ ! -f "$DRIVER_NAME" ]; then
|
||||||
|
echo "Downloading NVIDIA Driver..."
|
||||||
|
wget -q https://us.download.nvidia.com/tesla/${NVIDIA_DRIVER_VERSION}/${DRIVER_NAME}
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ================= 4. YAML Manifests =================
|
||||||
|
echo ">>> [4/6] 下载 K8s YAML 配置文件..."
|
||||||
|
cd $WORKDIR/manifests
|
||||||
|
|
||||||
|
# Calico
|
||||||
|
curl -L -o calico.yaml https://raw.githubusercontent.com/projectcalico/calico/${CALICO_VERSION}/manifests/calico.yaml
|
||||||
|
|
||||||
|
# KubeVirt
|
||||||
|
KUBEVIRT_REL="https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}"
|
||||||
|
curl -L -o kubevirt-operator.yaml ${KUBEVIRT_REL}/kubevirt-operator.yaml
|
||||||
|
curl -L -o kubevirt-cr.yaml ${KUBEVIRT_REL}/kubevirt-cr.yaml
|
||||||
|
|
||||||
|
# Multus
|
||||||
|
curl -L -o multus-daemonset.yaml https://raw.githubusercontent.com/k8snetworkplumbingwg/multus-cni/master/deployments/multus-daemonset.yml
|
||||||
|
|
||||||
|
# ================= 5. Helm Charts =================
|
||||||
|
echo ">>> [5/6] 下载 Helm Charts..."
|
||||||
|
cd $WORKDIR/charts
|
||||||
|
|
||||||
|
# 添加 repo (如果 helm 命令可用)
|
||||||
|
if command -v helm &> /dev/null; then
|
||||||
|
helm repo add nfs-subdir-external-provisioner https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner/
|
||||||
|
helm repo update
|
||||||
|
helm pull nfs-subdir-external-provisioner/nfs-subdir-external-provisioner --version 4.0.18
|
||||||
|
else
|
||||||
|
echo "Helm not installed on host, downloading chart directly via wget..."
|
||||||
|
wget -q https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner/releases/download/nfs-subdir-external-provisioner-4.0.18/nfs-subdir-external-provisioner-4.0.18.tgz
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ================= 6. 验证 =================
|
||||||
|
echo "---------------------------------------------"
|
||||||
|
echo ">>> 下载工作全部完成!正在统计文件大小..."
|
||||||
|
cd $WORKDIR
|
||||||
|
du -sh *
|
||||||
|
echo "---------------------------------------------"
|
||||||
|
echo "请检查 debs 目录是否依然有文件 (这是之前下载的)。"
|
||||||
|
echo "images 目录应该有几 GB 大小。"
|
||||||
|
echo "drivers 目录应该有 400MB+。"
|
||||||
|
cd ${MYPATH}
|
||||||
|
tar cvf - k8s-offline-bundle master-install.sh worker-install.sh | gzip > k8s-offline-bundle.tgz
|
||||||
|
|
||||||
868
deploy/master-install.sh
Normal file
868
deploy/master-install.sh
Normal file
@ -0,0 +1,868 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -eo pipefail # 脚本遇到任何错误立即退出,未捕捉的管道错误也退出
|
||||||
|
|
||||||
|
get_script_path(){
    # Return (on stdout) the absolute directory containing this script;
    # `pwd -P` resolves symlinks so callers get the physical location.
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
    echo "$SCRIPT_DIR"
}
|
||||||
|
# ==============================================================================
|
||||||
|
# 配置区域
|
||||||
|
# ==============================================================================
|
||||||
|
MYPATH=$(get_script_path)
|
||||||
|
OFFLINE_ASSETS_DIR="${MYPATH}/k8s-offline-bundle"
|
||||||
|
|
||||||
|
K8S_VERSION="v1.28.2"
|
||||||
|
CALICO_VERSION="v3.26.1"
|
||||||
|
KUBEVIRT_VERSION="v1.1.0"
|
||||||
|
MULTUS_VERSION="v4.0.2" # Multus CNI 镜像版本
|
||||||
|
NFS_PROVISIONER_VERSION="v4.0.2" # NFS Provisioner 镜像标签
|
||||||
|
NFS_CHART_VERSION="4.0.18" # Helm Chart 版本
|
||||||
|
|
||||||
|
K8S_MASTER_IP="192.168.16.5" # 控制节点的IP,用于API Server绑定和广告
|
||||||
|
LOCAL_REGISTRY_PORT="5000"
|
||||||
|
LOCAL_REGISTRY_ADDR="${K8S_MASTER_IP}:${LOCAL_REGISTRY_PORT}" # 本地镜像仓库地址
|
||||||
|
|
||||||
|
K8S_APISERVER_ADVERTISE_ADDRESS="${K8S_MASTER_IP}" # kubeadm init 使用的API Server广告地址
|
||||||
|
POD_CIDR="10.244.0.0/16"
|
||||||
|
SERVICE_CIDR="10.96.0.0/12"
|
||||||
|
|
||||||
|
NFS_SERVER="192.168.16.2"
|
||||||
|
NFS_PATH="/d/share/101206"
|
||||||
|
NFS_STORAGE_CLASS_NAME="nfs-client"
|
||||||
|
|
||||||
|
TEMP_DIR="/tmp/k8s-master-setup" # 临时工作目录
|
||||||
|
NAMESPACE="default" # 默认命名空间,用于 ctr 命令
|
||||||
|
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
||||||
|
CERTS_D_PATH="/etc/containerd/certs.d"
|
||||||
|
# /etc/containerd/config.toml文件做以下修改
|
||||||
|
# SystemdCgroup = false 在 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] 下。这个也需要改为 true。
|
||||||
|
# ==============================================================================
|
||||||
|
# 启动前日志输出
|
||||||
|
# ==============================================================================
|
||||||
|
echo "=================================================="
|
||||||
|
echo " Kubernetes 控制节点离线安装脚本 "
|
||||||
|
echo "=================================================="
|
||||||
|
echo "配置参数:"
|
||||||
|
echo " K8s 版本: ${K8S_VERSION}"
|
||||||
|
echo " 本地镜像仓库: ${LOCAL_REGISTRY_ADDR}"
|
||||||
|
echo " K8s API Server IP: ${K8S_APISERVER_ADVERTISE_ADDRESS}"
|
||||||
|
echo " Pod CIDR: ${POD_CIDR}"
|
||||||
|
echo " Service CIDR: ${SERVICE_CIDR}"
|
||||||
|
echo " NFS Server: ${NFS_SERVER}:${NFS_PATH}"
|
||||||
|
echo "--------------------------------------------------"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 通用函数
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
# Emit a green, timestamped informational line on stdout.
log_info() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    echo -e "\e[32m[INFO] ${stamp} $1\e[0m"
}

# Emit a yellow, timestamped warning line on stderr.
log_warn() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    echo -e "\e[33m[WARN] ${stamp} $1\e[0m" >&2
}

# Emit a red, timestamped error line on stderr and abort the script.
log_error() {
    local stamp
    stamp=$(date +'%Y-%m-%d %H:%M:%S')
    echo -e "\e[31m[ERROR] ${stamp} $1\e[0m" >&2
    exit 1
}

# Succeed iff the named command resolves to something executable.
command_exists() {
    command -v "$1" >/dev/null 2>&1
}
|
||||||
|
|
||||||
|
# Abort (via log_error, which exits) unless running with root privileges;
# the installation steps below write system config and manage services.
check_root() {
    if [[ $EUID -ne 0 ]]; then
        log_error "此脚本必须以 root 用户或使用 sudo 运行。"
    fi
}
|
||||||
|
|
||||||
|
# Load the kernel modules and sysctl settings Kubernetes networking needs:
# overlay (containerd snapshotter) and br_netfilter, plus bridged-traffic
# iptables visibility and IPv4 forwarding. Settings are persisted under
# /etc/modules-load.d and /etc/sysctl.d so they survive reboots.
configure_sysctl() {
    log_info "配置系统内核参数..."
    cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf > /dev/null
overlay
br_netfilter
EOF
    sudo modprobe overlay
    sudo modprobe br_netfilter

    cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf > /dev/null
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
    # Apply every sysctl.d fragment immediately without a reboot.
    sudo sysctl --system > /dev/null
    log_info "系统内核参数配置完成。"
}
|
||||||
|
|
||||||
|
# Disable swap (kubelet refuses to start with swap on by default): turn it
# off for the running system and comment out swap entries in /etc/fstab so
# the change persists across reboots.
disable_swap() {
    log_info "禁用 Swap 分区..."
    if grep -q "swap" /etc/fstab; then
        sudo swapoff -a
        # Prefix matching fstab lines with '#' instead of deleting them,
        # so the original configuration can be restored by hand.
        sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab
        log_info "Swap 分区已禁用并从 fstab 中注释。"
    else
        log_info "未检测到 Swap 分区或已禁用。"
    fi
}
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 0. 前置检查与环境初始化
|
||||||
|
# ==============================================================================
|
||||||
|
check_root
|
||||||
|
configure_sysctl
|
||||||
|
disable_swap
|
||||||
|
|
||||||
|
log_info "创建临时工作目录: ${TEMP_DIR}"
sudo mkdir -p "${TEMP_DIR}"
# Fix: the glob must sit OUTSIDE the quotes — the original
# `rm -rf "${TEMP_DIR}/*"` looked for a file literally named '*' and never
# cleaned anything. The ':?' guard aborts if TEMP_DIR is empty/unset so
# this can never expand to 'rm -rf /*'.
sudo rm -rf "${TEMP_DIR:?}"/*  # 清理旧的临时文件
|
||||||
|
|
||||||
|
log_info "将离线资源目录添加到 PATH。"
|
||||||
|
export PATH="${OFFLINE_ASSETS_DIR}/bin:$PATH"
|
||||||
|
echo "export PATH=${OFFLINE_ASSETS_DIR}/bin:\$PATH" | sudo tee /etc/profile.d/offline-k8s.sh > /dev/null
|
||||||
|
# ==============================================================================
|
||||||
|
# 1. 安装操作系统依赖 (DEB 包)
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "开始安装操作系统依赖 (DEB 包)..."
|
||||||
|
DEBS_DIR="${OFFLINE_ASSETS_DIR}/debs"
|
||||||
|
if [ ! -d "$DEBS_DIR" ]; then
|
||||||
|
log_error "DEB 包目录 ${DEBS_DIR} 不存在。请确保将所有 .deb 文件放在此目录中。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd "${DEBS_DIR}" || log_error "无法进入 DEB 包目录 ${DEBS_DIR}。"
|
||||||
|
|
||||||
|
log_info "尝试安装所有 DEB 包。这可能需要一些时间,并会尝试多次以解决依赖顺序问题。"
|
||||||
|
# 尝试多次安装,以解决部分依赖顺序问题
|
||||||
|
# for i in {1..3}; do
|
||||||
|
# log_info "第 ${i} 次尝试安装 DEB 包..."
|
||||||
|
# sudo dpkg -i *.deb &>/dev/null || true
|
||||||
|
# done
|
||||||
|
|
||||||
|
# 最终检查是否有未满足的依赖,尝试修复
|
||||||
|
log_info "检查并尝试解决任何未满足的 DEB 包依赖..."
|
||||||
|
if ! sudo apt-get install -f --assume-yes &>/dev/null; then
|
||||||
|
log_warn "部分 DEB 包依赖可能未完全满足。请手动检查并解决 (例如运行 'sudo apt-get install -f')。"
|
||||||
|
else
|
||||||
|
log_info "所有 DEB 包及其依赖已成功安装或已解决。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd - > /dev/null # 返回之前的工作目录
|
||||||
|
log_info "操作系统依赖 (DEB 包) 安装完成。"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 2. 安装 Docker (仅用于本地镜像仓库)
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "安装 Docker daemon (仅用于本地镜像仓库) ..."
|
||||||
|
if ! command_exists docker; then
|
||||||
|
log_error "未检测到 Docker CLI。请确保已安装 Docker (或其他兼容的容器引擎如Podman)。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_info "配置 Docker daemon 信任本地仓库 ${LOCAL_REGISTRY_ADDR} (针对非 HTTPS)..."
|
||||||
|
sudo mkdir -p /etc/docker
|
||||||
|
cat <<EOF | sudo tee /etc/docker/daemon.json > /dev/null
|
||||||
|
{
|
||||||
|
"insecure-registries": ["${LOCAL_REGISTRY_ADDR}"],
|
||||||
|
"exec-opts": ["native.cgroupdriver=systemd"],
|
||||||
|
"log-driver": "json-file",
|
||||||
|
"log-opts": {
|
||||||
|
"max-size": "100m"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EOF
|
||||||
|
sudo groupadd docker &>/dev/null || true # 如果组已存在,忽略错误
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable docker.socket
|
||||||
|
sudo systemctl enable docker
|
||||||
|
sudo systemctl restart docker.socket
|
||||||
|
sudo systemctl restart docker
|
||||||
|
sudo systemctl status docker --no-pager || log_error "Docker daemon 启动失败。"
|
||||||
|
log_info "Docker daemon 已配置信任本地仓库并重启。"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 3. 安装 Containerd 运行时
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "安装 Containerd 运行时..."
|
||||||
|
CONTAINERD_TAR_GZ=$(find "${OFFLINE_ASSETS_DIR}/bin" -name "containerd-*.tar.gz" | head -n 1)
|
||||||
|
if [ -z "$CONTAINERD_TAR_GZ" ]; then
|
||||||
|
log_error "未找到 Containerd 压缩包。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
sudo tar Cxzvf /usr/local "$CONTAINERD_TAR_GZ" || log_error "解压 Containerd 失败。"
|
||||||
|
|
||||||
|
# 确保 containerd systemd 服务文件存在
|
||||||
|
CONTAINERD_SERVICE_FILE="${OFFLINE_ASSETS_DIR}/service/containerd.service"
|
||||||
|
if [ ! -f "$CONTAINERD_SERVICE_FILE" ]; then
|
||||||
|
log_error "未找到 containerd.service 文件: ${CONTAINERD_SERVICE_FILE}"
|
||||||
|
fi
|
||||||
|
sudo cp "$CONTAINERD_SERVICE_FILE" /etc/systemd/system/containerd.service
|
||||||
|
sudo systemctl daemon-reload # 重新加载服务配置
|
||||||
|
|
||||||
|
log_info "生成并配置 Containerd 默认配置文件..."
|
||||||
|
sudo mkdir -p /etc/containerd
|
||||||
|
sudo containerd config default | sudo tee /etc/containerd/config.toml > /dev/null
|
||||||
|
|
||||||
|
# --- 配置 containerd registry mirrors using config_path ---
|
||||||
|
log_info "配置 containerd 镜像仓库代理..."
|
||||||
|
|
||||||
|
# 创建必要的目录
|
||||||
|
for reg in "${LOCAL_REGISTRY_ADDR}" registry.k8s.io ghcr.io quay.io docker.io nvcr.io; do
|
||||||
|
sudo mkdir -p "${CERTS_D_PATH}/${reg}"
|
||||||
|
done
|
||||||
|
|
||||||
|
# 为本地 Registry 配置 hosts.toml (http, skip_verify)
|
||||||
|
sudo tee "${CERTS_D_PATH}/${LOCAL_REGISTRY_ADDR}/hosts.toml" > /dev/null <<EOF
|
||||||
|
server = "http://${LOCAL_REGISTRY_ADDR}"
|
||||||
|
[host."http://${LOCAL_REGISTRY_ADDR}"]
|
||||||
|
capabilities = ["pull", "resolve"]
|
||||||
|
skip_verify = true
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# 为所有上游仓库配置镜像到本地,回退到官方
|
||||||
|
REGISTRY_SOURCES=(
|
||||||
|
"registry.k8s.io"
|
||||||
|
"ghcr.io"
|
||||||
|
"quay.io"
|
||||||
|
"docker.io"
|
||||||
|
"nvcr.io"
|
||||||
|
)
|
||||||
|
|
||||||
|
for source in "${REGISTRY_SOURCES[@]}"; do
|
||||||
|
sudo tee "${CERTS_D_PATH}/${source}/hosts.toml" > /dev/null <<EOF
|
||||||
|
server = "https://${source}"
|
||||||
|
[host."http://${LOCAL_REGISTRY_ADDR}"]
|
||||||
|
capabilities = ["pull", "resolve"]
|
||||||
|
skip_verify = true
|
||||||
|
[host."https://${source}"]
|
||||||
|
capabilities = ["pull", "resolve"]
|
||||||
|
EOF
|
||||||
|
done
|
||||||
|
|
||||||
|
# 修改 /etc/containerd/config.toml
|
||||||
|
log_info "修改 ${CONTAINERD_CONFIG} 配置..."
|
||||||
|
# 设置 sandbox_image
|
||||||
|
sudo sed -i "s|sandbox_image = \"registry.k8s.io/pause:3.6\"|sandbox_image = \"${LOCAL_REGISTRY_ADDR}/pause:3.9\"|g" "$CONTAINERD_CONFIG"
|
||||||
|
sudo sed -i "s|SystemdCgroup = false|SystemdCgroup = true|g" "$CONTAINERD_CONFIG" || true
|
||||||
|
# 设置 config_path
|
||||||
|
if grep -q "config_path =" "$CONTAINERD_CONFIG"; then
|
||||||
|
sudo sed -i "s|^[[:space:]]*config_path = .*| config_path = \"${CERTS_D_PATH}\"|" "$CONTAINERD_CONFIG"
|
||||||
|
else
|
||||||
|
# 在 [plugins."io.containerd.grpc.v1.cri".registry] 块中添加 config_path
|
||||||
|
if ! grep -q "\[plugins.\"io.containerd.grpc.v1.cri\".registry\]" "$CONTAINERD_CONFIG"; then
|
||||||
|
log_warn "未找到 [plugins.\"io.containerd.grpc.v1.cri\".registry] 块,将尝试追加。"
|
||||||
|
echo -e "\n[plugins.\"io.containerd.grpc.v1.cri\".registry]\n config_path = \"${CERTS_D_PATH}\"" | sudo tee -a "$CONTAINERD_CONFIG" > /dev/null
|
||||||
|
else
|
||||||
|
sudo sed -i "/\[plugins.\"io.containerd.grpc.v1.cri\".registry\]/a \\\n config_path = \"${CERTS_D_PATH}\"" "$CONTAINERD_CONFIG"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 移除旧的 mirrors 和 configs (弃用警告相关的部分)
|
||||||
|
# 使用多行 sed 表达式删除整个块
|
||||||
|
sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\."registry\.k8s\.io"\]/,/^endpoint = \[/d' "$CONTAINERD_CONFIG" || true
|
||||||
|
sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.configs\."192\.168\.16\.5:5000"\.tls\]/,/^insecure_skip_verify = /d' "$CONTAINERD_CONFIG" || true
|
||||||
|
# 确保删除所有相关的空行或残留的块头
|
||||||
|
sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.mirrors\]/d' "$CONTAINERD_CONFIG" || true
|
||||||
|
sudo sed -i '/^\[plugins\."io\.containerd\.grpc\.v1\.cri"\.registry\.configs\]/d' "$CONTAINERD_CONFIG" || true
|
||||||
|
|
||||||
|
log_info "重启 containerd 服务..."
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl restart containerd || log_error "Containerd 服务启动失败。"
|
||||||
|
sudo systemctl status containerd --no-pager || log_error "Containerd 服务状态异常。"
|
||||||
|
log_info "Containerd 配置完成并已启动。"
|
||||||
|
|
||||||
|
# 配置 crictl
|
||||||
|
log_info "配置 crictl..."
|
||||||
|
cat <<EOF | sudo tee /etc/crictl.yaml > /dev/null
|
||||||
|
runtime-endpoint: unix:///run/containerd/containerd.sock
|
||||||
|
image-endpoint: unix:///run/containerd/containerd.sock
|
||||||
|
EOF
|
||||||
|
log_info "crictl 配置完成。"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 4. 安装 CNI 插件
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "安装 CNI 插件..."
|
||||||
|
CNI_PLUGINS_TAR_GZ=$(find "${OFFLINE_ASSETS_DIR}/bin" -name "cni-plugins-*.tgz" | head -n 1)
|
||||||
|
if [ -z "$CNI_PLUGINS_TAR_GZ" ]; then
|
||||||
|
log_error "未找到 CNI 插件压缩包。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
sudo mkdir -p /opt/cni/bin
|
||||||
|
sudo tar Cxzvf /opt/cni/bin "$CNI_PLUGINS_TAR_GZ" || log_error "解压 CNI 插件失败。"
|
||||||
|
log_info "CNI 插件安装完成。"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 5. 安装 Kubernetes Binaries (kubelet, kubeadm, kubectl)
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "安装 Kubernetes Binaries..."
|
||||||
|
BIN_DIR="${OFFLINE_ASSETS_DIR}/bin"
|
||||||
|
for bin in kubelet kubeadm kubectl helm; do
|
||||||
|
if [ ! -f "${BIN_DIR}/${bin}" ]; then
|
||||||
|
log_error "Kubernetes 二进制文件 ${bin} 未找到在 ${BIN_DIR}。"
|
||||||
|
fi
|
||||||
|
sudo cp "${BIN_DIR}/${bin}" /usr/local/bin/
|
||||||
|
sudo chmod +x "/usr/local/bin/${bin}"
|
||||||
|
done
|
||||||
|
|
||||||
|
# 配置 kubelet systemd 服务 (从模板生成)
|
||||||
|
log_info "配置 kubelet systemd 服务..."
|
||||||
|
cat <<'EOF' | sudo tee /etc/systemd/system/kubelet.service
|
||||||
|
[Unit]
|
||||||
|
Description=kubelet: The Kubernetes Node Agent
|
||||||
|
Documentation=https://kubernetes.io/docs/
|
||||||
|
After=containerd.service
|
||||||
|
Wants=containerd.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStart=/usr/local/bin/kubelet
|
||||||
|
Restart=always
|
||||||
|
StartLimitInterval=0
|
||||||
|
RestartSec=10
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
sudo mkdir -p /etc/systemd/system/kubelet.service.d
|
||||||
|
cat <<'EOF' | sudo tee /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
|
||||||
|
[Service]
|
||||||
|
Environment="KUBELET_KUBECONFIG_ARGS=--bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf"
|
||||||
|
Environment="KUBELET_CONFIG_ARGS=--config=/var/lib/kubelet/config.yaml"
|
||||||
|
EnvironmentFile=-/etc/default/kubelet
|
||||||
|
ExecStart=
|
||||||
|
ExecStart=/usr/local/bin/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_EXTRA_ARGS
|
||||||
|
EOF
|
||||||
|
|
||||||
|
|
||||||
|
sudo systemctl daemon-reload
|
||||||
|
sudo systemctl enable kubelet || log_error "启用 kubelet 服务失败。"
|
||||||
|
log_info "Kubernetes Binaries 安装完成,kubelet 服务已启用但未启动。"
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 6. 启动本地镜像仓库 (仅在控制节点,192.168.16.5)
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "启动本地镜像仓库 ${LOCAL_REGISTRY_ADDR} ..."
|
||||||
|
|
||||||
|
# 加载 registry 镜像
|
||||||
|
cd "${OFFLINE_ASSETS_DIR}/images"
|
||||||
|
REGISTRY_TAR=$(find . -name "registry_2.tar" | head -n 1)
|
||||||
|
if [ -z "$REGISTRY_TAR" ]; then
|
||||||
|
log_error "未找到本地镜像仓库 registry:2 的 tar 包。"
|
||||||
|
fi
|
||||||
|
sudo docker load -i "$REGISTRY_TAR" || log_error "加载 registry:2 镜像失败。"
|
||||||
|
|
||||||
|
# 停止并删除旧的 registry 容器,确保干净启动
|
||||||
|
sudo docker stop registry &>/dev/null || true
|
||||||
|
sudo docker rm -v registry &>/dev/null || true
|
||||||
|
|
||||||
|
# 启动 registry 容器
|
||||||
|
sudo docker run -d -p "${LOCAL_REGISTRY_PORT}:5000" --restart=always --name registry registry:2 || log_error "启动本地镜像仓库容器失败。"
|
||||||
|
log_info "本地镜像仓库已在 ${LOCAL_REGISTRY_ADDR} 启动。"
|
||||||
|
cd - > /dev/null
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 7. 导入并标记所有镜像到 containerd
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "导入所有离线镜像到 containerd 仓库并标记..."
|
||||||
|
|
||||||
|
IMAGE_DIR="${OFFLINE_ASSETS_DIR}/images"
|
||||||
|
if [ ! -d "$IMAGE_DIR" ]; then
|
||||||
|
log_error "镜像文件目录 ${IMAGE_DIR} 不存在。"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 清理 containerd 本地存储中的所有镜像 (除registry:2外,避免误删)
|
||||||
|
log_info "清理 containerd 中已存在的镜像..."
|
||||||
|
# 使用 ctr images ls --quiet 获取所有镜像的 digest
|
||||||
|
# 然后过滤掉那些可能是本地 registry 相关的镜像,避免干扰
|
||||||
|
ctr_images_to_delete=$(ctr -n "$NAMESPACE" images ls --quiet | while read -r digest; do
|
||||||
|
# 检查该 digest 对应的 REF 是否包含 LOCAL_REGISTRY_ADDR 或 registry:2
|
||||||
|
# 这里有点复杂,因为一个 digest 可能有多个 REF
|
||||||
|
refs=$(ctr -n "$NAMESPACE" images ls --no-header | grep "$digest" | awk '{print $1}')
|
||||||
|
skip_delete=false
|
||||||
|
for ref in $refs; do
|
||||||
|
if [[ "$ref" == *"/registry:2"* ]]; then
|
||||||
|
log_info " 跳过删除 registry 镜像: $ref ($digest)"
|
||||||
|
skip_delete=true
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
if [ "$skip_delete" = false ]; then
|
||||||
|
echo "$digest" # 输出需要删除的 digest
|
||||||
|
fi
|
||||||
|
done)
|
||||||
|
|
||||||
|
if [ -n "$ctr_images_to_delete" ]; then
|
||||||
|
echo "$ctr_images_to_delete" | while read -r digest_to_delete; do
|
||||||
|
log_info " 删除 containerd 镜像 (digest): $digest_to_delete"
|
||||||
|
ctr -n "$NAMESPACE" images rm "$digest_to_delete" &>/dev/null || log_warn "删除镜像 $digest_to_delete 失败 (可能被使用或不存在)。"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
log_info "Containerd 镜像清理完成。"
|
||||||
|
|
||||||
|
for tarfile in "$IMAGE_DIR"/*.tar; do
|
||||||
|
[ -e "$tarfile" ] || continue
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo ">>> Processing $tarfile"
|
||||||
|
|
||||||
|
# 1️⃣ 获取导入前的镜像列表
|
||||||
|
IMAGES_BEFORE=$(mktemp)
|
||||||
|
# ctr images ls 的第一列就是 REF (镜像名称),使用 awk 提取
|
||||||
|
if ! ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_BEFORE"; then
|
||||||
|
log_info "❌ Failed to get images list before import."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Debug:
|
||||||
|
log_info "Images BEFORE import for $tarfile:"
|
||||||
|
cat "$IMAGES_BEFORE"
|
||||||
|
|
||||||
|
# 2️⃣ 导入镜像
|
||||||
|
if ! ctr -n "$NAMESPACE" images import "$tarfile"; then
|
||||||
|
log_info "❌ Failed to import image from $tarfile."
|
||||||
|
rm -f "$IMAGES_BEFORE" # 清理临时文件
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 3️⃣ 获取导入后的镜像列表
|
||||||
|
IMAGES_AFTER=$(mktemp)
|
||||||
|
if ! ctr -n "$NAMESPACE" images ls | awk 'NR>1 {print $1}' | sort > "$IMAGES_AFTER"; then
|
||||||
|
echo "❌ Failed to get images list after import."
|
||||||
|
rm -f "$IMAGES_BEFORE" # 清理临时文件
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Debug:
|
||||||
|
log_info "Images AFTER import for $tarfile:"
|
||||||
|
# cat "$IMAGES_AFTER"
|
||||||
|
# echo "Raw difference (comm -13):"
|
||||||
|
# comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER"
|
||||||
|
|
||||||
|
# 4️⃣ 找出新增的镜像 (即原始镜像)。排除掉带有本地Registry前缀的镜像本身。
|
||||||
|
# 过滤条件:排除本地 registry 已存在的镜像,以及 <none> 引用。
|
||||||
|
# 因为导入的 tarfile 可能会包含多个 tag,我们只取第一个符合条件的
|
||||||
|
# Determine the reference of the image just imported: it is the entry added
# to the image list, excluding anything already retagged for the local
# registry and dangling "<none>" references. `|| true` keeps `set -e` calm
# when the diff is empty.
ORIGIN_IMG=$(comm -13 "$IMAGES_BEFORE" "$IMAGES_AFTER" | grep -vE "${LOCAL_REGISTRY_ADDR}|<none>" | head -n1 || true)

# Fix: remove the temp files BEFORE any early exit from this iteration —
# the original leaked them when ORIGIN_IMG was empty — and drop the stray
# `log_info "JUST A TEST"` debug line plus the duplicated emptiness check.
rm -f "$IMAGES_BEFORE" "$IMAGES_AFTER"

if [[ -z "$ORIGIN_IMG" ]]; then
    echo "❌ Failed to detect original image name, skipping..."
    continue
fi
echo "Original image: $ORIGIN_IMG"
|
||||||
|
|
||||||
|
NEW_IMG=""
|
||||||
|
if [[ "$ORIGIN_IMG" == "registry.k8s.io/"* ]]; then
|
||||||
|
if [[ "$ORIGIN_IMG" == "registry.k8s.io/coredns/"* ]]; then
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/coredns/}"
|
||||||
|
else
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#registry.k8s.io/}"
|
||||||
|
fi
|
||||||
|
elif [[ "$ORIGIN_IMG" == "ghcr.io/"* ]]; then
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#ghcr.io/}"
|
||||||
|
elif [[ "$ORIGIN_IMG" == "quay.io/"* ]]; then
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#quay.io/}"
|
||||||
|
elif [[ "$ORIGIN_IMG" == "nvcr.io/"* ]]; then
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#nvcr.io/}"
|
||||||
|
elif [[ "$ORIGIN_IMG" == "docker.io/"* ]]; then
|
||||||
|
if [[ "$ORIGIN_IMG" == "docker.io/library/"* ]]; then
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/library/}"
|
||||||
|
else
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG#docker.io/}"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Warning: Unknown original registry prefix for $ORIGIN_IMG. Directly prepending LOCAL_REGISTRY_ADDR."
|
||||||
|
NEW_IMG="${LOCAL_REGISTRY_ADDR}/${ORIGIN_IMG}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Retag as: $NEW_IMG"
|
||||||
|
|
||||||
|
# 4️⃣ 打 tag
|
||||||
|
ctr -n "$NAMESPACE" images tag "$ORIGIN_IMG" "$NEW_IMG"
|
||||||
|
|
||||||
|
# 5️⃣ 推送到本地 registry
|
||||||
|
ctr -n "$NAMESPACE" images push --plain-http "$NEW_IMG"
|
||||||
|
echo "tarfile=$tarfile ORIGIN_IMG=$ORIGIN_IMG NEW_IMG=$NEW_IMG"
|
||||||
|
|
||||||
|
echo "✅ Done: $NEW_IMG"
|
||||||
|
done
|
||||||
|
|
||||||
|
log_info "所有镜像已导入 containerd 仓库并正确标记。"
|
||||||
|
log_info "当前 containerd 镜像列表 (前 20 条):"
|
||||||
|
ctr -n "$NAMESPACE" images ls | head -n 20 || true # 打印最终镜像列表以供检查
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# 8. 初始化 Kubernetes 控制平面
|
||||||
|
# ==============================================================================
|
||||||
|
log_info "初始化 Kubernetes 控制平面..."
|
||||||
|
|
||||||
|
# 确保 /etc/kubernetes 目录干净,防止 kubeadm init 失败
|
||||||
|
log_info "清理 /etc/kubernetes 目录..."
|
||||||
|
# ------------------------------------------------------------------------------
# 8. Reset prior kubeadm state, render kubeadm-config.yaml, run `kubeadm init`,
#    and configure kubectl for the current user.
# BUGFIX: the original copied admin.conf into ~/.kube twice (once here and once
# in a later "设置环境变量" section); the duplicate has been removed.
# NOTE(review): heredoc YAML indentation reconstructed to standard 2-space
# nesting — the source rendering had stripped it; confirm against a working
# kubeadm-config.yaml.
# ------------------------------------------------------------------------------
sudo kubeadm reset --force &>/dev/null || true  # force-reset any previous kubeadm state
sudo rm -rf /etc/kubernetes/* || log_warn "清理 /etc/kubernetes 目录失败,可能存在权限问题或文件被占用。"
sudo rm -rf "$HOME/.kube"  # drop stale user kubeconfig
log_info "已清理 /etc/kubernetes 目录和用户 .kube 配置。"

# Generate the kubeadm configuration.
log_info "生成 kubeadm-config.yaml 配置..."
cat <<EOF | sudo tee ${TEMP_DIR}/kubeadm-config.yaml > /dev/null
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: "${K8S_APISERVER_ADVERTISE_ADDRESS}"
  bindPort: 6443
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: ${K8S_VERSION}
imageRepository: ${LOCAL_REGISTRY_ADDR}
networking:
  podSubnet: ${POD_CIDR}
  serviceSubnet: ${SERVICE_CIDR}
---
apiVersion: kubelet.config.k8s.io/v1beta1
kind: KubeletConfiguration
cgroupDriver: systemd
EOF

log_info "kubeadm-config.yaml 已生成,内容如下:"
cat ${TEMP_DIR}/kubeadm-config.yaml

# Run kubeadm init.
#   --upload-certs: upload certs so additional control-plane nodes can join
#   --ignore-preflight-errors=all: tolerated for offline installs; in
#     production each preflight failure should be investigated individually.
log_info "运行 kubeadm init 命令..."
sudo kubeadm init --config=${TEMP_DIR}/kubeadm-config.yaml --upload-certs --ignore-preflight-errors=all

if [ $? -ne 0 ]; then
    log_error "kubeadm init 失败。"
fi

log_info "Kubernetes 控制平面初始化完成。"

# Configure kubectl for the current user (single copy of admin.conf).
log_info "配置 kubectl 访问集群..."
mkdir -p "$HOME/.kube"
sudo cp /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown $(id -u):$(id -g) "$HOME/.kube/config"
export KUBECONFIG=$HOME/.kube/config  # make kubectl usable in this session

log_info "kubectl 配置完成。"

log_info "等待 Kubernetes 控制平面 Pod 启动 (最多 5 分钟)..."
sleep 1
kubectl wait --for=condition=ready pod -l component=kube-apiserver -n kube-system --timeout=300s || log_error "kube-apiserver Pod 未能在指定时间内启动。"
kubectl wait --for=condition=ready pod -l component=kube-controller-manager -n kube-system --timeout=300s || log_error "kube-controller-manager Pod 未能在指定时间内启动。"
kubectl wait --for=condition=ready pod -l component=kube-scheduler -n kube-system --timeout=300s || log_error "kube-scheduler Pod 未能在指定时间内启动。"

log_info "核心控制平面组件已就绪。"
log_info "查看集群节点状态:"
kubectl get nodes
||||||
|
# ==============================================================================
# 9. Install the CNI network plugin (Calico)
# ==============================================================================
log_info "安装 CNI 网络插件 (Calico)..."

CALICO_MANIFEST_ORIG="${OFFLINE_ASSETS_DIR}/manifests/calico.yaml"
if [ ! -f "$CALICO_MANIFEST_ORIG" ]; then
    log_error "Calico 原始 manifest 文件 ${CALICO_MANIFEST_ORIG} 不存在。"
fi
CALICO_MANIFEST_TEMP="${TEMP_DIR}/calico.yaml"
cp "${CALICO_MANIFEST_ORIG}" "${CALICO_MANIFEST_TEMP}" || log_error "复制 Calico manifest 文件失败。"

# Rewrite Calico image references to the local registry. Calico images live
# under docker.io, so the rewrite pattern differs from k8s.io images.
log_info "替换 Calico 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..."
sudo sed -i "s|docker.io/calico/cni:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/cni:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"
sudo sed -i "s|docker.io/calico/node:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/node:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"
sudo sed -i "s|docker.io/calico/kube-controllers:${CALICO_VERSION}|${LOCAL_REGISTRY_ADDR}/calico/kube-controllers:${CALICO_VERSION}|g" "${CALICO_MANIFEST_TEMP}"

# Set the Pod CIDR: uncomment CALICO_IPV4POOL_CIDR and replace its value.
log_info "配置 Calico Pod CIDR: ${POD_CIDR} ..."
sudo sed -i "s|# - name: CALICO_IPV4POOL_CIDR|- name: CALICO_IPV4POOL_CIDR|g" "${CALICO_MANIFEST_TEMP}"
sudo sed -i "s|# value: \"192.168.0.0/16\"| value: \"${POD_CIDR}\"|g" "${CALICO_MANIFEST_TEMP}"

# Append an IPPool resource unless the manifest already defines one.
if ! grep -q "kind: IPPool" "${CALICO_MANIFEST_TEMP}"; then
    log_info "在 Calico manifest 中添加 IPPool 资源定义..."
    echo -e "\n---\napiVersion: crd.projectcalico.org/v1\nkind: IPPool\nmetadata:\n name: default-pool-ipv4\nspec:\n cidr: ${POD_CIDR}\n natOutgoing: true\n disabled: false\n ipipMode: Always" | sudo tee -a "${CALICO_MANIFEST_TEMP}" > /dev/null
else
    log_info "Calico IPPool 定义已存在,跳过添加。"
fi

log_info "应用 Calico manifest 文件..., 内容如下:"
cat ${CALICO_MANIFEST_TEMP}
kubectl apply -f "${CALICO_MANIFEST_TEMP}" || log_error "应用 Calico manifest 失败。"
log_info "Calico 网络插件安装完成。"

log_info "等待 Calico Pod 启动 (最多 20 分钟)..."
sleep 10
# BUGFIX: the timeout was 1900s while the log message promises 20 minutes;
# align the timeout with the message (1200s).
kubectl wait --for=condition=ready pod -l k8s-app=calico-node -n kube-system --timeout=1200s || log_error "Calico Node Pod 未能在指定时间内启动。"
log_info "Calico Pods 已就绪。"
|
||||||
|
#==============================================================================
# 10. Install Multus CNI (secondary NICs for KubeVirt virtual machines)
#==============================================================================
log_info "安装 Multus CNI 插件..."
MULTUS_MANIFEST_ORIG="${OFFLINE_ASSETS_DIR}/manifests/multus-daemonset.yaml"
MULTUS_MANIFEST_TEMP="${TEMP_DIR}/multus-daemonset.yaml"

[ -f "$MULTUS_MANIFEST_ORIG" ] || log_error "Multus 原始 manifest 文件 ${MULTUS_MANIFEST_ORIG} 不存在。"
cp "${MULTUS_MANIFEST_ORIG}" "${MULTUS_MANIFEST_TEMP}" || log_error "复制 Multus manifest 文件失败。"

# Multus images are published under ghcr.io/k8snetworkplumbingwg/ (and mirrored
# on docker.io); point both variants at the local registry.
log_info "替换 Multus CNI 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..."
for upstream in ghcr.io docker.io; do
    sudo sed -i "s|${upstream}/k8snetworkplumbingwg/multus-cni:snapshot|${LOCAL_REGISTRY_ADDR}/k8snetworkplumbingwg/multus-cni:${MULTUS_VERSION}|g" "${MULTUS_MANIFEST_TEMP}"
done

log_info "应用 Multus CNI manifest 文件..."
kubectl apply -f "${MULTUS_MANIFEST_TEMP}" || log_error "应用 Multus CNI manifest 失败。"
log_info "Multus CNI 插件安装完成。"

log_info "等待 Multus Pod 启动 (最多 5 分钟)..."
sleep 1
kubectl wait --for=condition=ready pod -l app=multus -n kube-system --timeout=300s || log_error "Multus Pod 未能在指定时间内启动。"
log_info "Multus Pods 已就绪。"
||||||
|
# ==============================================================================
# 11. Install KubeVirt (virtual machine management)
# ==============================================================================
log_info "安装 KubeVirt..."

KUBEVIRT_OPERATOR_ORIG="${OFFLINE_ASSETS_DIR}/manifests/kubevirt-operator.yaml"

if [ ! -f "$KUBEVIRT_OPERATOR_ORIG" ]; then
    log_error "KubeVirt Operator 文件 ${KUBEVIRT_OPERATOR_ORIG} 不存在。"
fi

KUBEVIRT_OPERATOR_TEMP="${TEMP_DIR}/kubevirt-operator.yaml"
cp "${KUBEVIRT_OPERATOR_ORIG}" "${KUBEVIRT_OPERATOR_TEMP}" || log_error "复制 KubeVirt Operator 文件失败。"

# KubeVirt images are hosted on quay.io/kubevirt. Only virt-operator appears
# in the manifest directly; the operator deploys the remaining components
# (virt-controller/handler/launcher/api etc.) itself, so their overrides are
# intentionally not rewritten here.
log_info "替换 KubeVirt Operator 镜像地址为本地仓库: ${LOCAL_REGISTRY_ADDR} ..."
sudo sed -i "s|quay.io/kubevirt/virt-operator:${KUBEVIRT_VERSION}|${LOCAL_REGISTRY_ADDR}/kubevirt/virt-operator:${KUBEVIRT_VERSION}|g" "${KUBEVIRT_OPERATOR_TEMP}"

# Inject a control-plane toleration into the Deployment pod template so the
# operator can schedule onto tainted control-plane nodes.
# BUGFIX: the original awk set `inserted=1` but never tested it, so the
# toleration block was appended after EVERY `tolerations:` key; guard the
# match with `!inserted` so it fires only once.
# NOTE(review): the indentation widths in the patterns below assume standard
# 2-space manifest formatting (template: at 2, spec: at 4, tolerations: at 6);
# the source rendering had stripped them — confirm against the actual operator
# YAML before use.
awk '
/^kind: Deployment/ {inDeployment=1}
inDeployment && /^  template:/ {inTemplate=1}
inTemplate && /^    spec:/ {inSpec=1}
inSpec && !inserted && /^      tolerations:/ {
    print
    indent = match($0,/[^ ]/) - 1
    spaces = "                                        "
    printf("%s- key: \"node-role.kubernetes.io/control-plane\"\n", substr(spaces, 1, indent))
    printf("%s  operator: \"Exists\"\n", substr(spaces, 1, indent))
    printf("%s  effect: \"NoSchedule\"\n", substr(spaces, 1, indent))
    inserted=1
    next
}
{print}
' "${KUBEVIRT_OPERATOR_TEMP}" > ${TEMP_DIR}/kubevirt-operator-mod.yaml

cp ${TEMP_DIR}/kubevirt-operator-mod.yaml ${KUBEVIRT_OPERATOR_TEMP}
log_info "应用 KubeVirt Operator manifest 文件..."
kubectl apply -f "${KUBEVIRT_OPERATOR_TEMP}" || log_error "应用 KubeVirt Operator 失败。"
log_info "KubeVirt Operator 应用完成。"

log_info "等待 KubeVirt Operator 启动 (最多 15 分钟)..."
sleep 1
kubectl wait --for=condition=ready pod -l kubevirt.io=virt-operator -n kubevirt --timeout=900s || log_error "KubeVirt Operator Pod 未能在指定时间内启动。"
log_info "KubeVirt Operator Pods 已就绪。"
|
||||||
|
# ==============================================================================
# 12. Install the NFS Client Provisioner (dynamic PV/PVC provisioning)
# ==============================================================================
log_info "安装 NFS Client Provisioner..."

# 12.1 Load the Helm chart from the offline bundle (no online repo access).
log_info "加载 NFS Client Provisioner Helm Chart..."
NFS_CHART_TGZ="${OFFLINE_ASSETS_DIR}/charts/nfs-subdir-external-provisioner-${NFS_CHART_VERSION}.tgz"
if [ ! -f "$NFS_CHART_TGZ" ]; then
    log_error "NFS Client Provisioner Helm Chart 文件 ${NFS_CHART_TGZ} 不存在。"
fi

log_info "解压 Helm Chart 到临时目录..."
sudo mkdir -p "${TEMP_DIR}/nfs-client-provisioner"
sudo tar -xzf "$NFS_CHART_TGZ" -C "${TEMP_DIR}/nfs-client-provisioner" || log_error "解压 NFS Chart 失败。"
NFS_CHART_PATH="${TEMP_DIR}/nfs-client-provisioner/nfs-subdir-external-provisioner"  # extracted chart dir

# 12.2 Render values.yaml for the provisioner.
# NOTE(review): the YAML nesting below was reconstructed — the source rendering
# had flattened all indentation; confirm against the chart's values schema.
log_info "创建 NFS Client Provisioner 的 values.yaml..."
cat <<EOF | sudo tee "${TEMP_DIR}/nfs-provisioner-values.yaml" > /dev/null
replicaCount: 1

strategy:
  type: Recreate

image:
  repository: ${LOCAL_REGISTRY_ADDR}/sig-storage/nfs-subdir-external-provisioner
  tag: ${NFS_PROVISIONER_VERSION}
  pullPolicy: IfNotPresent

nfs:
  server: ${NFS_SERVER}
  path: ${NFS_PATH}

storageClass:
  create: true
  name: ${NFS_STORAGE_CLASS_NAME}
  defaultClass: true
  provisionerName: ${NFS_STORAGE_CLASS_NAME}
  reclaimPolicy: Delete
  archiveOnDelete: true

# Allow the Pod to schedule onto control-plane nodes.
tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"

# Pin to control-plane nodes (recommended for single-node clusters).
# Control-plane nodes carry the label node-role.kubernetes.io/control-plane="".
# Leave this empty to let the scheduler pick any node.
nodeSelector:
  node-role.kubernetes.io/control-plane: ""
EOF

log_info "NFS Client Provisioner values.yaml 已生成,内容如下:"
cat "${TEMP_DIR}/nfs-provisioner-values.yaml"

# 12.3 Deploy via Helm: upgrade in place when already installed.
log_info "使用 Helm 部署 NFS Client Provisioner..."

if helm status nfs-client-provisioner -n kube-system &>/dev/null; then
    log_info "NFS Client Provisioner 已存在,进行升级..."
    helm upgrade nfs-client-provisioner "${NFS_CHART_PATH}" \
        --install \
        --namespace kube-system \
        --values "${TEMP_DIR}/nfs-provisioner-values.yaml" \
        --version "${NFS_CHART_VERSION}" || log_error "升级 NFS Client Provisioner 失败。"
else
    log_info "NFS Client Provisioner 未安装,进行安装..."
    helm install nfs-client-provisioner "${NFS_CHART_PATH}" \
        --namespace kube-system \
        --values "${TEMP_DIR}/nfs-provisioner-values.yaml" \
        --version "${NFS_CHART_VERSION}" || log_error "安装 NFS Client Provisioner 失败。"
fi

log_info "NFS Client Provisioner Helm Chart 应用完成。"

log_info "等待 NFS Client Provisioner Pod 启动 (最多 5 分钟)..."
sleep 1
kubectl wait --for=condition=ready pod -l app=nfs-subdir-external-provisioner -n kube-system --timeout=300s || log_error "NFS Client Provisioner Pod 未能在指定时间内启动。"
log_info "NFS Client Provisioner Pods 已就绪。"

log_info "设置默认 StorageClass 为 ${NFS_STORAGE_CLASS_NAME}..."
# Un-mark any existing default StorageClass first (failures ignored: there may
# be none yet).
kubectl patch storageclass $(kubectl get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}') -p '{"metadata":{"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' &>/dev/null || true
# Mark the NFS StorageClass as the cluster default.
kubectl patch storageclass "${NFS_STORAGE_CLASS_NAME}" -p '{"metadata":{"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' || log_error "设置 ${NFS_STORAGE_CLASS_NAME} 为默认 StorageClass 失败。"
log_info "${NFS_STORAGE_CLASS_NAME} 已设置为默认 StorageClass。"
||||||
|
# ==============================================================================
# 13. Extra KubeVirt configuration (example NetworkAttachmentDefinition)
# ==============================================================================
log_info "应用 KubeVirt 额外配置 (示例 NetworkAttachmentDefinition)..."

# Example VLAN bridge attachment. The `whereabouts` IPAM plugin must be
# installed separately; if it is absent, switch `ipam.type` to host-local or
# another available IPAM plugin.
# NOTE(review): YAML/JSON nesting reconstructed — the source rendering had
# stripped indentation; confirm against a known-good NAD manifest.
cat <<EOF | kubectl apply -f -
apiVersion: k8s.cni.cncf.io/v1
kind: NetworkAttachmentDefinition
metadata:
  name: example-vlan-net
  namespace: default
spec:
  config: '{
    "cniVersion": "0.3.1",
    "type": "bridge",
    "bridge": "br1",
    "vlan": 100,
    "ipam": {
      "type": "whereabouts",
      "range": "192.168.100.0/24"
    }
  }'
EOF
log_info "示例 NetworkAttachmentDefinition 'example-vlan-net' 已应用 (如果 whereabouts 未安装,此配置可能不会完全生效)。"

# ==============================================================================
# 14. Verify cluster state and installation results
# ==============================================================================
log_info "--------------------------------------------------"
log_info "所有安装步骤完成,开始最终验证..."
log_info "--------------------------------------------------"

log_info "验证所有命名空间下的 Pod 状态..."
kubectl get pods --all-namespaces

log_info "等待所有 Pods 达到 Ready 状态 (最多 10 分钟)..."
# May take a while when many Pods are present.
sleep 1
kubectl wait --for=condition=ready --all pods --all-namespaces --timeout=600s || log_warn "并非所有 Pods 都达到 Ready 状态,请手动检查。"

log_info "验证集群节点状态..."
kubectl get nodes

log_info "验证 StorageClass 状态..."
kubectl get sc

log_info "验证 KubeVirt 状态..."
kubectl get kubevirts -n kubevirt

log_info "KubeVirt 预期输出示例: STATUS 为 'Deployed'"
virtctl version || log_warn "virtctl 命令可能未安装或不在 PATH 中。"

# ==============================================================================
# 15. Print the worker join command
# ==============================================================================
log_info "--------------------------------------------------"
log_info "Kubernetes 控制平面离线安装完成!"
log_info "使用以下命令将工作节点加入集群:"
log_info "--------------------------------------------------"
sudo kubeadm token create --print-join-command
echo ""
log_info "请注意保存上述命令,因为令牌有过期时间。"

log_info "清理临时目录: ${TEMP_DIR}"
sudo rm -rf "${TEMP_DIR}"

log_info "脚本执行完毕。"
69
deploy/master_remove_k8s.sh
Normal file
69
deploy/master_remove_k8s.sh
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
#!/usr/bin/env bash
# master_remove_k8s.sh — tear down Kubernetes (and the local Docker registry)
# on a master node. Destructive and irreversible; never run on production.
set -e

echo "=== Starting Kubernetes Master Node Cleanup ==="
echo "WARNING: This script will irrevocably delete all Kubernetes data, configuration,"
echo "         AND YOUR LOCAL DOCKER REGISTRY'S DATA from this node."
echo "         DO NOT RUN ON A PRODUCTION CLUSTER!"
echo ""
echo "--- Step 0: Stopping and Cleaning Local Docker Registry ---"
# Assumes the registry container is named 'registry' (port 5000); adjust the
# name below if your setup differs.
LOCAL_REGISTRY_CONTAINER_NAME="registry"

# Stop and remove the registry container.
sudo docker stop "$LOCAL_REGISTRY_CONTAINER_NAME" || { echo "❌ Local Registry container stop failed or not found, but continuing..."; }
# -v removes anonymous volumes attached to the container. Named volumes (e.g.
# my-registry-data:/var/lib/registry) must be removed manually:
#   sudo docker volume rm my-registry-data
sudo docker rm -v "$LOCAL_REGISTRY_CONTAINER_NAME" || { echo "❌ Local Registry container removal failed or not found, but continuing..."; }

echo " Local Docker Registry container stopped and removed."

echo "--- Step 1: Resetting kubeadm ---"
sudo kubeadm reset -f || { echo "❌ kubeadm reset failed or encountered errors, but continuing..."; }

echo "--- Step 2: Stopping and cleaning containerd (CRI Runtime) ---"
sudo systemctl stop containerd || { echo "❌ containerd service stop failed, but continuing..."; }

sudo ctr -n k8s.io containers ls -q | xargs -r sudo ctr -n k8s.io containers rm || true
sudo ctr -n default containers ls -q | xargs -r sudo ctr -n default containers rm || true

# Wipe containerd image data and configuration (strongly recommended).
sudo rm -rf /var/lib/containerd/* || { echo "❌ Failed to clean /var/lib/containerd, but continuing..."; }
sudo rm -rf /etc/containerd/* || { echo "❌ Failed to clean /etc/containerd, but continuing..."; }

echo "--- Step 3: Cleaning CNI network configurations ---"
sudo rm -rf /etc/cni/net.d/* || { echo "❌ Failed to clean /etc/cni/net.d, but continuing..."; }
sudo rm -rf /var/lib/cni/* || { echo "❌ Failed to clean /var/lib/cni, but continuing..."; }

echo "--- Step 4: Cleaning kubelet related files ---"
sudo rm -rf /var/lib/kubelet/* || { echo "❌ Failed to clean /var/lib/kubelet, but continuing..."; }
sudo rm -rf /var/run/kubernetes/* || { echo "❌ Failed to clean /var/run/kubernetes, but continuing..."; }

echo "--- Step 5: Removing Kubernetes configuration files ---"
sudo rm -rf ~/.kube || { echo "❌ Failed to clean ~/.kube, but continuing..."; }
sudo rm -rf /etc/kubernetes/* || { echo "❌ Failed to clean /etc/kubernetes, but continuing..."; }

echo "--- Step 6: Cleaning up iptables and IPVS rules ---"
# BUGFIX: these ran unguarded under `set -e`, so a single iptables failure
# (e.g. missing kernel module or permissions) aborted the cleanup mid-way —
# inconsistent with every other step's "continue on error" policy.
sudo iptables -F || true
sudo iptables -t nat -F || true
sudo iptables -t raw -F || true
sudo iptables -t mangle -F || true
sudo iptables -X || true

if command -v ipvsadm &> /dev/null; then
    sudo ipvsadm --clear || { echo "❌ Failed to clear ipvsadm rules, but continuing..."; }
fi

# Remove K8s/CNI-related Docker networks (safe on K8s nodes, but review before
# running elsewhere).
# BUGFIX: `docker network ls -q` prints only network IDs, so grepping for
# 'k8s|cni' could never match a network NAME; list ID+name and filter on name.
sudo docker network ls --format '{{.ID}} {{.Name}}' | awk '/k8s|cni/ {print $1}' | xargs -r sudo docker network rm || true

echo ""
echo "=== Kubernetes Master Node Cleanup COMPLETED ==="
echo "It is HIGHLY RECOMMENDED to reboot this node now to ensure a completely clean state."
echo "You can do this by running: sudo reboot"
echo ""

sudo systemctl daemon-reload
||||||
9
deploy/tst.sh
Executable file
9
deploy/tst.sh
Executable file
@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env bash

# Resolve and print the directory containing this script, following symlinks.
get_script_path() {
  local dir
  dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
  printf '%s\n' "$dir"
}

get_script_path
||||||
10
script/ctrl_init.sh
Normal file
10
script/ctrl_init.sh
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# Initialize the Kubernetes control plane.
sudo kubeadm init --kubernetes-version=v1.29.0 --pod-network-cidr=10.244.0.0/16

# Save the kubeconfig for the current user.
mkdir -p "$HOME/.kube"
sudo cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
sudo chown "$(id -u):$(id -g)" "$HOME/.kube/config"

# Deploy KubeVirt from the offline manifests.
kubectl apply -f /opt/offline/kubevirt/kubevirt-operator.yaml
kubectl apply -f /opt/offline/kubevirt/kubevirt-cr.yaml

# Print the worker-node join command.
kubeadm token create --print-join-command
||||||
59
script/download_pkgs.sh
Executable file
59
script/download_pkgs.sh
Executable file
@ -0,0 +1,59 @@
|
|||||||
|
#!/bin/bash
# download_pkgs.sh — fetch all offline packages into /opt/offline on a machine
# with internet access.
set -e
mkdir -p /opt/offline/{k8s,containerd,kubevirt,nvidia,dependencies}

# -------------------------------
# 1. Ubuntu 22.04 system dependencies
# -------------------------------
sudo apt update
DEBS="curl conntrack socat ipvsadm iptables bridge-utils ethtool git wget tar"
mkdir -p /opt/offline/dependencies
# BUGFIX: `apt download` writes .deb files into the CURRENT directory, and the
# old `mv *.deb` swept up any unrelated .deb lying there; download inside the
# destination directory instead (subshell keeps the caller's cwd intact).
(
  cd /opt/offline/dependencies
  for pkg in $DEBS; do
    apt download "$pkg"
  done
)

# -------------------------------
# 2. Kubernetes components
# -------------------------------
K8S_VERSION="1.29.0"
mkdir -p /opt/offline/k8s
cd /opt/offline/k8s
curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubeadm
curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubelet
curl -LO https://dl.k8s.io/release/v${K8S_VERSION}/bin/linux/amd64/kubectl
chmod +x kubeadm kubelet kubectl

# -------------------------------
# 3. Containerd
# -------------------------------
# NOTE(review): containerd has no 1.9.x release line (stable lines are 1.6/1.7
# and 2.x) — verify this version; the download will 404 as written.
CONTAINERD_VERSION="1.9.12"
cd /opt/offline/containerd
wget https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-amd64.tar.gz

# -------------------------------
# 4. NVIDIA Container Toolkit & Drivers
# -------------------------------
mkdir -p /opt/offline/nvidia
# Pick the driver matching your GPU model.
# Example: NVIDIA-Linux-x86_64-525.85.12.run
wget -O /opt/offline/nvidia/NVIDIA-DRIVER.run http://us.download.nvidia.com/XFree86/Linux-x86_64/525.85.12/NVIDIA-Linux-x86_64-525.85.12.run
# NVIDIA container toolkit + runtime packages.
wget -O /opt/offline/nvidia/nvidia-container-toolkit.deb https://github.com/NVIDIA/nvidia-docker/releases/download/v2.13.0/nvidia-container-toolkit_2.13.0-1_all.deb
wget -O /opt/offline/nvidia/nvidia-container-runtime.deb https://github.com/NVIDIA/nvidia-docker/releases/download/v2.13.0/nvidia-container-runtime_2.13.0-1_amd64.deb

# -------------------------------
# 5. KubeVirt Operator + CR
# -------------------------------
# NOTE(review): KubeVirt has no v1.28.0 release (tags run ~v1.0–v1.x) —
# confirm the tag; these URLs will 404 as written.
mkdir -p /opt/offline/kubevirt
curl -L https://github.com/kubevirt/kubevirt/releases/download/v1.28.0/kubevirt-operator.yaml -o /opt/offline/kubevirt/kubevirt-operator.yaml
curl -L https://github.com/kubevirt/kubevirt/releases/download/v1.28.0/kubevirt-cr.yaml -o /opt/offline/kubevirt/kubevirt-cr.yaml

# -------------------------------
# 6. GPU Operator
# -------------------------------
mkdir -p /opt/offline/nvidia/gpu-operator
curl -L https://github.com/NVIDIA/gpu-operator/archive/refs/heads/main.tar.gz -o /opt/offline/nvidia/gpu-operator/gpu-operator.tar.gz

echo "Offline package download completed. All packages are in /opt/offline/"
49
script/install_offline.sh
Normal file
49
script/install_offline.sh
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
#!/bin/bash
# install_offline.sh — install containerd + Kubernetes from the offline bundle.
# GPU node:                 sudo bash install_offline.sh gpu
# Control / worker node:    sudo bash install_offline.sh
set -e

OFFLINE_DIR="/opt/offline"

# -------------------------------
# 1. Install dependencies
# -------------------------------
dpkg -i $OFFLINE_DIR/dependencies/*.deb || apt-get -f install -y

# -------------------------------
# 2. Install containerd
# -------------------------------
tar -C /usr/local -xzf $OFFLINE_DIR/containerd/containerd-*.tar.gz
# BUGFIX: plain `ln -s` fails under `set -e` when the link already exists,
# aborting re-runs of this script; use -sf to replace idempotently.
ln -sf /usr/local/bin/containerd /usr/bin/containerd
ln -sf /usr/local/bin/containerd-shim /usr/bin/containerd-shim
ln -sf /usr/local/bin/ctr /usr/bin/ctr
containerd --version

# -------------------------------
# 3. Install Kubernetes binaries
# -------------------------------
cp $OFFLINE_DIR/k8s/kubeadm /usr/bin/
cp $OFFLINE_DIR/k8s/kubelet /usr/bin/
cp $OFFLINE_DIR/k8s/kubectl /usr/bin/
chmod +x /usr/bin/kubeadm /usr/bin/kubelet /usr/bin/kubectl

# -------------------------------
# 4. GPU nodes: NVIDIA driver and container runtime
# -------------------------------
if [ "$1" == "gpu" ]; then
    chmod +x $OFFLINE_DIR/nvidia/NVIDIA-DRIVER.run
    $OFFLINE_DIR/nvidia/NVIDIA-DRIVER.run --silent
    dpkg -i $OFFLINE_DIR/nvidia/nvidia-container-toolkit.deb
    dpkg -i $OFFLINE_DIR/nvidia/nvidia-container-runtime.deb
fi

# -------------------------------
# 5. Start containerd & kubelet
# -------------------------------
systemctl enable containerd --now
systemctl enable kubelet --now

echo "Offline install completed on $(hostname)"
89
script/k8s+kebuvirt/ctrl_install.sh
Normal file
89
script/k8s+kebuvirt/ctrl_install.sh
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
#!/bin/bash
# control-plane-node-install.sh
# Run on the primary control-plane node (assumed IP: 192.168.10.10).

set -e

OFFLINE_DIR=/opt/offline
K8S_VERSION=v1.29.6
CONTROL_PLANE_IP=192.168.10.10
API_SERVER_NAME=k8s-api.internal

echo "=== 解压离线包 ==="
tar -xzf ${OFFLINE_DIR}/k8s-offline-all.tar.gz -C /tmp/

# Install base OS dependencies.
dpkg -i ${OFFLINE_DIR}/debs/*.deb || apt-get -f install -y

echo "=== 安装 containerd ==="
mkdir -p /usr/local/bin
tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz

# systemd unit for containerd.
cat > /etc/systemd/system/containerd.service << 'EOF'
[Unit]
Description=containerd daemon
After=network.target

[Service]
ExecStartPre=/sbin/modprobe overlay
ExecStart=/usr/local/bin/containerd
Restart=always
Type=notify
Delegate=yes
KillMode=process

[Install]
WantedBy=multi-user.target
EOF

systemctl enable containerd
systemctl start containerd

# Install CNI plugins.
mkdir -p /opt/cni/bin
tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/

# Install the Kubernetes binaries.
cp ${OFFLINE_DIR}/k8s-binaries/* /usr/bin/
chmod +x /usr/bin/kubeadm /usr/bin/kubelet /usr/bin/kubectl

# systemd unit for kubelet.
# BUGFIX: dropped `--container-runtime=remote` — the flag was removed from
# kubelet in 1.27+, so kubelet v1.29.6 refuses to start with it — and the
# `VolumeMountPropagation=` line, which is not a systemd directive (the real
# option for mount propagation is `MountFlags=`, not needed here).
cat > /etc/systemd/system/kubelet.service << 'EOF'
[Unit]
Description=kubelet
After=containerd.service
Requires=containerd.service

[Service]
ExecStart=/usr/bin/kubelet
Restart=always
StartLimitInterval=0
Environment="KUBELET_EXTRA_ARGS=--runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"

[Install]
WantedBy=multi-user.target
EOF

systemctl enable kubelet

echo "=== 初始化集群 ==="
kubeadm init \
  --pod-network-cidr=10.244.0.0/16 \
  --apiserver-advertise-address=${CONTROL_PLANE_IP} \
  --kubernetes-version=${K8S_VERSION} \
  --ignore-preflight-errors=all

mkdir -p $HOME/.kube
cp /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config

echo "=== 安装 Flannel CNI ==="
# NOTE(review): this fetches from the internet, contradicting the offline
# premise of the script — consider shipping kube-flannel.yml in the bundle.
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/master/Documentation/kube-flannel.yml

# Keep workloads off the control-plane node (optional).
# BUGFIX: added --overwrite — kubeadm already applies this taint, so a plain
# `kubectl taint` fails with "already has taint" and aborts via `set -e`.
kubectl taint nodes $(hostname) node-role.kubernetes.io/control-plane:NoSchedule --overwrite

echo "✅ 控制节点安装完成"
echo "请将 ~/.kube/config 复制到其他节点或管理机"
33
script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh
Normal file
33
script/k8s+kebuvirt/deploy-kubevirt-and-gpu.sh
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# deploy-kubevirt-and-gpu.sh
# Run on the control-plane node.

# Load the pre-pulled images.
docker load -i /tmp/images/gpu-operator-images.tar
docker load -i /tmp/images/kubevirt-images.tar

# Install Helm.
tar -xzf /tmp/helm/helm.tar.gz -C /tmp/
cp /tmp/linux-amd64/helm /usr/local/bin/helm

# Install the GPU operator (offline: no `helm repo add` needed).
# driver.enabled=false because the driver is installed by hand.
# BUGFIX: the original put an inline comment after a trailing backslash
# (`--set driver.enabled=false \ # 已手动安装驱动`), which breaks the line
# continuation and splits the helm command in two; comments moved above.
helm install gpu-operator nvidia/gpu-operator \
  --version=v24.9.0 \
  --set driver.enabled=false \
  --set toolkit.enabled=true \
  --set devicePlugin.enabled=true \
  --set dcgmExporter.enabled=true \
  --set migManager.enabled=true \
  --set operator.defaultRuntime=containerd

# Watch until the GPU stack is ready (interactive: press Ctrl-C to continue).
watch kubectl get pods -n gpu-operator-resources

# Install KubeVirt.
kubectl create namespace kubevirt
kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/v1.1.0/kubevirt-operator.yaml
kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/v1.1.0/kubevirt-cr.yaml

# Install CDI (VM image import support).
helm install cdi kubevirt/cdi --namespace kubevirt --version=v1.50.0

# Optional: NFS dynamic provisioning — point the manifest at your 100T NFS.
kubectl apply -f nfs-client-provisioner.yaml
|
||||||
111
script/k8s+kebuvirt/dl-pkgs.sh
Normal file
111
script/k8s+kebuvirt/dl-pkgs.sh
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# offline-download.sh
|
||||||
|
# 在有互联网的机器上执行,将所有依赖打包供离线部署使用
|
||||||
|
|
||||||
|
set -e
|
||||||
|
apt install podman-docker
|
||||||
|
export WORKDIR=/tmp/k8s-offline
|
||||||
|
for d in packages,images,k8s-binaries,helm,nvidia,gpu-operator,kubevirt
|
||||||
|
do
|
||||||
|
mkdir -p $WORKDIR/$d
|
||||||
|
done
|
||||||
|
|
||||||
|
cd $WORKDIR
|
||||||
|
|
||||||
|
echo "=== 下载 Kubernetes 二进制文件 ==="
|
||||||
|
K8S_VERSION=v1.29.6
|
||||||
|
ARCH=amd64
|
||||||
|
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubeadm -o k8s-binaries/kubeadm
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubelet -o k8s-binaries/kubelet
|
||||||
|
curl -L --retry 3 https://dl.k8s.io/${K8S_VERSION}/bin/linux/${ARCH}/kubectl -o k8s-binaries/kubectl
|
||||||
|
|
||||||
|
chmod +x k8s-binaries/*
|
||||||
|
|
||||||
|
echo "=== 下载 containerd ==="
|
||||||
|
CONTAINERD_VERSION=1.7.16
|
||||||
|
curl -L --retry 3 https://github.com/containerd/containerd/releases/download/v${CONTAINERD_VERSION}/containerd-${CONTAINERD_VERSION}-linux-amd64.tar.gz -o packages/containerd.tar.gz
|
||||||
|
|
||||||
|
echo "=== 下载 runc ==="
|
||||||
|
RUNC_VERSION=v1.1.13
|
||||||
|
curl -L --retry 3 https://github.com/opencontainers/runc/releases/download/${RUNC_VERSION}/runc.amd64 -o packages/runc && chmod +x packages/runc
|
||||||
|
|
||||||
|
echo "=== 下载 CNI 插件 ==="
|
||||||
|
CNI_VERSION=v1.4.1
|
||||||
|
curl -L --retry 3 https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-amd64-${CNI_VERSION}.tgz -o packages/cni-plugins.tgz
|
||||||
|
|
||||||
|
echo "=== 下载 Helm ==="
|
||||||
|
HELM_VERSION=v3.13.3
|
||||||
|
curl -L --retry 3 https://get.helm.sh/helm-${HELM_VERSION}-linux-amd64.tar.gz -o helm/helm.tar.gz
|
||||||
|
|
||||||
|
echo "=== 下载 NVIDIA Driver(仅元信息,实际需手动获取)==="
|
||||||
|
echo "注意:NVIDIA 驱动无法直接 wget,请从官网下载:"
|
||||||
|
echo "https://www.nvidia.com/Download/index.aspx?lang=en-us"
|
||||||
|
echo "选择 A100-SXM4 / Data Center Driver for Linux x86_64"
|
||||||
|
echo "保存为: nvidia/NVIDIA-Linux-x86_64-535.161.08.run"
|
||||||
|
|
||||||
|
echo "=== 下载 NVIDIA Container Toolkit 依赖(通过 apt 离线包)==="
|
||||||
|
# 使用 docker pull + save 方式更可靠
|
||||||
|
echo "准备构建本地 apt repo 或使用 .deb 包方式"
|
||||||
|
|
||||||
|
# 推荐方法:在一台联网 Ubuntu 22.04 上执行:
|
||||||
|
cat > prepare-debs.sh << 'EOF'
|
||||||
|
#!/bin/bash
|
||||||
|
mkdir -p /tmp/debs
|
||||||
|
apt update
|
||||||
|
apt install -y --download-only curl conntrack socat ipvsadm iptables bridge-utils ethtool git wget tar
|
||||||
|
apt install -y --download-only nfs-utils nfs-common
|
||||||
|
apt install -y --download-only nvidia-driver-535 nvidia-utils-535 nvidia-dkms-535
|
||||||
|
apt install -y --download-only nvidia-container-toolkit
|
||||||
|
cp /var/cache/apt/archives/*.deb /path/to/offline/nvidia/
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "请运行 prepare-debs.sh 获取 .deb 包"
|
||||||
|
|
||||||
|
echo "=== 拉取 GPU Operator 所需镜像 ==="
|
||||||
|
# GPU Operator 会拉取多个镜像,我们预先列出并导出
|
||||||
|
cat > gpu-operator-images.txt << 'EOF'
|
||||||
|
nvcr.io/nvidia/gpu-operator:v24.9.0
|
||||||
|
nvcr.io/nvidia/gpu-feature-discovery:v0.8.0
|
||||||
|
nvcr.io/nvidia/driver:535.161.08-ubuntu22.04
|
||||||
|
nvcr.io/nvidia/container-toolkit:1.14.2-ubuntu22.04
|
||||||
|
nvcr.io/nvidia/dcgm:3.1.7-3-ubuntu22.04
|
||||||
|
nvcr.io/nvidia/k8s-device-plugin:0.14.2-ubi8
|
||||||
|
nvcr.io/nvidia/k8s-operator-validator:v1.2.0
|
||||||
|
EOF
|
||||||
|
|
||||||
|
while read img; do
|
||||||
|
echo "Pulling $img"
|
||||||
|
docker pull $img || echo "Failed: $img"
|
||||||
|
done < gpu-operator-images.txt
|
||||||
|
|
||||||
|
# 保存镜像为 tar 文件
|
||||||
|
docker save $(cat gpu-operator-images.txt | tr '\n' ' ') -o images/gpu-operator-images.tar
|
||||||
|
|
||||||
|
echo "=== 拉取 KubeVirt 组件镜像 ==="
|
||||||
|
KV_VERSION=v1.1.0
|
||||||
|
cat > kubevirt-images.txt << EOF
|
||||||
|
quay.io/kubevirt/virt-operator:${KV_VERSION}
|
||||||
|
quay.io/kubevirt/virt-api:${KV_VERSION}
|
||||||
|
quay.io/kubevirt/virt-controller:${KV_VERSION}
|
||||||
|
quay.io/kubevirt/virt-handler:${KV_VERSION}
|
||||||
|
quay.io/kubevirt/virt-launcher:${KV_VERSION}
|
||||||
|
quay.io/kubevirt/cdi-operator:v1.50.0
|
||||||
|
quay.io/kubevirt/cdi-apiserver:v1.50.0
|
||||||
|
quay.io/kubevirt/cdi-uploadproxy:v1.50.0
|
||||||
|
quay.io/kubevirt/cdi-cloner:v1.50.0
|
||||||
|
quay.io/kubevirt/cdi-importer:v1.50.0
|
||||||
|
quay.io/kubevirt/cdi-uploadserver:v1.50.0
|
||||||
|
EOF
|
||||||
|
|
||||||
|
while read img; do
|
||||||
|
docker pull $img || echo "Failed: $img"
|
||||||
|
done < kubevirt-images.txt
|
||||||
|
|
||||||
|
docker save $(cat kubevirt-images.txt | tr '\n' ' ') -o images/kubevirt-images.tar
|
||||||
|
|
||||||
|
echo "=== 创建最终离线包 ==="
|
||||||
|
tar -czf k8s-offline-all.tar.gz .
|
||||||
|
|
||||||
|
echo "✅ 所有离线资源已生成:k8s-offline-all.tar.gz"
|
||||||
|
echo "请将其复制到目标环境并解压"
|
||||||
94
script/k8s+kebuvirt/gpuworker_install.sh
Normal file
94
script/k8s+kebuvirt/gpuworker_install.sh
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# worker-gpu-install.sh
|
||||||
|
# 在每个有 A100 的 GPU 节点上运行
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
OFFLINE_DIR=/opt/offline
|
||||||
|
|
||||||
|
# 安装 containerd、k8s 二进制(同上)
|
||||||
|
tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz
|
||||||
|
mkdir -p /opt/cni/bin
|
||||||
|
tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/
|
||||||
|
|
||||||
|
cp ${OFFLINE_DIR}/k8s-binaries/kubeadm /usr/bin/
|
||||||
|
cp ${OFFLINE_DIR}/k8s-binaries/kubelet /usr/bin/
|
||||||
|
chmod +x /usr/bin/kubeadm /usr/bin/kubelet
|
||||||
|
|
||||||
|
# 配置 containerd 和 kubelet(同上)
|
||||||
|
cat > /etc/systemd/system/containerd.service << 'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=containerd daemon
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStartPre=/sbin/modprobe overlay
|
||||||
|
ExecStart=/usr/local/bin/containerd
|
||||||
|
Restart=always
|
||||||
|
Type=notify
|
||||||
|
Delegate=yes
|
||||||
|
KillMode=process
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl enable containerd
|
||||||
|
systemctl start containerd
|
||||||
|
|
||||||
|
cat > /etc/systemd/system/kubelet.service << 'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=kubelet
|
||||||
|
After=containerd.service
|
||||||
|
Requires=containerd.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStart=/usr/bin/kubelet
|
||||||
|
Restart=always
|
||||||
|
StartLimitInterval=0
|
||||||
|
VolumeMountPropagation=private
|
||||||
|
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl enable kubelet
|
||||||
|
|
||||||
|
# 安装 NVIDIA 驱动
|
||||||
|
echo "=== 安装 NVIDIA 驱动 ==="
|
||||||
|
chmod +x ${OFFLINE_DIR}/nvidia/NVIDIA-Linux-x86_64-*.run
|
||||||
|
${OFFLINE_DIR}/nvidia/NVIDIA-Linux-x86_64-535.161.08.run -s --dkms --no-opengl-files
|
||||||
|
|
||||||
|
# 加载内核模块
|
||||||
|
modprobe nvidia
|
||||||
|
modprobe nvidia-uvm
|
||||||
|
|
||||||
|
# 安装 NVIDIA Container Toolkit
|
||||||
|
dpkg -i ${OFFLINE_DIR}/nvidia/nvidia-container-toolkit*.deb
|
||||||
|
systemctl restart containerd
|
||||||
|
|
||||||
|
# 开启 MIG 模式(A100 必须)
|
||||||
|
echo "=== 配置 MIG 模式 ==="
|
||||||
|
# 示例:每张卡切分为 2 个 MIG 实例(可根据需求调整)
|
||||||
|
nvidia-smi -i 0 -mig 1
|
||||||
|
sleep 5
|
||||||
|
# 创建实例(示例:创建两个 3g.20gb 实例)
|
||||||
|
nvidia-smi mig -i 0 -cgi 3g.20gb,3g.20gb -C
|
||||||
|
nvidia-smi mig -i 1 -cgi 3g.20gb,3g.20gb -C
|
||||||
|
# ... 对所有卡重复
|
||||||
|
|
||||||
|
# 标记节点为 GPU 节点
|
||||||
|
cat > /tmp/gpu-label.yaml << 'EOF'
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Node
|
||||||
|
metadata:
|
||||||
|
name: $(hostname)
|
||||||
|
labels:
|
||||||
|
node-type: gpu-worker
|
||||||
|
nvidia.com/gpu.present: "true"
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# 注意:join 后再应用 label
|
||||||
|
echo "✅ 安装完成,请先加入集群"
|
||||||
|
echo "然后在 master 上运行:kubectl label node $(hostname) node-type=gpu-worker nvidia.com/gpu.present=true"
|
||||||
47
script/k8s+kebuvirt/nfs-client-provisioner.yaml
Normal file
47
script/k8s+kebuvirt/nfs-client-provisioner.yaml
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
apiVersion: storage.k8s.io/v1
|
||||||
|
kind: StorageClass
|
||||||
|
metadata:
|
||||||
|
name: nfs-client
|
||||||
|
provisioner: k8s-sigs.io/nfs-subdir-external-provisioner
|
||||||
|
parameters:
|
||||||
|
archiveOnDelete: "false"
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: nfs-client-provisioner
|
||||||
|
labels:
|
||||||
|
app: nfs-client-provisioner
|
||||||
|
namespace: default
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: nfs-client-provisioner
|
||||||
|
strategy:
|
||||||
|
type: Recreate
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: nfs-client-provisioner
|
||||||
|
spec:
|
||||||
|
serviceAccountName: nfs-client-provisioner
|
||||||
|
containers:
|
||||||
|
- name: nfs-client-provisioner
|
||||||
|
image: registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2
|
||||||
|
volumeMounts:
|
||||||
|
- name: nfs-client-root
|
||||||
|
mountPath: /persistentvolumes
|
||||||
|
env:
|
||||||
|
- name: PROVISIONER_NAME
|
||||||
|
value: k8s-sigs.io/nfs-subdir-external-provisioner
|
||||||
|
- name: NFS_SERVER
|
||||||
|
value: 192.168.10.1 # 替换为你的 NFS 服务器 IP
|
||||||
|
- name: NFS_PATH
|
||||||
|
value: /export/k8s
|
||||||
|
volumes:
|
||||||
|
- name: nfs-client-root
|
||||||
|
nfs:
|
||||||
|
server: 192.168.10.1
|
||||||
|
path: /export/k8s
|
||||||
|
|
||||||
60
script/k8s+kebuvirt/worker_install.sh
Normal file
60
script/k8s+kebuvirt/worker_install.sh
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# worker-cpu-install.sh
|
||||||
|
# 所有无 GPU 的工作节点运行此脚本
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
OFFLINE_DIR=/opt/offline
|
||||||
|
|
||||||
|
# 安装 containerd、CNI、k8s 二进制(同 control plane)
|
||||||
|
tar --no-overwrite-dir -C /usr/local -xzf ${OFFLINE_DIR}/containerd.tar.gz
|
||||||
|
mkdir -p /opt/cni/bin
|
||||||
|
tar -xzf ${OFFLINE_DIR}/cni-plugins.tgz -C /opt/cni/bin/
|
||||||
|
|
||||||
|
cp ${OFFLINE_DIR}/k8s-binaries/kubeadm /usr/bin/
|
||||||
|
cp ${OFFLINE_DIR}/k8s-binaries/kubelet /usr/bin/
|
||||||
|
chmod +x /usr/bin/kubeadm /usr/bin/kubelet
|
||||||
|
|
||||||
|
# 同样配置 containerd 和 kubelet
|
||||||
|
cat > /etc/systemd/system/containerd.service << 'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=containerd daemon
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStartPre=/sbin/modprobe overlay
|
||||||
|
ExecStart=/usr/local/bin/containerd
|
||||||
|
Restart=always
|
||||||
|
Type=notify
|
||||||
|
Delegate=yes
|
||||||
|
KillMode=process
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl enable containerd
|
||||||
|
systemctl start containerd
|
||||||
|
|
||||||
|
cat > /etc/systemd/system/kubelet.service << 'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=kubelet
|
||||||
|
After=containerd.service
|
||||||
|
Requires=containerd.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
ExecStart=/usr/bin/kubelet
|
||||||
|
Restart=always
|
||||||
|
StartLimitInterval=0
|
||||||
|
VolumeMountPropagation=private
|
||||||
|
Environment="KUBELET_EXTRA_ARGS=--container-runtime=remote --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock"
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl enable kubelet
|
||||||
|
|
||||||
|
echo "✅ 准备加入集群,请在主控节点获取 join 命令:"
|
||||||
|
echo "kubeadm token create --print-join-command"
|
||||||
|
echo "然后在此节点执行输出的命令"
|
||||||
Loading…
x
Reference in New Issue
Block a user