修补问题

This commit is contained in:
ysh 2025-07-17 17:54:32 +08:00
parent 7d32077aa4
commit d69c5336e6
4 changed files with 354 additions and 46 deletions

View File

@ -13,7 +13,7 @@ from . import ssh_utils,k8s_utils_public
from appPublic.log import debug
import traceback
def delete_cluster_node(params):
async def delete_cluster_node(params):
"""
删除集群节点
--namespace -n:指定节点所在的命名空间。不过,节点是集群级别的资源,不隶属于特定的命名空间,所以此参数一般不用于删除节点
@ -32,7 +32,7 @@ def delete_cluster_node(params):
"""
return "delete_cluster_node ok"
def node_state_switch(params):
async def node_state_switch(params):
"""
恢复节点
kubectl uncordon 命令将节点标记为可调度状态,这样调度器就会重新考虑将新的 Pod 分配到该节点上
@ -45,7 +45,7 @@ def node_state_switch(params):
"""
return "node_state_switch ok"
def yaml_apply_delete(params):
async def yaml_apply_delete(params):
"""
1. 通过cpcc传递过来的参数进行级联初始化资源实例;
2. 通过cpcc传递过来的参数进行级联更新资源实例;
@ -60,7 +60,7 @@ def yaml_apply_delete(params):
elif instance_type == "LinuxOS":
k8s_utils_linuxos_ubuntu.handle_k8s_operations(params)
def node_label_opt(params):
async def node_label_opt(params):
"""
要设置节点 worker-node-1 上的标签 app,可以使用以下命令
kubectl label nodes worker-node-1 app=app,注意标签键和值之间有一个等号 (=),表示设置该标签
@ -106,7 +106,7 @@ def node_label_opt(params):
else:
raise f"{worker_node} 解绑标签 {label} 失败,请检查集群节点状态或标签是否已绑定?"
def unset_node_label(params):
async def unset_node_label(params):
"""
要取消节点 worker-node-1 上的标签 app,可以使用以下命令
kubectl label nodes worker-node-1 app-,注意标签键后面有一个短横线 (-),表示取消该标签
@ -121,7 +121,7 @@ def unset_node_label(params):
label = params.get("label")
def get_cluster_nodes_by_server(params):
async def get_cluster_nodes_by_server(params):
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
@ -137,7 +137,7 @@ def get_cluster_nodes_by_server(params):
# debug(f'集群 {host=} 所有节点信息如下{results=} => 转换后:\n{parse_k8s_nodes_result=}')
return parse_k8s_nodes_result
def get_cluster_pods_by_kubeconfig(params):
async def get_cluster_pods_by_kubeconfig(params):
"""
通过调用方传递来的kubeconfig信息
获取集群中所有资源实例(Pod)信息详情
@ -145,7 +145,7 @@ def get_cluster_pods_by_kubeconfig(params):
kubeconfig = params.get("kubeconfig")
return k8s_utils_public.get_pod_info(kubeconfig)
def determine_accommodat_by_kubeconfig(params):
async def determine_accommodat_by_kubeconfig(params):
"""
通过调用方传递来的kubeconfig信息
判断集群中可部署哪些部件组合
@ -157,7 +157,7 @@ def determine_accommodat_by_kubeconfig(params):
# debug(f'=====kubeconfig: {kubeconfig}, resources: {resources}')
return k8s_utils_public.determine_accommodat(kubeconfig, resources)
def get_cluster_nodes_by_kubeconfig(params):
async def get_cluster_nodes_by_kubeconfig(params):
"""
通过调用方传递来的kubeconfig信息
获取集群中所有节点信息详情
@ -165,7 +165,7 @@ def get_cluster_nodes_by_kubeconfig(params):
kubeconfig = params.get("kubeconfig")
return k8s_utils_public.get_node_info(kubeconfig)
def get_cluster_pods_by_server(params):
async def get_cluster_pods_by_server(params):
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
@ -182,7 +182,7 @@ def get_cluster_pods_by_server(params):
# debug(f'集群 {host=} 所有Pod信息如下{results=} => 转换后:\n{parse_k8s_pods_result=}')
return parse_k8s_pods_result
def new_cluster_install(params):
async def new_cluster_install(params):
# 随后填充远程操控k8s主逻辑
"""
用于接收cpcc端传递过来的k8s安装指令参数, 进行远程sshx调用操作内网机器进行集群节点的安装
@ -206,6 +206,7 @@ def new_cluster_install(params):
"files/storage_class.yaml":"/opt/storage_class.yaml",
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
"files/config.toml": "/opt/config.toml",
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
@ -275,7 +276,7 @@ def new_cluster_install(params):
return results
def get_multiple_cluster_pod():
async def get_multiple_cluster_pod():
"""
获取 kubeconfig 中所有集群的 Pod 信息JSON 格式
@ -326,7 +327,7 @@ def get_multiple_cluster_pod():
return all_clusters_pods
def get_multiple_cluster():
async def get_multiple_cluster():
"""
获取所有集群的完整信息,包括用户证书RBAC状态服务账号颁发者等
@ -445,7 +446,7 @@ def get_multiple_cluster():
}, indent=4)
def process_kubeconfigs():
async def process_kubeconfigs():
"""
检测当前目录下的 kubestage 文件夹中的 kubeconfig 格式文件,
计算每个文件的大写 MD5 ,将其改名成对应的 MD5 ,

View File

@ -70,18 +70,18 @@ def init_func():
# g.delete_ldap_user=delete_ldap_user
### k8s多集群相关
g.new_cluster_install = awaitify(new_cluster_install)
g.get_multiple_cluster = awaitify(get_multiple_cluster)
g.get_multiple_cluster_pod = awaitify(get_multiple_cluster_pod)
g.get_cluster_nodes_by_server = awaitify(get_cluster_nodes_by_server)
g.get_cluster_pods_by_server = awaitify(get_cluster_pods_by_server)
g.delete_cluster_node = awaitify(delete_cluster_node)
g.node_state_switch = awaitify(node_state_switch)
g.yaml_apply_delete = awaitify(yaml_apply_delete)
g.get_cluster_nodes_by_kubeconfig = awaitify(get_cluster_nodes_by_kubeconfig)
g.determine_accommodat_by_kubeconfig = awaitify(determine_accommodat_by_kubeconfig)
g.get_cluster_pods_by_kubeconfig = awaitify(get_cluster_pods_by_kubeconfig)
g.node_label_opt = awaitify(node_label_opt)
g.new_cluster_install = new_cluster_install
g.get_multiple_cluster = get_multiple_cluster
g.get_multiple_cluster_pod = get_multiple_cluster_pod
g.get_cluster_nodes_by_server = get_cluster_nodes_by_server
g.get_cluster_pods_by_server = get_cluster_pods_by_server
g.delete_cluster_node = delete_cluster_node
g.node_state_switch = node_state_switch
g.yaml_apply_delete = yaml_apply_delete
g.get_cluster_nodes_by_kubeconfig = get_cluster_nodes_by_kubeconfig
g.determine_accommodat_by_kubeconfig = determine_accommodat_by_kubeconfig
g.get_cluster_pods_by_kubeconfig = get_cluster_pods_by_kubeconfig
g.node_label_opt = node_label_opt
g.get_storage_json=get_storage_json
g.result_dict={

297
files/config.toml Normal file
View File

@ -0,0 +1,297 @@
disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
temp = ""
version = 2
[cgroup]
path = ""
[debug]
address = ""
format = ""
gid = 0
level = ""
uid = 0
[grpc]
address = "/run/containerd/containerd.sock"
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
tcp_address = ""
tcp_tls_ca = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
[metrics]
address = ""
grpc_histogram = false
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
deletion_threshold = 0
mutation_threshold = 100
pause_threshold = 0.02
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
device_ownership_from_security_context = false
disable_apparmor = false
disable_cgroup = false
disable_hugetlb_controller = true
disable_proc_mount = false
disable_tcp_service = true
drain_exec_sync_io_timeout = "0s"
enable_cdi = false
enable_selinux = false
enable_tls_streaming = false
enable_unprivileged_icmp = false
enable_unprivileged_ports = false
ignore_deprecation_warnings = []
ignore_image_defined_volumes = false
image_pull_progress_timeout = "5m0s"
image_pull_with_sync_fs = false
max_concurrent_downloads = 3
max_container_log_line_size = 16384
netns_mounts_under_state_dir = false
restrict_oom_score_adj = false
sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"
selinux_category_range = 1024
stats_collect_period = 10
stream_idle_timeout = "4h0m0s"
stream_server_address = "127.0.0.1"
stream_server_port = "0"
systemd_cgroup = false
tolerate_missing_hugetlb_controller = true
unset_seccomp_profile = ""
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
conf_template = ""
ip_pref = ""
max_conf_num = 1
setup_serially = false
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"
disable_snapshot_annotations = true
discard_unpacked_layers = false
ignore_blockio_not_enabled_errors = false
ignore_rdt_not_enabled_errors = false
no_pivot = false
snapshotter = "overlayfs"
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
base_runtime_spec = ""
cni_conf_dir = ""
cni_max_conf_num = 0
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
privileged_without_host_devices_all_devices_allowed = false
runtime_engine = ""
runtime_path = ""
runtime_root = ""
runtime_type = ""
sandbox_mode = ""
snapshotter = ""
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
runtime_type = "io.containerd.runc.v2"
privileged_without_host_devices = false
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
base_runtime_spec = ""
cni_conf_dir = ""
cni_max_conf_num = 0
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
privileged_without_host_devices_all_devices_allowed = false
runtime_engine = ""
runtime_path = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
sandbox_mode = "podsandbox"
snapshotter = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = ""
CriuImagePath = ""
CriuPath = ""
CriuWorkPath = ""
IoGid = 0
IoUid = 0
NoNewKeyring = false
NoPivotRoot = false
Root = ""
ShimCgroup = ""
SystemdCgroup = true
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
base_runtime_spec = ""
cni_conf_dir = ""
cni_max_conf_num = 0
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
privileged_without_host_devices_all_devices_allowed = false
runtime_engine = ""
runtime_path = ""
runtime_root = ""
runtime_type = ""
sandbox_mode = ""
snapshotter = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = "node"
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = ""
[plugins."io.containerd.grpc.v1.cri".registry.auths]
[plugins."io.containerd.grpc.v1.cri".registry.configs]
[plugins."io.containerd.grpc.v1.cri".registry.headers]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.internal.v1.tracing"]
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.nri.v1.nri"]
disable = true
disable_connections = false
plugin_config_path = "/etc/nri/conf.d"
plugin_path = "/opt/nri/plugins"
plugin_registration_timeout = "5s"
plugin_request_timeout = "2s"
socket_path = "/var/run/nri/nri.sock"
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "runc"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
sched_core = false
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.service.v1.tasks-service"]
blockio_config_file = ""
rdt_config_file = ""
[plugins."io.containerd.snapshotter.v1.aufs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.blockfile"]
fs_type = ""
mount_options = []
root_path = ""
scratch_file = ""
[plugins."io.containerd.snapshotter.v1.btrfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.devmapper"]
async_remove = false
base_image_size = ""
discard_blocks = false
fs_options = ""
fs_type = ""
pool_name = ""
root_path = ""
[plugins."io.containerd.snapshotter.v1.native"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.overlayfs"]
mount_options = []
root_path = ""
sync_remove = false
upperdir_label = false
[plugins."io.containerd.snapshotter.v1.zfs"]
root_path = ""
[plugins."io.containerd.tracing.processor.v1.otlp"]
[plugins."io.containerd.transfer.v1.local"]
config_path = ""
max_concurrent_downloads = 3
max_concurrent_uploaded_layers = 3
[[plugins."io.containerd.transfer.v1.local".unpack_config]]
differ = ""
platform = "linux/amd64"
snapshotter = "overlayfs"
[proxy_plugins]
[stream_processors]
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar"
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
[timeouts]
"io.containerd.timeout.bolt.open" = "0s"
"io.containerd.timeout.metrics.shimstats" = "2s"
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[ttrpc]
address = ""
gid = 0
uid = 0

View File

@ -315,36 +315,45 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
# 检查 .deb 文件是否存在
if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then
DEB_FILES=(/opt/*_amd64.deb)
if [ ! -e "${DEB_FILES[0]}" ]; then
log_error "/opt/ 下没有 .deb 文件"
exit 1
fi
# 安装 .deb 包
for deb in /opt/*_amd64.deb; do
dpkg -i "$deb" || log_error "安装 $deb 失败"
for deb in "${DEB_FILES[@]}"; do
dpkg -i "$deb" || {
log_error "安装 $deb 失败"
exit 1
}
done
# 配置 containerd
CONTAINERD_CONFIG="/etc/containerd/config.toml"
log_info "正在更新 $CONTAINERD_CONFIG 配置..."
# 1. 添加 nvidia 运行时配置(插入到 runtimes 块内部)
if ! grep -qF '[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]' "$CONTAINERD_CONFIG"; then
# 在 runtimes 块下插入 nvidia 配置(保持格式缩进)
sed -i '/\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes\]/a \
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
' "$CONTAINERD_CONFIG"
fi
# 1. 添加 nvidia 运行时配置到 runtimes 块内部
# 添加 nvidia runtime 配置到 runtimes 块下
NVIDIA_SECTION='plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes\.nvidia'
# if ! grep -qF "[${NVIDIA_SECTION}]" "$CONTAINERD_CONFIG"; then
# sudo sed -i '/^
# $$
# plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes
# $$
# $/a \
# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]\n\
# privileged_without_host_devices = false\n\
# runtime_type = "io.containerd.runc.v2"\n\
# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]\n\
# BinaryName = "/usr/bin/nvidia-container-runtime"' /etc/containerd/config.toml
# fi
# 2. 修改默认运行时为 nvidia正确匹配配置项
if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then
sed -i '/default_runtime_name = "runc"/s/"runc"/"nvidia"/' "$CONTAINERD_CONFIG"
fi
# # 2. 修改默认运行时为 nvidia
# if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then
# sudo sed -i 's/default_runtime_name = "runc"/default_runtime_name = "nvidia"/' "$CONTAINERD_CONFIG"
# fi
cp -v /opt/config.toml /etc/containerd/config.toml || log_error "直接复制containerd配置文件失败"
# 3. 重启 containerd 并检查状态
log_info "重启 containerd 服务..."
@ -359,6 +368,7 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
log_info "配置 CUDA 环境变量..."
grep -qxF 'export PATH=/usr/local/cuda/bin:$PATH' ~/.bashrc || echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
grep -qxF 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' ~/.bashrc || echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
# 应用环境变量非交互式shell提示手动执行
if [[ "$-" == *i* ]]; then
source ~/.bashrc