diff --git a/app/k8sManager/multiple_clusters.py b/app/k8sManager/multiple_clusters.py index 37cd431..a66084a 100644 --- a/app/k8sManager/multiple_clusters.py +++ b/app/k8sManager/multiple_clusters.py @@ -13,7 +13,7 @@ from . import ssh_utils,k8s_utils_public from appPublic.log import debug import traceback -def delete_cluster_node(params): +async def delete_cluster_node(params): """ 删除集群节点 --namespace 或 -n:指定节点所在的命名空间。不过,节点是集群级别的资源,不隶属于特定的命名空间,所以此参数一般不用于删除节点。 @@ -32,7 +32,7 @@ def delete_cluster_node(params): """ return "delete_cluster_node ok" -def node_state_switch(params): +async def node_state_switch(params): """ 恢复节点: kubectl uncordon 命令将节点标记为可调度状态,这样调度器就会重新考虑将新的 Pod 分配到该节点上 @@ -45,7 +45,7 @@ def node_state_switch(params): """ return "node_state_switch ok" -def yaml_apply_delete(params): +async def yaml_apply_delete(params): """ 1. 通过cpcc传递过来的参数进行级联初始化资源实例; 2. 通过cpcc传递过来的参数进行级联更新资源实例; @@ -60,7 +60,7 @@ def yaml_apply_delete(params): elif instance_type == "LinuxOS": k8s_utils_linuxos_ubuntu.handle_k8s_operations(params) -def node_label_opt(params): +async def node_label_opt(params): """ 要设置节点 worker-node-1 上的标签 app,可以使用以下命令: kubectl label nodes worker-node-1 app=app,注意标签键和值之间有一个等号 (=),表示设置该标签。 @@ -106,7 +106,7 @@ def node_label_opt(params): else: raise f"{worker_node} 解绑标签 {label} 失败,请检查集群节点状态或标签是否已绑定?" 
-def unset_node_label(params): +async def unset_node_label(params): """ 要取消节点 worker-node-1 上的标签 app,可以使用以下命令: kubectl label nodes worker-node-1 app-,注意标签键后面有一个短横线 (-),表示取消该标签。 @@ -121,7 +121,7 @@ def unset_node_label(params): label = params.get("label") -def get_cluster_nodes_by_server(params): +async def get_cluster_nodes_by_server(params): host = params.get("host") port = int(params.get("port")) username = params.get("user") @@ -137,7 +137,7 @@ def get_cluster_nodes_by_server(params): # debug(f'集群 {host=} 所有节点信息如下{results=} => 转换后:\n{parse_k8s_nodes_result=}') return parse_k8s_nodes_result -def get_cluster_pods_by_kubeconfig(params): +async def get_cluster_pods_by_kubeconfig(params): """ 通过调用方传递来的kubeconfig信息 获取集群中所有资源实例(Pod)信息详情 @@ -145,7 +145,7 @@ def get_cluster_pods_by_kubeconfig(params): kubeconfig = params.get("kubeconfig") return k8s_utils_public.get_pod_info(kubeconfig) -def determine_accommodat_by_kubeconfig(params): +async def determine_accommodat_by_kubeconfig(params): """ 通过调用方传递来的kubeconfig信息 判断集群中可部署哪些部件组合n @@ -157,7 +157,7 @@ def determine_accommodat_by_kubeconfig(params): # debug(f'=====kubeconfig: {kubeconfig}, resources: {resources}') return k8s_utils_public.determine_accommodat(kubeconfig, resources) -def get_cluster_nodes_by_kubeconfig(params): +async def get_cluster_nodes_by_kubeconfig(params): """ 通过调用方传递来的kubeconfig信息 获取集群中所有节点信息详情 @@ -165,7 +165,7 @@ def get_cluster_nodes_by_kubeconfig(params): kubeconfig = params.get("kubeconfig") return k8s_utils_public.get_node_info(kubeconfig) -def get_cluster_pods_by_server(params): +async def get_cluster_pods_by_server(params): host = params.get("host") port = int(params.get("port")) username = params.get("user") @@ -182,7 +182,7 @@ def get_cluster_pods_by_server(params): # debug(f'集群 {host=} 所有Pod信息如下{results=} => 转换后:\n{parse_k8s_pods_result=}') return parse_k8s_pods_result -def new_cluster_install(params): +async def new_cluster_install(params): # 随后填充远程操控k8s主逻辑 """ 用于接收cpcc端传递过来的k8s安装指令参数, 
进行远程sshx调用操作内网机器进行集群节点的安装 @@ -206,6 +206,7 @@ def new_cluster_install(params): "files/storage_class.yaml":"/opt/storage_class.yaml", # "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml", "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml", + "files/config.toml": "/opt/config.toml", "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml", "files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb", "files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb", @@ -275,7 +276,7 @@ def new_cluster_install(params): return results -def get_multiple_cluster_pod(): +async def get_multiple_cluster_pod(): """ 获取 kubeconfig 中所有集群的 Pod 信息(JSON 格式) @@ -326,7 +327,7 @@ def get_multiple_cluster_pod(): return all_clusters_pods -def get_multiple_cluster(): +async def get_multiple_cluster(): """ 获取所有集群的完整信息,包括用户证书、RBAC状态、服务账号颁发者等。 @@ -445,7 +446,7 @@ def get_multiple_cluster(): }, indent=4) -def process_kubeconfigs(): +async def process_kubeconfigs(): """ 检测当前目录下的 kubestage 文件夹中的 kubeconfig 格式文件, 计算每个文件的大写 MD5 值,将其改名成对应的 MD5 值, diff --git a/app/pcapi.py b/app/pcapi.py index a3fedd0..0949550 100644 --- a/app/pcapi.py +++ b/app/pcapi.py @@ -70,18 +70,18 @@ def init_func(): # g.delete_ldap_user=delete_ldap_user ### k8s多集群相关 - g.new_cluster_install = awaitify(new_cluster_install) - g.get_multiple_cluster = awaitify(get_multiple_cluster) - g.get_multiple_cluster_pod = awaitify(get_multiple_cluster_pod) - g.get_cluster_nodes_by_server = awaitify(get_cluster_nodes_by_server) - g.get_cluster_pods_by_server = awaitify(get_cluster_pods_by_server) - g.delete_cluster_node = awaitify(delete_cluster_node) - g.node_state_switch = awaitify(node_state_switch) - g.yaml_apply_delete = awaitify(yaml_apply_delete) - g.get_cluster_nodes_by_kubeconfig = awaitify(get_cluster_nodes_by_kubeconfig) - g.determine_accommodat_by_kubeconfig = awaitify(determine_accommodat_by_kubeconfig) - 
g.get_cluster_pods_by_kubeconfig = awaitify(get_cluster_pods_by_kubeconfig) - g.node_label_opt = awaitify(node_label_opt) + g.new_cluster_install = new_cluster_install + g.get_multiple_cluster = get_multiple_cluster + g.get_multiple_cluster_pod = get_multiple_cluster_pod + g.get_cluster_nodes_by_server = get_cluster_nodes_by_server + g.get_cluster_pods_by_server = get_cluster_pods_by_server + g.delete_cluster_node = delete_cluster_node + g.node_state_switch = node_state_switch + g.yaml_apply_delete = yaml_apply_delete + g.get_cluster_nodes_by_kubeconfig = get_cluster_nodes_by_kubeconfig + g.determine_accommodat_by_kubeconfig = determine_accommodat_by_kubeconfig + g.get_cluster_pods_by_kubeconfig = get_cluster_pods_by_kubeconfig + g.node_label_opt = node_label_opt g.get_storage_json=get_storage_json g.result_dict={ diff --git a/files/config.toml b/files/config.toml new file mode 100644 index 0000000..dc942bf --- /dev/null +++ b/files/config.toml @@ -0,0 +1,297 @@ +disabled_plugins = [] +imports = [] +oom_score = 0 +plugin_dir = "" +required_plugins = [] +root = "/var/lib/containerd" +state = "/run/containerd" +temp = "" +version = 2 + +[cgroup] + path = "" + +[debug] + address = "" + format = "" + gid = 0 + level = "" + uid = 0 + +[grpc] + address = "/run/containerd/containerd.sock" + gid = 0 + max_recv_message_size = 16777216 + max_send_message_size = 16777216 + tcp_address = "" + tcp_tls_ca = "" + tcp_tls_cert = "" + tcp_tls_key = "" + uid = 0 + +[metrics] + address = "" + grpc_histogram = false + +[plugins] + + [plugins."io.containerd.gc.v1.scheduler"] + deletion_threshold = 0 + mutation_threshold = 100 + pause_threshold = 0.02 + schedule_delay = "0s" + startup_delay = "100ms" + + [plugins."io.containerd.grpc.v1.cri"] + cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"] + device_ownership_from_security_context = false + disable_apparmor = false + disable_cgroup = false + disable_hugetlb_controller = true + disable_proc_mount = false + disable_tcp_service = true + 
drain_exec_sync_io_timeout = "0s" + enable_cdi = false + enable_selinux = false + enable_tls_streaming = false + enable_unprivileged_icmp = false + enable_unprivileged_ports = false + ignore_deprecation_warnings = [] + ignore_image_defined_volumes = false + image_pull_progress_timeout = "5m0s" + image_pull_with_sync_fs = false + max_concurrent_downloads = 3 + max_container_log_line_size = 16384 + netns_mounts_under_state_dir = false + restrict_oom_score_adj = false + sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9" + selinux_category_range = 1024 + stats_collect_period = 10 + stream_idle_timeout = "4h0m0s" + stream_server_address = "127.0.0.1" + stream_server_port = "0" + systemd_cgroup = false + tolerate_missing_hugetlb_controller = true + unset_seccomp_profile = "" + + [plugins."io.containerd.grpc.v1.cri".cni] + bin_dir = "/opt/cni/bin" + conf_dir = "/etc/cni/net.d" + conf_template = "" + ip_pref = "" + max_conf_num = 1 + setup_serially = false + + [plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + disable_snapshot_annotations = true + discard_unpacked_layers = false + ignore_blockio_not_enabled_errors = false + ignore_rdt_not_enabled_errors = false + no_pivot = false + snapshotter = "overlayfs" + + [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + privileged_without_host_devices_all_devices_allowed = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + sandbox_mode = "" + snapshotter = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" + privileged_without_host_devices = false + 
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + privileged_without_host_devices_all_devices_allowed = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "io.containerd.runc.v2" + sandbox_mode = "podsandbox" + snapshotter = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "" + CriuImagePath = "" + CriuPath = "" + CriuWorkPath = "" + IoGid = 0 + IoUid = 0 + NoNewKeyring = false + NoPivotRoot = false + Root = "" + ShimCgroup = "" + SystemdCgroup = true + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime] + base_runtime_spec = "" + cni_conf_dir = "" + cni_max_conf_num = 0 + container_annotations = [] + pod_annotations = [] + privileged_without_host_devices = false + privileged_without_host_devices_all_devices_allowed = false + runtime_engine = "" + runtime_path = "" + runtime_root = "" + runtime_type = "" + sandbox_mode = "" + snapshotter = "" + + [plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options] + + [plugins."io.containerd.grpc.v1.cri".image_decryption] + key_model = "node" + + [plugins."io.containerd.grpc.v1.cri".registry] + config_path = "" + + [plugins."io.containerd.grpc.v1.cri".registry.auths] + + [plugins."io.containerd.grpc.v1.cri".registry.configs] + + [plugins."io.containerd.grpc.v1.cri".registry.headers] + + [plugins."io.containerd.grpc.v1.cri".registry.mirrors] + + [plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming] + tls_cert_file = "" + tls_key_file = "" + + [plugins."io.containerd.internal.v1.opt"] + path = "/opt/containerd" + + [plugins."io.containerd.internal.v1.restart"] + interval = "10s" + + 
[plugins."io.containerd.internal.v1.tracing"] + + [plugins."io.containerd.metadata.v1.bolt"] + content_sharing_policy = "shared" + + [plugins."io.containerd.monitor.v1.cgroups"] + no_prometheus = false + + [plugins."io.containerd.nri.v1.nri"] + disable = true + disable_connections = false + plugin_config_path = "/etc/nri/conf.d" + plugin_path = "/opt/nri/plugins" + plugin_registration_timeout = "5s" + plugin_request_timeout = "2s" + socket_path = "/var/run/nri/nri.sock" + + [plugins."io.containerd.runtime.v1.linux"] + no_shim = false + runtime = "runc" + runtime_root = "" + shim = "containerd-shim" + shim_debug = false + + [plugins."io.containerd.runtime.v2.task"] + platforms = ["linux/amd64"] + sched_core = false + + [plugins."io.containerd.service.v1.diff-service"] + default = ["walking"] + + [plugins."io.containerd.service.v1.tasks-service"] + blockio_config_file = "" + rdt_config_file = "" + + [plugins."io.containerd.snapshotter.v1.aufs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.blockfile"] + fs_type = "" + mount_options = [] + root_path = "" + scratch_file = "" + + [plugins."io.containerd.snapshotter.v1.btrfs"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.devmapper"] + async_remove = false + base_image_size = "" + discard_blocks = false + fs_options = "" + fs_type = "" + pool_name = "" + root_path = "" + + [plugins."io.containerd.snapshotter.v1.native"] + root_path = "" + + [plugins."io.containerd.snapshotter.v1.overlayfs"] + mount_options = [] + root_path = "" + sync_remove = false + upperdir_label = false + + [plugins."io.containerd.snapshotter.v1.zfs"] + root_path = "" + + [plugins."io.containerd.tracing.processor.v1.otlp"] + + [plugins."io.containerd.transfer.v1.local"] + config_path = "" + max_concurrent_downloads = 3 + max_concurrent_uploaded_layers = 3 + + [[plugins."io.containerd.transfer.v1.local".unpack_config]] + differ = "" + platform = "linux/amd64" + snapshotter = "overlayfs" + +[proxy_plugins] + 
+[stream_processors] + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar"] + accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar" + + [stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"] + accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"] + args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"] + env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"] + path = "ctd-decoder" + returns = "application/vnd.oci.image.layer.v1.tar+gzip" + +[timeouts] + "io.containerd.timeout.bolt.open" = "0s" + "io.containerd.timeout.metrics.shimstats" = "2s" + "io.containerd.timeout.shim.cleanup" = "5s" + "io.containerd.timeout.shim.load" = "5s" + "io.containerd.timeout.shim.shutdown" = "3s" + "io.containerd.timeout.task.state" = "2s" + +[ttrpc] + address = "" + gid = 0 + uid = 0 diff --git a/script/k8s_install.sh b/script/k8s_install.sh index d4a252b..7ef7cbc 100644 --- a/script/k8s_install.sh +++ b/script/k8s_install.sh @@ -315,36 +315,45 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..." # 检查 .deb 文件是否存在 - if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then - log_error "/opt/ 下没有 .deb 文件" - exit 1 + DEB_FILES=(/opt/*_amd64.deb) + if [ ! -e "${DEB_FILES[0]}" ]; then + log_error "/opt/ 下没有 .deb 文件" + exit 1 fi # 安装 .deb 包 - for deb in /opt/*_amd64.deb; do - dpkg -i "$deb" || log_error "安装 $deb 失败" + for deb in "${DEB_FILES[@]}"; do + dpkg -i "$deb" || { + log_error "安装 $deb 失败" + exit 1 + } done # 配置 containerd CONTAINERD_CONFIG="/etc/containerd/config.toml" log_info "正在更新 $CONTAINERD_CONFIG 配置..." - # 1. 添加 nvidia 运行时配置(插入到 runtimes 块内部) - if ! 
grep -qF '[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]' "$CONTAINERD_CONFIG"; then - # 在 runtimes 块下插入 nvidia 配置(保持格式缩进) - sed -i '/\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes\]/a \ - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] - privileged_without_host_devices = false - runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] - BinaryName = "/usr/bin/nvidia-container-runtime" -' "$CONTAINERD_CONFIG" - fi + # 1. 添加 nvidia 运行时配置到 runtimes 块内部 + # 添加 nvidia runtime 配置到 runtimes 块下 + NVIDIA_SECTION='plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes\.nvidia' +# if ! grep -qF "[${NVIDIA_SECTION}]" "$CONTAINERD_CONFIG"; then +# sudo sed -i '/^ +# $$ +# plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes +# $$ +# $/a \ +# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]\n\ +# privileged_without_host_devices = false\n\ +# runtime_type = "io.containerd.runc.v2"\n\ +# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]\n\ +# BinaryName = "/usr/bin/nvidia-container-runtime"' /etc/containerd/config.toml +# fi - # 2. 修改默认运行时为 nvidia(正确匹配配置项) - if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then - sed -i '/default_runtime_name = "runc"/s/"runc"/"nvidia"/' "$CONTAINERD_CONFIG" - fi +# # 2. 修改默认运行时为 nvidia +# if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then +# sudo sed -i 's/default_runtime_name = "runc"/default_runtime_name = "nvidia"/' "$CONTAINERD_CONFIG" +# fi + cp -v /opt/config.toml /etc/containerd/config.toml || { log_error "直接复制containerd配置文件失败"; exit 1; } # 3. 重启 containerd 并检查状态 log_info "重启 containerd 服务..." @@ -359,6 +368,7 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then log_info "配置 CUDA 环境变量..."
grep -qxF 'export PATH=/usr/local/cuda/bin:$PATH' ~/.bashrc || echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc grep -qxF 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' ~/.bashrc || echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc + # 应用环境变量(非交互式shell提示手动执行) if [[ "$-" == *i* ]]; then source ~/.bashrc