commit
567b4ff367
@ -13,7 +13,7 @@ from . import ssh_utils,k8s_utils_public
|
||||
from appPublic.log import debug
|
||||
import traceback
|
||||
|
||||
def delete_cluster_node(params):
|
||||
async def delete_cluster_node(params):
|
||||
"""
|
||||
删除集群节点
|
||||
--namespace 或 -n:指定节点所在的命名空间。不过,节点是集群级别的资源,不隶属于特定的命名空间,所以此参数一般不用于删除节点。
|
||||
@ -32,7 +32,7 @@ def delete_cluster_node(params):
|
||||
"""
|
||||
return "delete_cluster_node ok"
|
||||
|
||||
def node_state_switch(params):
|
||||
async def node_state_switch(params):
|
||||
"""
|
||||
恢复节点:
|
||||
kubectl uncordon 命令将节点标记为可调度状态,这样调度器就会重新考虑将新的 Pod 分配到该节点上
|
||||
@ -45,7 +45,7 @@ def node_state_switch(params):
|
||||
"""
|
||||
return "node_state_switch ok"
|
||||
|
||||
def yaml_apply_delete(params):
|
||||
async def yaml_apply_delete(params):
|
||||
"""
|
||||
1. 通过cpcc传递过来的参数进行级联初始化资源实例;
|
||||
2. 通过cpcc传递过来的参数进行级联更新资源实例;
|
||||
@ -60,7 +60,7 @@ def yaml_apply_delete(params):
|
||||
elif instance_type == "LinuxOS":
|
||||
k8s_utils_linuxos_ubuntu.handle_k8s_operations(params)
|
||||
|
||||
def node_label_opt(params):
|
||||
async def node_label_opt(params):
|
||||
"""
|
||||
要设置节点 worker-node-1 上的标签 app,可以使用以下命令:
|
||||
kubectl label nodes worker-node-1 app=app,注意标签键和值之间有一个等号 (=),表示设置该标签。
|
||||
@ -106,7 +106,7 @@ def node_label_opt(params):
|
||||
else:
|
||||
raise f"{worker_node} 解绑标签 {label} 失败,请检查集群节点状态或标签是否已绑定?"
|
||||
|
||||
def unset_node_label(params):
|
||||
async def unset_node_label(params):
|
||||
"""
|
||||
要取消节点 worker-node-1 上的标签 app,可以使用以下命令:
|
||||
kubectl label nodes worker-node-1 app-,注意标签键后面有一个短横线 (-),表示取消该标签。
|
||||
@ -121,7 +121,7 @@ def unset_node_label(params):
|
||||
label = params.get("label")
|
||||
|
||||
|
||||
def get_cluster_nodes_by_server(params):
|
||||
async def get_cluster_nodes_by_server(params):
|
||||
host = params.get("host")
|
||||
port = int(params.get("port"))
|
||||
username = params.get("user")
|
||||
@ -137,7 +137,7 @@ def get_cluster_nodes_by_server(params):
|
||||
# debug(f'集群 {host=} 所有节点信息如下{results=} => 转换后:\n{parse_k8s_nodes_result=}')
|
||||
return parse_k8s_nodes_result
|
||||
|
||||
def get_cluster_pods_by_kubeconfig(params):
|
||||
async def get_cluster_pods_by_kubeconfig(params):
|
||||
"""
|
||||
通过调用方传递来的kubeconfig信息
|
||||
获取集群中所有资源实例(Pod)信息详情
|
||||
@ -145,7 +145,7 @@ def get_cluster_pods_by_kubeconfig(params):
|
||||
kubeconfig = params.get("kubeconfig")
|
||||
return k8s_utils_public.get_pod_info(kubeconfig)
|
||||
|
||||
def determine_accommodat_by_kubeconfig(params):
|
||||
async def determine_accommodat_by_kubeconfig(params):
|
||||
"""
|
||||
通过调用方传递来的kubeconfig信息
|
||||
判断集群中可部署哪些部件组合
|
||||
@ -157,7 +157,7 @@ def determine_accommodat_by_kubeconfig(params):
|
||||
# debug(f'=====kubeconfig: {kubeconfig}, resources: {resources}')
|
||||
return k8s_utils_public.determine_accommodat(kubeconfig, resources)
|
||||
|
||||
def get_cluster_nodes_by_kubeconfig(params):
|
||||
async def get_cluster_nodes_by_kubeconfig(params):
|
||||
"""
|
||||
通过调用方传递来的kubeconfig信息
|
||||
获取集群中所有节点信息详情
|
||||
@ -165,7 +165,7 @@ def get_cluster_nodes_by_kubeconfig(params):
|
||||
kubeconfig = params.get("kubeconfig")
|
||||
return k8s_utils_public.get_node_info(kubeconfig)
|
||||
|
||||
def get_cluster_pods_by_server(params):
|
||||
async def get_cluster_pods_by_server(params):
|
||||
host = params.get("host")
|
||||
port = int(params.get("port"))
|
||||
username = params.get("user")
|
||||
@ -182,7 +182,7 @@ def get_cluster_pods_by_server(params):
|
||||
# debug(f'集群 {host=} 所有Pod信息如下{results=} => 转换后:\n{parse_k8s_pods_result=}')
|
||||
return parse_k8s_pods_result
|
||||
|
||||
def new_cluster_install(params):
|
||||
async def new_cluster_install(params):
|
||||
# 随后填充远程操控k8s主逻辑
|
||||
"""
|
||||
用于接收cpcc端传递过来的k8s安装指令参数, 进行远程sshx调用操作内网机器进行集群节点的安装
|
||||
@ -206,6 +206,7 @@ def new_cluster_install(params):
|
||||
"files/storage_class.yaml":"/opt/storage_class.yaml",
|
||||
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
||||
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
||||
"files/config.toml": "/opt/config.toml",
|
||||
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
||||
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
||||
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
||||
@ -275,7 +276,7 @@ def new_cluster_install(params):
|
||||
|
||||
return results
|
||||
|
||||
def get_multiple_cluster_pod():
|
||||
async def get_multiple_cluster_pod():
|
||||
"""
|
||||
获取 kubeconfig 中所有集群的 Pod 信息(JSON 格式)
|
||||
|
||||
@ -326,7 +327,7 @@ def get_multiple_cluster_pod():
|
||||
return all_clusters_pods
|
||||
|
||||
|
||||
def get_multiple_cluster():
|
||||
async def get_multiple_cluster():
|
||||
"""
|
||||
获取所有集群的完整信息,包括用户证书、RBAC状态、服务账号颁发者等。
|
||||
|
||||
@ -445,7 +446,7 @@ def get_multiple_cluster():
|
||||
}, indent=4)
|
||||
|
||||
|
||||
def process_kubeconfigs():
|
||||
async def process_kubeconfigs():
|
||||
"""
|
||||
检测当前目录下的 kubestage 文件夹中的 kubeconfig 格式文件,
|
||||
计算每个文件的大写 MD5 值,将其改名成对应的 MD5 值,
|
||||
|
||||
24
app/pcapi.py
24
app/pcapi.py
@ -70,18 +70,18 @@ def init_func():
|
||||
# g.delete_ldap_user=delete_ldap_user
|
||||
|
||||
### k8s多集群相关
|
||||
g.new_cluster_install = awaitify(new_cluster_install)
|
||||
g.get_multiple_cluster = awaitify(get_multiple_cluster)
|
||||
g.get_multiple_cluster_pod = awaitify(get_multiple_cluster_pod)
|
||||
g.get_cluster_nodes_by_server = awaitify(get_cluster_nodes_by_server)
|
||||
g.get_cluster_pods_by_server = awaitify(get_cluster_pods_by_server)
|
||||
g.delete_cluster_node = awaitify(delete_cluster_node)
|
||||
g.node_state_switch = awaitify(node_state_switch)
|
||||
g.yaml_apply_delete = awaitify(yaml_apply_delete)
|
||||
g.get_cluster_nodes_by_kubeconfig = awaitify(get_cluster_nodes_by_kubeconfig)
|
||||
g.determine_accommodat_by_kubeconfig = awaitify(determine_accommodat_by_kubeconfig)
|
||||
g.get_cluster_pods_by_kubeconfig = awaitify(get_cluster_pods_by_kubeconfig)
|
||||
g.node_label_opt = awaitify(node_label_opt)
|
||||
g.new_cluster_install = new_cluster_install
|
||||
g.get_multiple_cluster = get_multiple_cluster
|
||||
g.get_multiple_cluster_pod = get_multiple_cluster_pod
|
||||
g.get_cluster_nodes_by_server = get_cluster_nodes_by_server
|
||||
g.get_cluster_pods_by_server = get_cluster_pods_by_server
|
||||
g.delete_cluster_node = delete_cluster_node
|
||||
g.node_state_switch = node_state_switch
|
||||
g.yaml_apply_delete = yaml_apply_delete
|
||||
g.get_cluster_nodes_by_kubeconfig = get_cluster_nodes_by_kubeconfig
|
||||
g.determine_accommodat_by_kubeconfig = determine_accommodat_by_kubeconfig
|
||||
g.get_cluster_pods_by_kubeconfig = get_cluster_pods_by_kubeconfig
|
||||
g.node_label_opt = node_label_opt
|
||||
|
||||
g.get_storage_json=get_storage_json
|
||||
g.result_dict={
|
||||
|
||||
297
files/config.toml
Normal file
297
files/config.toml
Normal file
@ -0,0 +1,297 @@
|
||||
disabled_plugins = []
|
||||
imports = []
|
||||
oom_score = 0
|
||||
plugin_dir = ""
|
||||
required_plugins = []
|
||||
root = "/var/lib/containerd"
|
||||
state = "/run/containerd"
|
||||
temp = ""
|
||||
version = 2
|
||||
|
||||
[cgroup]
|
||||
path = ""
|
||||
|
||||
[debug]
|
||||
address = ""
|
||||
format = ""
|
||||
gid = 0
|
||||
level = ""
|
||||
uid = 0
|
||||
|
||||
[grpc]
|
||||
address = "/run/containerd/containerd.sock"
|
||||
gid = 0
|
||||
max_recv_message_size = 16777216
|
||||
max_send_message_size = 16777216
|
||||
tcp_address = ""
|
||||
tcp_tls_ca = ""
|
||||
tcp_tls_cert = ""
|
||||
tcp_tls_key = ""
|
||||
uid = 0
|
||||
|
||||
[metrics]
|
||||
address = ""
|
||||
grpc_histogram = false
|
||||
|
||||
[plugins]
|
||||
|
||||
[plugins."io.containerd.gc.v1.scheduler"]
|
||||
deletion_threshold = 0
|
||||
mutation_threshold = 100
|
||||
pause_threshold = 0.02
|
||||
schedule_delay = "0s"
|
||||
startup_delay = "100ms"
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri"]
|
||||
cdi_spec_dirs = ["/etc/cdi", "/var/run/cdi"]
|
||||
device_ownership_from_security_context = false
|
||||
disable_apparmor = false
|
||||
disable_cgroup = false
|
||||
disable_hugetlb_controller = true
|
||||
disable_proc_mount = false
|
||||
disable_tcp_service = true
|
||||
drain_exec_sync_io_timeout = "0s"
|
||||
enable_cdi = false
|
||||
enable_selinux = false
|
||||
enable_tls_streaming = false
|
||||
enable_unprivileged_icmp = false
|
||||
enable_unprivileged_ports = false
|
||||
ignore_deprecation_warnings = []
|
||||
ignore_image_defined_volumes = false
|
||||
image_pull_progress_timeout = "5m0s"
|
||||
image_pull_with_sync_fs = false
|
||||
max_concurrent_downloads = 3
|
||||
max_container_log_line_size = 16384
|
||||
netns_mounts_under_state_dir = false
|
||||
restrict_oom_score_adj = false
|
||||
sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"
|
||||
selinux_category_range = 1024
|
||||
stats_collect_period = 10
|
||||
stream_idle_timeout = "4h0m0s"
|
||||
stream_server_address = "127.0.0.1"
|
||||
stream_server_port = "0"
|
||||
systemd_cgroup = false
|
||||
tolerate_missing_hugetlb_controller = true
|
||||
unset_seccomp_profile = ""
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".cni]
|
||||
bin_dir = "/opt/cni/bin"
|
||||
conf_dir = "/etc/cni/net.d"
|
||||
conf_template = ""
|
||||
ip_pref = ""
|
||||
max_conf_num = 1
|
||||
setup_serially = false
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
default_runtime_name = "nvidia"
|
||||
disable_snapshot_annotations = true
|
||||
discard_unpacked_layers = false
|
||||
ignore_blockio_not_enabled_errors = false
|
||||
ignore_rdt_not_enabled_errors = false
|
||||
no_pivot = false
|
||||
snapshotter = "overlayfs"
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
|
||||
base_runtime_spec = ""
|
||||
cni_conf_dir = ""
|
||||
cni_max_conf_num = 0
|
||||
container_annotations = []
|
||||
pod_annotations = []
|
||||
privileged_without_host_devices = false
|
||||
privileged_without_host_devices_all_devices_allowed = false
|
||||
runtime_engine = ""
|
||||
runtime_path = ""
|
||||
runtime_root = ""
|
||||
runtime_type = ""
|
||||
sandbox_mode = ""
|
||||
snapshotter = ""
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
privileged_without_host_devices = false
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
|
||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
|
||||
base_runtime_spec = ""
|
||||
cni_conf_dir = ""
|
||||
cni_max_conf_num = 0
|
||||
container_annotations = []
|
||||
pod_annotations = []
|
||||
privileged_without_host_devices = false
|
||||
privileged_without_host_devices_all_devices_allowed = false
|
||||
runtime_engine = ""
|
||||
runtime_path = ""
|
||||
runtime_root = ""
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
sandbox_mode = "podsandbox"
|
||||
snapshotter = ""
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
|
||||
BinaryName = ""
|
||||
CriuImagePath = ""
|
||||
CriuPath = ""
|
||||
CriuWorkPath = ""
|
||||
IoGid = 0
|
||||
IoUid = 0
|
||||
NoNewKeyring = false
|
||||
NoPivotRoot = false
|
||||
Root = ""
|
||||
ShimCgroup = ""
|
||||
SystemdCgroup = true
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
|
||||
base_runtime_spec = ""
|
||||
cni_conf_dir = ""
|
||||
cni_max_conf_num = 0
|
||||
container_annotations = []
|
||||
pod_annotations = []
|
||||
privileged_without_host_devices = false
|
||||
privileged_without_host_devices_all_devices_allowed = false
|
||||
runtime_engine = ""
|
||||
runtime_path = ""
|
||||
runtime_root = ""
|
||||
runtime_type = ""
|
||||
sandbox_mode = ""
|
||||
snapshotter = ""
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".image_decryption]
|
||||
key_model = "node"
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".registry]
|
||||
config_path = ""
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".registry.auths]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".registry.configs]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".registry.headers]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
|
||||
tls_cert_file = ""
|
||||
tls_key_file = ""
|
||||
|
||||
[plugins."io.containerd.internal.v1.opt"]
|
||||
path = "/opt/containerd"
|
||||
|
||||
[plugins."io.containerd.internal.v1.restart"]
|
||||
interval = "10s"
|
||||
|
||||
[plugins."io.containerd.internal.v1.tracing"]
|
||||
|
||||
[plugins."io.containerd.metadata.v1.bolt"]
|
||||
content_sharing_policy = "shared"
|
||||
|
||||
[plugins."io.containerd.monitor.v1.cgroups"]
|
||||
no_prometheus = false
|
||||
|
||||
[plugins."io.containerd.nri.v1.nri"]
|
||||
disable = true
|
||||
disable_connections = false
|
||||
plugin_config_path = "/etc/nri/conf.d"
|
||||
plugin_path = "/opt/nri/plugins"
|
||||
plugin_registration_timeout = "5s"
|
||||
plugin_request_timeout = "2s"
|
||||
socket_path = "/var/run/nri/nri.sock"
|
||||
|
||||
[plugins."io.containerd.runtime.v1.linux"]
|
||||
no_shim = false
|
||||
runtime = "runc"
|
||||
runtime_root = ""
|
||||
shim = "containerd-shim"
|
||||
shim_debug = false
|
||||
|
||||
[plugins."io.containerd.runtime.v2.task"]
|
||||
platforms = ["linux/amd64"]
|
||||
sched_core = false
|
||||
|
||||
[plugins."io.containerd.service.v1.diff-service"]
|
||||
default = ["walking"]
|
||||
|
||||
[plugins."io.containerd.service.v1.tasks-service"]
|
||||
blockio_config_file = ""
|
||||
rdt_config_file = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.aufs"]
|
||||
root_path = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.blockfile"]
|
||||
fs_type = ""
|
||||
mount_options = []
|
||||
root_path = ""
|
||||
scratch_file = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.btrfs"]
|
||||
root_path = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.devmapper"]
|
||||
async_remove = false
|
||||
base_image_size = ""
|
||||
discard_blocks = false
|
||||
fs_options = ""
|
||||
fs_type = ""
|
||||
pool_name = ""
|
||||
root_path = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.native"]
|
||||
root_path = ""
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.overlayfs"]
|
||||
mount_options = []
|
||||
root_path = ""
|
||||
sync_remove = false
|
||||
upperdir_label = false
|
||||
|
||||
[plugins."io.containerd.snapshotter.v1.zfs"]
|
||||
root_path = ""
|
||||
|
||||
[plugins."io.containerd.tracing.processor.v1.otlp"]
|
||||
|
||||
[plugins."io.containerd.transfer.v1.local"]
|
||||
config_path = ""
|
||||
max_concurrent_downloads = 3
|
||||
max_concurrent_uploaded_layers = 3
|
||||
|
||||
[[plugins."io.containerd.transfer.v1.local".unpack_config]]
|
||||
differ = ""
|
||||
platform = "linux/amd64"
|
||||
snapshotter = "overlayfs"
|
||||
|
||||
[proxy_plugins]
|
||||
|
||||
[stream_processors]
|
||||
|
||||
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
|
||||
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
|
||||
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
|
||||
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
|
||||
path = "ctd-decoder"
|
||||
returns = "application/vnd.oci.image.layer.v1.tar"
|
||||
|
||||
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
|
||||
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
|
||||
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
|
||||
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
|
||||
path = "ctd-decoder"
|
||||
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
|
||||
|
||||
[timeouts]
|
||||
"io.containerd.timeout.bolt.open" = "0s"
|
||||
"io.containerd.timeout.metrics.shimstats" = "2s"
|
||||
"io.containerd.timeout.shim.cleanup" = "5s"
|
||||
"io.containerd.timeout.shim.load" = "5s"
|
||||
"io.containerd.timeout.shim.shutdown" = "3s"
|
||||
"io.containerd.timeout.task.state" = "2s"
|
||||
|
||||
[ttrpc]
|
||||
address = ""
|
||||
gid = 0
|
||||
uid = 0
|
||||
@ -315,36 +315,45 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
|
||||
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
|
||||
|
||||
# 检查 .deb 文件是否存在
|
||||
if [ ! "$(ls /opt/*.deb 2>/dev/null | wc -l)" -ge 1 ]; then
|
||||
DEB_FILES=(/opt/*_amd64.deb)
|
||||
if [ ! -e "${DEB_FILES[0]}" ]; then
|
||||
log_error "/opt/ 下没有 .deb 文件"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 安装 .deb 包
|
||||
for deb in /opt/*_amd64.deb; do
|
||||
dpkg -i "$deb" || log_error "安装 $deb 失败"
|
||||
for deb in "${DEB_FILES[@]}"; do
|
||||
dpkg -i "$deb" || {
|
||||
log_error "安装 $deb 失败"
|
||||
exit 1
|
||||
}
|
||||
done
|
||||
|
||||
# 配置 containerd
|
||||
CONTAINERD_CONFIG="/etc/containerd/config.toml"
|
||||
log_info "正在更新 $CONTAINERD_CONFIG 配置..."
|
||||
|
||||
# 1. 添加 nvidia 运行时配置(插入到 runtimes 块内部)
|
||||
if ! grep -qF '[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]' "$CONTAINERD_CONFIG"; then
|
||||
# 在 runtimes 块下插入 nvidia 配置(保持格式缩进)
|
||||
sed -i '/\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes\]/a \
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
|
||||
privileged_without_host_devices = false
|
||||
runtime_type = "io.containerd.runc.v2"
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
|
||||
BinaryName = "/usr/bin/nvidia-container-runtime"
|
||||
' "$CONTAINERD_CONFIG"
|
||||
fi
|
||||
# 1. 添加 nvidia 运行时配置到 runtimes 块内部
|
||||
# 添加 nvidia runtime 配置到 runtimes 块下
|
||||
NVIDIA_SECTION='plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes\.nvidia'
|
||||
# if ! grep -qF "[${NVIDIA_SECTION}]" "$CONTAINERD_CONFIG"; then
|
||||
# sudo sed -i '/^
|
||||
# $$
|
||||
# plugins\."io\.containerd\.grpc\.v1\.cri"\.containerd\.runtimes
|
||||
# $$
|
||||
# $/a \
|
||||
# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]\n\
|
||||
# privileged_without_host_devices = false\n\
|
||||
# runtime_type = "io.containerd.runc.v2"\n\
|
||||
# [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]\n\
|
||||
# BinaryName = "/usr/bin/nvidia-container-runtime"' /etc/containerd/config.toml
|
||||
# fi
|
||||
|
||||
# 2. 修改默认运行时为 nvidia(正确匹配配置项)
|
||||
if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then
|
||||
sed -i '/default_runtime_name = "runc"/s/"runc"/"nvidia"/' "$CONTAINERD_CONFIG"
|
||||
fi
|
||||
# # 2. 修改默认运行时为 nvidia
|
||||
# if ! grep -qF 'default_runtime_name = "nvidia"' "$CONTAINERD_CONFIG"; then
|
||||
# sudo sed -i 's/default_runtime_name = "runc"/default_runtime_name = "nvidia"/' "$CONTAINERD_CONFIG"
|
||||
# fi
|
||||
cp -v /opt/config.toml /etc/containerd/config.toml || log_error "直接复制containerd配置文件失败"
|
||||
|
||||
# 3. 重启 containerd 并检查状态
|
||||
log_info "重启 containerd 服务..."
|
||||
@ -359,6 +368,7 @@ if lspci | grep -i nvidia > /dev/null 2>&1; then
|
||||
log_info "配置 CUDA 环境变量..."
|
||||
grep -qxF 'export PATH=/usr/local/cuda/bin:$PATH' ~/.bashrc || echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
|
||||
grep -qxF 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' ~/.bashrc || echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
|
||||
|
||||
# 应用环境变量(非交互式shell提示手动执行)
|
||||
if [[ "$-" == *i* ]]; then
|
||||
source ~/.bashrc
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user