增强友好性
This commit is contained in:
parent
43ada048c4
commit
58383646e2
@ -296,6 +296,7 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
|
||||
"kind": "StatefulSet",
|
||||
"metadata": {"name": statefulset_name, "namespace": namespace_name},
|
||||
"spec": {
|
||||
"runtimeClassName": "nvidia", # 指定使用 nvidia runtime
|
||||
"replicas": replicas,
|
||||
"selector": {"matchLabels": source_selflabel},
|
||||
"serviceName": statefulset_name,
|
||||
|
||||
@ -207,6 +207,7 @@ async def new_cluster_install(params):
|
||||
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
|
||||
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
|
||||
"files/config.toml": "/opt/config.toml",
|
||||
"files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
|
||||
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
|
||||
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
|
||||
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",
|
||||
|
||||
@ -83,7 +83,7 @@ version = 2
|
||||
setup_serially = false
|
||||
|
||||
[plugins."io.containerd.grpc.v1.cri".containerd]
|
||||
default_runtime_name = "nvidia"
|
||||
default_runtime_name = "runc"
|
||||
disable_snapshot_annotations = true
|
||||
discard_unpacked_layers = false
|
||||
ignore_blockio_not_enabled_errors = false
|
||||
|
||||
@ -28,13 +28,14 @@ spec:
|
||||
labels:
|
||||
name: nvidia-device-plugin-ds
|
||||
spec:
|
||||
runtimeClassName: nvidia # 指定使用 nvidia runtime
|
||||
tolerations:
|
||||
- key: nvidia.com/gpu
|
||||
operator: Exists
|
||||
effect: NoSchedule
|
||||
priorityClassName: "system-node-critical"
|
||||
containers:
|
||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
|
||||
- image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
|
||||
name: nvidia-device-plugin-ctr
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
|
||||
6
files/runtimeclass-nvidia.yaml
Normal file
6
files/runtimeclass-nvidia.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
# runtimeclass-nvidia.yaml
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
name: nvidia # 与 Pod 中引用的名称一致
|
||||
handler: nvidia # 对应 containerd 配置中的 runtime 名称
|
||||
@ -544,10 +544,12 @@ if [ "$1" == "master" ]; then
|
||||
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
|
||||
log_info "正在安装nfs-client-provisioner插件"
|
||||
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
|
||||
# log_info "正在安装MetricsServer插件"
|
||||
# kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
|
||||
# log_info "正在安装GPU模式必要插件"
|
||||
# kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
|
||||
log_info "正在安装MetricsServer插件"
|
||||
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
|
||||
log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
|
||||
kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
|
||||
log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
|
||||
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
|
||||
exit 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user