Compare commits

..

No commits in common. "370edc473cef844cb289055f77cbc300c4b3af49" and "9a4d10e93c1eb712659db9dd9d0e51be9435a25b" have entirely different histories.

6 changed files with 8 additions and 19 deletions

View File

@ -296,7 +296,6 @@ def create_or_update_statefulset(apps_v1, v1, namespace_name, statefulset_name,
"kind": "StatefulSet",
"metadata": {"name": statefulset_name, "namespace": namespace_name},
"spec": {
"runtimeClassName": "nvidia", # 指定使用 nvidia runtime"
"replicas": replicas,
"selector": {"matchLabels": source_selflabel},
"serviceName": statefulset_name,

View File

@ -207,7 +207,6 @@ async def new_cluster_install(params):
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
"files/config.toml": "/opt/config.toml",
"files/runtimeclass-nvidia.yaml": "/opt/runtimeclass-nvidia.yaml",
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
"files/libnvidia-container-tools_1.17.8-1_amd64.deb": "/opt/libnvidia-container-tools_1.17.8-1_amd64.deb",
"files/libnvidia-container1_1.17.8-1_amd64.deb": "/opt/libnvidia-container1_1.17.8-1_amd64.deb",

View File

@ -83,7 +83,7 @@ version = 2
setup_serially = false
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
default_runtime_name = "nvidia"
disable_snapshot_annotations = true
discard_unpacked_layers = false
ignore_blockio_not_enabled_errors = false

View File

@ -28,14 +28,13 @@ spec:
labels:
name: nvidia-device-plugin-ds
spec:
runtimeClassName: nvidia # 指定使用 nvidia runtime
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
priorityClassName: "system-node-critical"
containers:
- image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
- image: nvcr.io/nvidia/k8s-device-plugin:v0.13.0
name: nvidia-device-plugin-ctr
securityContext:
allowPrivilegeEscalation: false

View File

@ -1,6 +0,0 @@
# runtimeclass-nvidia.yaml
# Declares the RuntimeClass that Pods select with `runtimeClassName: nvidia`.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia  # must match the runtimeClassName referenced by Pods
handler: nvidia  # must match the runtime name in the containerd config

View File

@ -540,16 +540,14 @@ if [ "$1" == "master" ]; then
# 安装网络插件
log_info "正在安装网络插件(flannel)"
kubectl apply -f /opt/kube-flannel.yml || log_error "本地安装flannel网络插件失败"
# log_info "正在安装Ingress-nginx-controller插件"
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
log_info "正在安装nfs-client-provisioner插件"
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
log_info "正在安装MetricsServer插件"
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
log_info "正在安装GPU模式必要插件runtimeclass-nvidia.yaml"
kubectl apply -f /opt/runtimeclass-nvidia.yaml || log_error "本地安装GPU模式必要插件runtimeclass-nvidia.yaml失败"
log_info "正在安装GPU模式必要插件nvidia-device-plugin.yml"
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
# log_info "正在安装Ingress-nginx-controller插件"
# kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
# log_info "正在安装GPU模式必要插件"
# kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
log_info "正在安装nfs-client-provisioner插件"
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
if [ $? -ne 0 ]; then
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
exit 1