Compare commits


15 Commits
dev1 ... main

Author SHA1 Message Date
c661aea553 bugfix 2025-11-19 18:02:22 +08:00
ysh
8084dbabd4 Merge pull request 'GPU production-ready version' (#19) from dev1 into main
Reviewed-on: #19
2025-07-18 21:32:16 +08:00
ysh
83b869fcf3 Merge pull request 'GPU production-ready version' (#18) from dev1 into main
Reviewed-on: #18
2025-07-18 21:24:26 +08:00
ysh
370edc473c Merge pull request 'dev1' (#17) from dev1 into main
Reviewed-on: #17
2025-07-18 18:15:53 +08:00
ysh
9a4d10e93c Merge pull request 'Improve user-friendliness' (#16) from dev1 into main
Reviewed-on: #16
2025-07-18 17:14:52 +08:00
ysh
10c1f0c268 Merge pull request 'Test async' (#15) from dev1 into main
Reviewed-on: #15
2025-07-18 15:06:12 +08:00
ysh
e317df2fac Merge pull request 'Test async' (#14) from dev1 into main
Reviewed-on: #14
2025-07-18 15:04:15 +08:00
ysh
970c68d201 Merge pull request 'Install async' (#13) from dev1 into main
Reviewed-on: #13
2025-07-18 14:59:24 +08:00
ysh
56900d5b5a Merge pull request 'Fix issues' (#12) from dev1 into main
Reviewed-on: #12
2025-07-18 14:13:12 +08:00
ysh
84e524365c Merge pull request 'Fix issues' (#11) from dev1 into main
Reviewed-on: #11
2025-07-18 11:40:17 +08:00
ysh
7a4e301092 Merge pull request 'dev1' (#10) from dev1 into main
Reviewed-on: #10
2025-07-18 11:16:42 +08:00
ysh
8b92a9b1cf Merge pull request 'Fix issues' (#9) from dev1 into main
Reviewed-on: #9
2025-07-17 18:18:48 +08:00
ysh
567b4ff367 Merge pull request 'Fix issues' (#8) from dev1 into main
Reviewed-on: #8
2025-07-17 17:55:03 +08:00
ysh
e81bbfdaa9 Merge pull request 'Compute-center API asyncification, part 2' (#7) from dev1 into main
Reviewed-on: #7
2025-07-17 14:15:44 +08:00
ysh
3513f1ddc1 Merge pull request 'Compute-center API documentation' (#6) from dev1 into main
Reviewed-on: #6
2025-07-17 13:57:53 +08:00
2 changed files with 348 additions and 0 deletions


@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import json
import csv
import re
import subprocess
from kubernetes import client, config
import pynvml
# ----------------------------
# 1. Initialize Kubernetes
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Error loading kubeconfig: {e}")
    sys.exit(1)
v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()
# ----------------------------
# 2. Initialize NVIDIA
# ----------------------------
try:
    pynvml.nvmlInit()
    gpu_count = pynvml.nvmlDeviceGetCount()
except Exception as e:
    print(f"No NVIDIA GPU found or driver not installed: {e}")
    gpu_count = 0
# ----------------------------
# 3. Node resources
# ----------------------------
nodes_info = []
gpu_nodes = {}
for node in v1.list_node().items:
    name = node.metadata.name
    # Report "Ready" only when the Ready condition is actually True
    # (picking the first True condition could return e.g. MemoryPressure)
    status = "Ready" if any(c.type == "Ready" and c.status == "True" for c in node.status.conditions) else "NotReady"
    cpu = node.status.allocatable.get('cpu', '0')
    memory = node.status.allocatable.get('memory', '0')
    gpu = node.status.allocatable.get('nvidia.com/gpu', '0')
    nodes_info.append({'name': name, 'status': status, 'cpu_allocatable': cpu, 'memory_allocatable': memory, 'gpu_allocatable': gpu})
    if gpu != '0':
        gpu_nodes[name] = int(gpu)
# ----------------------------
# 4. Pod resources
# ----------------------------
pods_info = []
pod_pid_map = {}  # key=PID, value=(namespace, pod_name)
for pod in v1.list_pod_for_all_namespaces().items:
    pod_name = pod.metadata.name
    namespace = pod.metadata.namespace
    node_name = pod.spec.node_name
    for c in pod.spec.containers:
        cpu_req = c.resources.requests.get('cpu', '0') if c.resources.requests else '0'
        mem_req = c.resources.requests.get('memory', '0') if c.resources.requests else '0'
        gpu_req = c.resources.requests.get('nvidia.com/gpu', '0') if c.resources.requests else '0'
        pods_info.append({
            'namespace': namespace,
            'pod_name': pod_name,
            'node': node_name,
            'container': c.name,
            'cpu_request': cpu_req,
            'memory_request': mem_req,
            'gpu_request': gpu_req
        })
    # Build the container PID mapping (offline environments need nsenter or crictl).
    # Note: crictl inspectp takes pod sandbox IDs, which come from `crictl pods`,
    # not the container IDs returned by `crictl ps`.
    try:
        cmd = f"crictl inspectp $(crictl pods --name {pod_name} -q)"
        out = subprocess.check_output(cmd, shell=True).decode()
        # Simplified handling: just pull any "pid" fields out of the inspect JSON
        pids = re.findall(r'"pid":\s*(\d+)', out)
        for pid in pids:
            pod_pid_map[int(pid)] = (namespace, pod_name)
    except Exception:
        continue
# ----------------------------
# 5. KubeVirt VM resources
# ----------------------------
vms_info = []
vm_pid_map = {}  # key=PID, value=(namespace, vm_name)
namespaces = [ns.metadata.name for ns in v1.list_namespace().items]
for ns in namespaces:
    try:
        vms = crd_api.list_namespaced_custom_object(
            group="kubevirt.io",
            version="v1",
            namespace=ns,
            plural="virtualmachines"
        )
        for vm in vms.get('items', []):
            name = vm['metadata']['name']
            spec = vm.get('spec', {}).get('template', {}).get('spec', {})
            domain = spec.get('domain', {})
            resources = domain.get('resources', {}).get('requests', {})
            cpu = resources.get('cpu', '0')
            memory = resources.get('memory', '0')
            gpus = domain.get('devices', {}).get('gpus', [])
            vms_info.append({
                'namespace': ns,
                'vm_name': name,
                'cpu_request': cpu,
                'memory_request': memory,
                'gpus': gpus
            })
            # Get the PID of the VM's virt-launcher Pod
            try:
                vl_pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={name}")
                for vl_pod in vl_pods.items:
                    cmd = f"crictl inspectp $(crictl pods --name {vl_pod.metadata.name} -q)"
                    out = subprocess.check_output(cmd, shell=True).decode()
                    pids = re.findall(r'"pid":\s*(\d+)', out)
                    for pid in pids:
                        vm_pid_map[int(pid)] = (ns, name)
            except Exception:
                continue
    except client.exceptions.ApiException:
        continue
# ----------------------------
# 6. Live GPU usage + PID mapping
# ----------------------------
gpu_info = []
if gpu_count > 0:
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        # Newer pynvml returns str, older versions return bytes; handle both
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):
            name = name.decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        processes = []
        try:
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                pid = proc.pid
                # usedGpuMemory can be None when the value is unavailable
                used_mem = proc.usedGpuMemory // 1024**2 if proc.usedGpuMemory else 0
                owner = pod_pid_map.get(pid) or vm_pid_map.get(pid) or ("unknown", "unknown")
                processes.append({
                    'pid': pid,
                    'used_memory_MB': used_mem,
                    'owner_type': 'pod' if pid in pod_pid_map else 'vm' if pid in vm_pid_map else 'unknown',
                    'namespace': owner[0],
                    'name': owner[1]
                })
        except pynvml.NVMLError:
            pass
        gpu_info.append({
            'gpu_index': i,
            'name': name,
            'memory_total_MB': mem_info.total // 1024**2,
            'memory_used_MB': mem_info.used // 1024**2,
            'gpu_util_percent': util.gpu,
            'mem_util_percent': util.memory,
            'processes': processes
        })
# ----------------------------
# 7. Write JSON and CSV output
# ----------------------------
output = {'nodes': nodes_info, 'pods': pods_info, 'vms': vms_info, 'gpus': gpu_info}
with open("cluster_resources.json", "w") as f:
    json.dump(output, f, indent=2)
# CSV output helper
def write_csv(filename, data, fieldnames):
    with open(filename, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
write_csv("nodes.csv", nodes_info, ['name','status','cpu_allocatable','memory_allocatable','gpu_allocatable'])
write_csv("pods.csv", pods_info, ['namespace','pod_name','node','container','cpu_request','memory_request','gpu_request'])
write_csv("vms.csv", vms_info, ['namespace','vm_name','cpu_request','memory_request','gpus'])
write_csv("gpus.csv", gpu_info, ['gpu_index','name','memory_total_MB','memory_used_MB','gpu_util_percent','mem_util_percent','processes'])
print("Cluster resource report generated with GPU PID → Pod/VM mapping")
print(" - cluster_resources.json")
print(" - nodes.csv, pods.csv, vms.csv, gpus.csv")

pcapi/vm_buckup_restore.py Normal file

@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Before backing up, shut the VM down or otherwise ensure data consistency.
The VM should be powered off before restoring.
Disks are processed in a loop, so multi-disk VMs are covered; on restore they are mapped to disk0, disk1, ...
Also works for GPU VMs: once the PVCs are restored, GPUs can be reassigned.
"""
import os
import sys
import argparse
import subprocess
import yaml
from kubernetes import client, config
# ----------------------------
# Argument parsing
# ----------------------------
parser = argparse.ArgumentParser(description="KubeVirt VM Backup and Restore (NFS, Multi-disk, GPU)")
parser.add_argument("action", choices=["backup", "restore"], help="operation type")
parser.add_argument("--vm", required=True, help="VM name")
parser.add_argument("--namespace", default="default", help="namespace the VM lives in")
parser.add_argument("--nfs-path", required=True, help="shared NFS path, e.g. /mnt/nfs-backup")
parser.add_argument("--restore-vm", help="new VM name for restore; if omitted, the original VM is overwritten")
args = parser.parse_args()
vm_name = args.vm
namespace = args.namespace
nfs_path = args.nfs_path
restore_vm_name = args.restore_vm or vm_name
# ----------------------------
# Initialize the K8s API
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Failed to load kubeconfig: {e}")
    sys.exit(1)
v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()
# ----------------------------
# Find the virt-launcher Pod
# ----------------------------
def get_virt_launcher_pod(vm, ns):
    # Assumes virt-launcher Pods carry a vm=<name> label; recent KubeVirt
    # releases use vm.kubevirt.io/name=<name> instead
    pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={vm}")
    if not pods.items:
        print(f"virt-launcher Pod for VM {vm} not found")
        sys.exit(1)
    return pods.items[0].metadata.name
# ----------------------------
# Get the VM spec (including nodeSelector and GPU config)
# ----------------------------
def get_vm_spec(vm, ns):
    vm_obj = crd_api.get_namespaced_custom_object(
        group="kubevirt.io",
        version="v1",
        namespace=ns,
        plural="virtualmachines",
        name=vm
    )
    return vm_obj.get("spec", {}).get("template", {}).get("spec", {})
# ----------------------------
# Back up a VM
# ----------------------------
def backup_vm():
    pod_name = get_virt_launcher_pod(vm_name, namespace)
    backup_dir = os.path.join(nfs_path, vm_name)
    os.makedirs(backup_dir, exist_ok=True)
    # List the disk paths inside the virt-launcher Pod
    cmd_list_disks = f"kubectl exec -n {namespace} {pod_name} -- ls /var/run/kubevirt-private/vmi-disks/"
    disks = subprocess.check_output(cmd_list_disks, shell=True).decode().splitlines()
    for disk in disks:
        src_path = f"/var/run/kubevirt-private/vmi-disks/{disk}"
        dest_path = os.path.join(backup_dir, f"{disk}.qcow2")
        print(f"[backup] {disk} -> {dest_path}")
        # Convert to a temporary QCOW2 file inside the Pod
        cmd_convert = f"kubectl exec -n {namespace} {pod_name} -- qemu-img convert -O qcow2 {src_path} /tmp/{disk}.qcow2"
        subprocess.run(cmd_convert, shell=True, check=True)
        # Copy it out to NFS
        cmd_cp = f"kubectl cp {namespace}/{pod_name}:/tmp/{disk}.qcow2 {dest_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
        # Clean up the temporary file
        subprocess.run(f"kubectl exec -n {namespace} {pod_name} -- rm /tmp/{disk}.qcow2", shell=True)
    print(f"VM {vm_name} backup complete; stored in {backup_dir}")
# ----------------------------
# Restore a VM
# ----------------------------
def restore_vm():
    backup_dir = os.path.join(nfs_path, vm_name)
    if not os.path.exists(backup_dir):
        print(f"Backup directory {backup_dir} does not exist")
        sys.exit(1)
    # Fetch the original VM spec
    original_spec = get_vm_spec(vm_name, namespace)
    # Delete the original VM (when overwriting)
    if restore_vm_name == vm_name:
        print(f"[restore] Deleting original VM {vm_name}; make sure it is powered off")
        subprocess.run(f"kubectl delete vm {vm_name} -n {namespace}", shell=True, check=True)
    # Create PVCs and upload the QCOW2 images
    # (sorted so disk_file -> disk{i} mapping is deterministic)
    disk_files = sorted(f for f in os.listdir(backup_dir) if f.endswith(".qcow2"))
    for i, disk_file in enumerate(disk_files):
        pvc_name = f"{restore_vm_name}-disk{i}"
        pvc_yaml = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {"name": pvc_name, "namespace": namespace},
            "spec": {
                "accessModes": ["ReadWriteOnce"],
                "resources": {"requests": {"storage": "100Gi"}},
                "storageClassName": "nfs-sc"
            }
        }
        pvc_file_path = f"/tmp/{pvc_name}.yaml"
        with open(pvc_file_path, "w") as f:
            yaml.dump(pvc_yaml, f)
        subprocess.run(f"kubectl apply -f {pvc_file_path}", shell=True, check=True)
    # Create the new VM object
    vm_yaml = {
        "apiVersion": "kubevirt.io/v1",
        "kind": "VirtualMachine",
        "metadata": {"name": restore_vm_name, "namespace": namespace},
        "spec": {"running": True, "template": {"spec": original_spec}}
    }
    vm_file = f"/tmp/{restore_vm_name}.yaml"
    with open(vm_file, "w") as f:
        yaml.dump(vm_yaml, f)
    subprocess.run(f"kubectl apply -f {vm_file}", shell=True, check=True)
    # Copy the QCOW2 images into the virt-launcher Pod
    pod_name = get_virt_launcher_pod(restore_vm_name, namespace)
    for i, disk_file in enumerate(disk_files):
        qcow_path = os.path.join(backup_dir, disk_file)
        pvc_disk_path = f"/var/run/kubevirt-private/vmi-disks/disk{i}"
        print(f"[restore] Uploading {qcow_path} -> {pvc_disk_path}")
        cmd_cp = f"kubectl cp {qcow_path} {namespace}/{pod_name}:{pvc_disk_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
    print(f"VM {restore_vm_name} restore complete; ready to start")
# ----------------------------
# Main
# ----------------------------
if args.action == "backup":
    backup_vm()
elif args.action == "restore":
    restore_vm()