diff --git a/pcapi/cluster_resource_monitor_with_pid_mapping.py b/pcapi/cluster_resource_monitor_with_pid_mapping.py
new file mode 100644
index 0000000..073ab01
--- /dev/null
+++ b/pcapi/cluster_resource_monitor_with_pid_mapping.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import json
+import csv
+import re
+import subprocess
+from kubernetes import client, config
+import pynvml
+
+# ----------------------------
+# 1. Initialize Kubernetes
+# ----------------------------
+try:
+    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
+    config.load_kube_config(config_file=kubeconfig_path)
+except Exception as e:
+    print(f"Error loading kubeconfig: {e}")
+    sys.exit(1)
+
+v1 = client.CoreV1Api()
+crd_api = client.CustomObjectsApi()
+
+# ----------------------------
+# 2. Initialize NVIDIA
+# ----------------------------
+try:
+    pynvml.nvmlInit()
+    gpu_count = pynvml.nvmlDeviceGetCount()
+except Exception as e:
+    print(f"No NVIDIA GPU found or driver not installed: {e}")
+    gpu_count = 0
+
+# ----------------------------
+# 3. Node resources
+# ----------------------------
+nodes_info = []
+gpu_nodes = {}
+for node in v1.list_node().items:
+    name = node.metadata.name
+    # Report the Ready condition rather than the first condition that happens to be True
+    status = next((cond.type for cond in node.status.conditions
+                   if cond.type == "Ready" and cond.status == "True"), "NotReady")
+    cpu = node.status.allocatable.get('cpu', '0')
+    memory = node.status.allocatable.get('memory', '0')
+    gpu = node.status.allocatable.get('nvidia.com/gpu', '0')
+    nodes_info.append({
+        'name': name,
+        'status': status,
+        'cpu_allocatable': cpu,
+        'memory_allocatable': memory,
+        'gpu_allocatable': gpu
+    })
+    if gpu != '0':
+        gpu_nodes[name] = int(gpu)
+
+# ----------------------------
+# 4. Pod resources
+# ----------------------------
+pods_info = []
+pod_pid_map = {}  # key=PID, value=(namespace, pod_name)
+for pod in v1.list_pod_for_all_namespaces().items:
+    pod_name = pod.metadata.name
+    namespace = pod.metadata.namespace
+    node_name = pod.spec.node_name
+    for c in pod.spec.containers:
+        cpu_req = c.resources.requests.get('cpu', '0') if c.resources.requests else '0'
+        mem_req = c.resources.requests.get('memory', '0') if c.resources.requests else '0'
+        gpu_req = c.resources.requests.get('nvidia.com/gpu', '0') if c.resources.requests else '0'
+        pods_info.append({
+            'namespace': namespace,
+            'pod_name': pod_name,
+            'node': node_name,
+            'container': c.name,
+            'cpu_request': cpu_req,
+            'memory_request': mem_req,
+            'gpu_request': gpu_req
+        })
+    # Build the pod PID map (an offline environment needs crictl or nsenter available)
+    try:
+        cmd = f"crictl inspectp $(crictl pods --name {pod_name} -q)"
+        out = subprocess.check_output(cmd, shell=True).decode()
+        # Simplified handling: just extract the PID fields
+        pids = re.findall(r'"pid":\s*(\d+)', out)
+        for pid in pids:
+            pod_pid_map[int(pid)] = (namespace, pod_name)
+    except Exception:
+        continue
+
+# ----------------------------
+# 5. KubeVirt VM resources
+# ----------------------------
+vms_info = []
+vm_pid_map = {}  # key=PID, value=(namespace, vm_name)
+namespaces = [ns.metadata.name for ns in v1.list_namespace().items]
+for ns in namespaces:
+    try:
+        vms = crd_api.list_namespaced_custom_object(
+            group="kubevirt.io",
+            version="v1",
+            namespace=ns,
+            plural="virtualmachines"
+        )
+        for vm in vms.get('items', []):
+            name = vm['metadata']['name']
+            spec = vm.get('spec', {}).get('template', {}).get('spec', {})
+            domain = spec.get('domain', {})
+            resources = domain.get('resources', {}).get('requests', {})
+            cpu = resources.get('cpu', '0')
+            memory = resources.get('memory', '0')
+            gpus = domain.get('devices', {}).get('gpus', [])
+            vms_info.append({
+                'namespace': ns,
+                'vm_name': name,
+                'cpu_request': cpu,
+                'memory_request': memory,
+                'gpus': gpus
+            })
+            # Map the virt-launcher pod of each VM to host PIDs
+            try:
+                vl_pods = v1.list_namespaced_pod(
+                    namespace=ns,
+                    label_selector=f"kubevirt.io=virt-launcher,vm.kubevirt.io/name={name}"
+                )
+                for vl_pod in vl_pods.items:
+                    cmd = f"crictl inspectp $(crictl pods --name {vl_pod.metadata.name} -q)"
+                    out = subprocess.check_output(cmd, shell=True).decode()
+                    pids = re.findall(r'"pid":\s*(\d+)', out)
+                    for pid in pids:
+                        vm_pid_map[int(pid)] = (ns, name)
+            except Exception:
+                continue
+    except client.exceptions.ApiException:
+        continue
+
+# ----------------------------
+# 6. Live GPU usage + PID mapping
+# ----------------------------
+gpu_info = []
+if gpu_count > 0:
+    for i in range(gpu_count):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        name = pynvml.nvmlDeviceGetName(handle)
+        if isinstance(name, bytes):  # older pynvml versions return bytes
+            name = name.decode()
+        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
+
+        processes = []
+        try:
+            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
+                pid = proc.pid
+                used_mem = (proc.usedGpuMemory or 0) // 1024**2
+                owner = pod_pid_map.get(pid) or vm_pid_map.get(pid) or ("unknown", "unknown")
+                processes.append({
+                    'pid': pid,
+                    'used_memory_MB': used_mem,
+                    'owner_type': 'pod' if pid in pod_pid_map else 'vm' if pid in vm_pid_map else 'unknown',
+                    'namespace': owner[0],
+                    'name': owner[1]
+                })
+        except pynvml.NVMLError:
+            pass
+
+        gpu_info.append({
+            'gpu_index': i,
+            'name': name,
+            'memory_total_MB': mem_info.total // 1024**2,
+            'memory_used_MB': mem_info.used // 1024**2,
+            'gpu_util_percent': util.gpu,
+            'mem_util_percent': util.memory,
+            'processes': processes
+        })
+
+# ----------------------------
+# 7. Output JSON and CSV
+# ----------------------------
+output = {'nodes': nodes_info, 'pods': pods_info, 'vms': vms_info, 'gpus': gpu_info}
+with open("cluster_resources.json", "w") as f:
+    json.dump(output, f, indent=2)
+
+# CSV output helper
+def write_csv(filename, data, fieldnames):
+    with open(filename, "w", newline="") as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(data)
+
+write_csv("nodes.csv", nodes_info, ['name', 'status', 'cpu_allocatable', 'memory_allocatable', 'gpu_allocatable'])
+write_csv("pods.csv", pods_info, ['namespace', 'pod_name', 'node', 'container', 'cpu_request', 'memory_request', 'gpu_request'])
+write_csv("vms.csv", vms_info, ['namespace', 'vm_name', 'cpu_request', 'memory_request', 'gpus'])
+write_csv("gpus.csv", gpu_info, ['gpu_index', 'name', 'memory_total_MB', 'memory_used_MB', 'gpu_util_percent', 'mem_util_percent', 'processes'])
+
+print("Cluster resource report generated with GPU PID → Pod/VM mapping")
+print(" - cluster_resources.json")
+print(" - nodes.csv, pods.csv, vms.csv, gpus.csv")
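A minimal consumer sketch (not part of the patch above): it assumes the monitor has already written cluster_resources.json to the working directory, and it simply lists GPU processes whose PID could not be mapped to a Pod or VM. The field names come from the gpu_info structure built in section 6.

import json

with open("cluster_resources.json") as f:
    report = json.load(f)

# Walk every GPU's process list and flag entries without a Pod/VM owner.
for gpu in report["gpus"]:
    for proc in gpu["processes"]:
        if proc["owner_type"] == "unknown":
            print(f"GPU {gpu['gpu_index']}: PID {proc['pid']} "
                  f"uses {proc['used_memory_MB']} MB but has no Pod/VM owner")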
diff --git a/pcapi/vm_buckup_restore.py b/pcapi/vm_buckup_restore.py
new file mode 100644
index 0000000..39fabbd
--- /dev/null
+++ b/pcapi/vm_buckup_restore.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Shut the VM down (or otherwise ensure data consistency) before backing up.
+The VM should be powered off before restoring.
+All disks found under vmi-disks/ are processed in a loop.
+GPU VMs are also supported; GPUs can be reassigned once the PVCs are restored.
+"""
+import os
+import sys
+import argparse
+import subprocess
+import yaml
+from kubernetes import client, config
+
+# ----------------------------
+# Argument parsing
+# ----------------------------
+parser = argparse.ArgumentParser(description="KubeVirt VM Backup and Restore (NFS, Multi-disk, GPU)")
+parser.add_argument("action", choices=["backup", "restore"], help="Action to perform")
+parser.add_argument("--vm", required=True, help="VM name")
+parser.add_argument("--namespace", default="default", help="Namespace of the VM")
+parser.add_argument("--nfs-path", required=True, help="Shared NFS path, e.g. /mnt/nfs-backup")
+parser.add_argument("--restore-vm", help="New VM name for restore; if omitted, the original VM is overwritten")
+args = parser.parse_args()
+
+vm_name = args.vm
+namespace = args.namespace
+nfs_path = args.nfs_path
+restore_vm_name = args.restore_vm or vm_name
+
+# ----------------------------
+# Initialize the K8s API
+# ----------------------------
+try:
+    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
+    config.load_kube_config(config_file=kubeconfig_path)
+except Exception as e:
+    print(f"Failed to load kubeconfig: {e}")
+    sys.exit(1)
+
+v1 = client.CoreV1Api()
+crd_api = client.CustomObjectsApi()
+
+# ----------------------------
+# Get the virt-launcher pod of a VM
+# ----------------------------
+def get_virt_launcher_pod(vm, ns):
+    pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm.kubevirt.io/name={vm}")
+    if not pods.items:
+        print(f"No virt-launcher pod found for VM {vm}")
+        sys.exit(1)
+    return pods.items[0].metadata.name
+
+# ----------------------------
+# Get the VM spec (including nodeSelector and GPU configuration)
+# ----------------------------
+def get_vm_spec(vm, ns):
+    vm_obj = crd_api.get_namespaced_custom_object(
+        group="kubevirt.io",
+        version="v1",
+        namespace=ns,
+        plural="virtualmachines",
+        name=vm
+    )
+    return vm_obj.get("spec", {}).get("template", {}).get("spec", {})
+
+# ----------------------------
+# Back up the VM
+# ----------------------------
+def backup_vm():
+    pod_name = get_virt_launcher_pod(vm_name, namespace)
+    backup_dir = os.path.join(nfs_path, vm_name)
+    os.makedirs(backup_dir, exist_ok=True)
+
+    # List the VM's disks inside the virt-launcher pod
+    cmd_list_disks = f"kubectl exec -n {namespace} {pod_name} -- ls /var/run/kubevirt-private/vmi-disks/"
+    disks = subprocess.check_output(cmd_list_disks, shell=True).decode().splitlines()
+
+    for disk in disks:
+        src_path = f"/var/run/kubevirt-private/vmi-disks/{disk}"
+        dest_path = os.path.join(backup_dir, f"{disk}.qcow2")
+        print(f"[backup] {disk} -> {dest_path}")
+        # Convert to a temporary QCOW2 file inside the pod
+        cmd_convert = f"kubectl exec -n {namespace} {pod_name} -- qemu-img convert -O qcow2 {src_path} /tmp/{disk}.qcow2"
+        subprocess.run(cmd_convert, shell=True, check=True)
+        # Copy to NFS
+        cmd_cp = f"kubectl cp {namespace}/{pod_name}:/tmp/{disk}.qcow2 {dest_path}"
+        subprocess.run(cmd_cp, shell=True, check=True)
+        # Clean up the temporary file
+        subprocess.run(f"kubectl exec -n {namespace} {pod_name} -- rm /tmp/{disk}.qcow2", shell=True)
+    print(f"VM {vm_name} backup complete, stored in {backup_dir}")
+
+# ----------------------------
+# Restore the VM
+# ----------------------------
+def restore_vm():
+    backup_dir = os.path.join(nfs_path, vm_name)
+    if not os.path.exists(backup_dir):
+        print(f"Backup directory {backup_dir} does not exist")
+        sys.exit(1)
+
+    # Get the original VM spec
+    original_spec = get_vm_spec(vm_name, namespace)
+
+    # Delete the original VM (when overwriting it)
+    if restore_vm_name == vm_name:
+        print(f"[restore] Deleting original VM {vm_name}; make sure it is powered off")
+        subprocess.run(f"kubectl delete vm {vm_name} -n {namespace}", shell=True, check=True)
+
+    # Create PVCs for the QCOW2 images
+    disk_files = [f for f in os.listdir(backup_dir) if f.endswith(".qcow2")]
+    for i, disk_file in enumerate(disk_files):
+        pvc_name = f"{restore_vm_name}-disk{i}"
+        pvc_yaml = {
+            "apiVersion": "v1",
+            "kind": "PersistentVolumeClaim",
+            "metadata": {"name": pvc_name, "namespace": namespace},
+            "spec": {
+                "accessModes": ["ReadWriteOnce"],
+                "resources": {"requests": {"storage": "100Gi"}},
+                "storageClassName": "nfs-sc"
+            }
+        }
+        pvc_file_path = f"/tmp/{pvc_name}.yaml"
+        with open(pvc_file_path, "w") as f:
+            yaml.dump(pvc_yaml, f)
+        subprocess.run(f"kubectl apply -f {pvc_file_path}", shell=True, check=True)
+
+    # Create the new VM from the original spec
+    vm_yaml = {
+        "apiVersion": "kubevirt.io/v1",
+        "kind": "VirtualMachine",
+        "metadata": {"name": restore_vm_name, "namespace": namespace},
+        "spec": {"running": True, "template": {"spec": original_spec}}
+    }
+    vm_file = f"/tmp/{restore_vm_name}.yaml"
+    with open(vm_file, "w") as f:
+        yaml.dump(vm_yaml, f)
+    subprocess.run(f"kubectl apply -f {vm_file}", shell=True, check=True)
+
+    # Upload the QCOW2 images into the new virt-launcher pod
+    pod_name = get_virt_launcher_pod(restore_vm_name, namespace)
+    for i, disk_file in enumerate(disk_files):
+        qcow_path = os.path.join(backup_dir, disk_file)
+        pvc_disk_path = f"/var/run/kubevirt-private/vmi-disks/disk{i}"
+        print(f"[restore] Uploading {qcow_path} -> {pvc_disk_path}")
+        cmd_cp = f"kubectl cp {qcow_path} {namespace}/{pod_name}:{pvc_disk_path}"
+        subprocess.run(cmd_cp, shell=True, check=True)
+
+    print(f"VM {restore_vm_name} restore complete; it can be started")
+
+# ----------------------------
+# Main
+# ----------------------------
+if args.action == "backup":
+    backup_vm()
+elif args.action == "restore":
+    restore_vm()
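A usage sketch, assuming both scripts are run from a host with kubectl, crictl, and the NVIDIA driver available and a kubeconfig at the default path. The flags come from the argparse definitions above; the VM name demo-vm, the NFS path /mnt/nfs-backup, and the restore name demo-vm-restored are illustrative placeholders, not values from the patch.

import subprocess

# Generate the cluster/GPU report (writes cluster_resources.json and the CSV files).
subprocess.run(["python3", "cluster_resource_monitor_with_pid_mapping.py"], check=True)

# Back up a VM's disks to the shared NFS path, then restore them under a new VM name.
subprocess.run(["python3", "vm_buckup_restore.py", "backup",
                "--vm", "demo-vm", "--namespace", "default",
                "--nfs-path", "/mnt/nfs-backup"], check=True)
subprocess.run(["python3", "vm_buckup_restore.py", "restore",
                "--vm", "demo-vm", "--namespace", "default",
                "--nfs-path", "/mnt/nfs-backup",
                "--restore-vm", "demo-vm-restored"], check=True)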