Compare commits
15 Commits
| SHA1 |
|---|
| c661aea553 |
| 8084dbabd4 |
| 83b869fcf3 |
| 370edc473c |
| 9a4d10e93c |
| 10c1f0c268 |
| e317df2fac |
| 970c68d201 |
| 56900d5b5a |
| 84e524365c |
| 7a4e301092 |
| 8b92a9b1cf |
| 567b4ff367 |
| e81bbfdaa9 |
| 3513f1ddc1 |
pcapi/cluster_resource_monitor_with_pid_mapping.py — new file, 187 lines
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
import json
import csv
import re
import subprocess

from kubernetes import client, config
import pynvml

# ----------------------------
# 1. Initialize Kubernetes
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Error loading kubeconfig: {e}")
    sys.exit(1)

v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()

# ----------------------------
# 2. Initialize NVIDIA
# ----------------------------
try:
    pynvml.nvmlInit()
    gpu_count = pynvml.nvmlDeviceGetCount()
except Exception as e:
    print(f"No NVIDIA GPU found or driver not installed: {e}")
    gpu_count = 0

# ----------------------------
# 3. Node resources
# ----------------------------
nodes_info = []
gpu_nodes = {}
for node in v1.list_node().items:
    name = node.metadata.name
    # First condition whose status is "True" (normally "Ready" on a healthy node)
    status = next((cond.type for cond in node.status.conditions if cond.status == "True"), "Unknown")
    cpu = node.status.allocatable.get('cpu', '0')
    memory = node.status.allocatable.get('memory', '0')
    gpu = node.status.allocatable.get('nvidia.com/gpu', '0')
    nodes_info.append({'name': name, 'status': status, 'cpu_allocatable': cpu,
                       'memory_allocatable': memory, 'gpu_allocatable': gpu})
    if gpu != '0':
        gpu_nodes[name] = int(gpu)

# ----------------------------
# 4. Pod resources
# ----------------------------
pods_info = []
pod_pid_map = {}  # key=PID, value=(namespace, pod_name)
for pod in v1.list_pod_for_all_namespaces().items:
    pod_name = pod.metadata.name
    namespace = pod.metadata.namespace
    node_name = pod.spec.node_name
    for c in pod.spec.containers:
        cpu_req = c.resources.requests.get('cpu', '0') if c.resources.requests else '0'
        mem_req = c.resources.requests.get('memory', '0') if c.resources.requests else '0'
        gpu_req = c.resources.requests.get('nvidia.com/gpu', '0') if c.resources.requests else '0'
        pods_info.append({
            'namespace': namespace,
            'pod_name': pod_name,
            'node': node_name,
            'container': c.name,
            'cpu_request': cpu_req,
            'memory_request': mem_req,
            'gpu_request': gpu_req
        })
    # Build the container PID mapping (an offline environment needs nsenter or crictl)
    try:
        cmd = f"crictl inspectp $(crictl ps --name {pod_name} -q)"
        out = subprocess.check_output(cmd, shell=True).decode()
        # Simplified handling: only extract the PIDs
        pids = re.findall(r'"pid":\s*(\d+)', out)
        for pid in pids:
            pod_pid_map[int(pid)] = (namespace, pod_name)
    except Exception:
        continue

# ----------------------------
# 5. KubeVirt VM resources
# ----------------------------
vms_info = []
vm_pid_map = {}  # key=PID, value=(namespace, vm_name)
namespaces = [ns.metadata.name for ns in v1.list_namespace().items]
for ns in namespaces:
    try:
        vms = crd_api.list_namespaced_custom_object(
            group="kubevirt.io",
            version="v1",
            namespace=ns,
            plural="virtualmachines"
        )
        for vm in vms.get('items', []):
            name = vm['metadata']['name']
            spec = vm.get('spec', {}).get('template', {}).get('spec', {})
            domain = spec.get('domain', {})
            resources = domain.get('resources', {}).get('requests', {})
            cpu = resources.get('cpu', '0')
            memory = resources.get('memory', '0')
            gpus = domain.get('devices', {}).get('gpus', [])
            vms_info.append({
                'namespace': ns,
                'vm_name': name,
                'cpu_request': cpu,
                'memory_request': memory,
                'gpus': gpus
            })
            # Collect the PIDs of the VM's virt-launcher pod
            try:
                vl_pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={name}")
                for vl_pod in vl_pods.items:
                    cmd = f"crictl inspectp $(crictl ps --name {vl_pod.metadata.name} -q)"
                    out = subprocess.check_output(cmd, shell=True).decode()
                    pids = re.findall(r'"pid":\s*(\d+)', out)
                    for pid in pids:
                        vm_pid_map[int(pid)] = (ns, name)
            except Exception:
                continue
    except client.exceptions.ApiException:
        continue

# ----------------------------
# 6. Live GPU usage + PID mapping
# ----------------------------
gpu_info = []
if gpu_count > 0:
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        if isinstance(name, bytes):  # older pynvml versions return bytes
            name = name.decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)

        processes = []
        try:
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                pid = proc.pid
                # usedGpuMemory can be None on some driver setups
                used_mem = (proc.usedGpuMemory or 0) // 1024**2
                owner = pod_pid_map.get(pid) or vm_pid_map.get(pid) or ("unknown", "unknown")
                processes.append({
                    'pid': pid,
                    'used_memory_MB': used_mem,
                    'owner_type': 'pod' if pid in pod_pid_map else 'vm' if pid in vm_pid_map else 'unknown',
                    'namespace': owner[0],
                    'name': owner[1]
                })
        except pynvml.NVMLError:
            pass

        gpu_info.append({
            'gpu_index': i,
            'name': name,
            'memory_total_MB': mem_info.total // 1024**2,
            'memory_used_MB': mem_info.used // 1024**2,
            'gpu_util_percent': util.gpu,
            'mem_util_percent': util.memory,
            'processes': processes
        })

# ----------------------------
# 7. Output JSON and CSV
# ----------------------------
output = {'nodes': nodes_info, 'pods': pods_info, 'vms': vms_info, 'gpus': gpu_info}
with open("cluster_resources.json", "w") as f:
    json.dump(output, f, indent=2)

# CSV output helper
def write_csv(filename, data, fieldnames):
    with open(filename, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

write_csv("nodes.csv", nodes_info, ['name', 'status', 'cpu_allocatable', 'memory_allocatable', 'gpu_allocatable'])
write_csv("pods.csv", pods_info, ['namespace', 'pod_name', 'node', 'container', 'cpu_request', 'memory_request', 'gpu_request'])
write_csv("vms.csv", vms_info, ['namespace', 'vm_name', 'cpu_request', 'memory_request', 'gpus'])
write_csv("gpus.csv", gpu_info, ['gpu_index', 'name', 'memory_total_MB', 'memory_used_MB', 'gpu_util_percent', 'mem_util_percent', 'processes'])

print("Cluster resource report generated with GPU PID → Pod/VM mapping")
print(" - cluster_resources.json")
print(" - nodes.csv, pods.csv, vms.csv, gpus.csv")
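A minimal consumer sketch for the report above (a standalone snippet added here for illustration; the keys mirror the `output` dict assembled in step 7):

#!/usr/bin/env python3
# Sketch: summarize GPU usage per owner from cluster_resources.json.
# Assumes the report was produced by the monitor script above.
import json

with open("cluster_resources.json") as f:
    report = json.load(f)

for gpu in report.get("gpus", []):
    print(f"GPU {gpu['gpu_index']} ({gpu['name']}): "
          f"{gpu['memory_used_MB']}/{gpu['memory_total_MB']} MB, "
          f"{gpu['gpu_util_percent']}% util")
    for proc in gpu.get("processes", []):
        print(f"  pid {proc['pid']} -> {proc['owner_type']} "
              f"{proc['namespace']}/{proc['name']} ({proc['used_memory_MB']} MB)")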
pcapi/vm_buckup_restore.py — new file, 161 lines
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Back up and restore KubeVirt VMs over shared NFS.

Notes:
- Before backing up, shut the VM down or otherwise ensure data consistency.
- Before restoring, the VM should be powered off.
- Every disk found under /var/run/kubevirt-private/vmi-disks/ is processed in a loop.
- GPU VMs work too; the GPU can be reattached once the PVCs are restored.
"""
import os
import sys
import argparse
import subprocess
import yaml
from kubernetes import client, config

# ----------------------------
# Argument parsing
# ----------------------------
parser = argparse.ArgumentParser(description="KubeVirt VM Backup and Restore (NFS, Multi-disk, GPU)")
parser.add_argument("action", choices=["backup", "restore"], help="operation to perform")
parser.add_argument("--vm", required=True, help="VM name")
parser.add_argument("--namespace", default="default", help="namespace the VM lives in")
parser.add_argument("--nfs-path", required=True, help="shared NFS path, e.g. /mnt/nfs-backup")
parser.add_argument("--restore-vm", help="name for the restored VM; if omitted, the original VM is overwritten")
args = parser.parse_args()

vm_name = args.vm
namespace = args.namespace
nfs_path = args.nfs_path
restore_vm_name = args.restore_vm or vm_name

# ----------------------------
# Initialize the K8s API
# ----------------------------
try:
    kubeconfig_path = os.environ.get("KUBECONFIG", "/root/.kube/config")
    config.load_kube_config(config_file=kubeconfig_path)
except Exception as e:
    print(f"Failed to load kubeconfig: {e}")
    sys.exit(1)

v1 = client.CoreV1Api()
crd_api = client.CustomObjectsApi()

# ----------------------------
# Find the virt-launcher pod
# ----------------------------
def get_virt_launcher_pod(vm, ns):
    pods = v1.list_namespaced_pod(namespace=ns, label_selector=f"kubevirt.io=virt-launcher,vm={vm}")
    if not pods.items:
        print(f"No virt-launcher pod found for VM {vm}")
        sys.exit(1)
    return pods.items[0].metadata.name

# ----------------------------
# Fetch the VM spec (including nodeSelector and GPU configuration)
# ----------------------------
def get_vm_spec(vm, ns):
    vm_obj = crd_api.get_namespaced_custom_object(
        group="kubevirt.io",
        version="v1",
        namespace=ns,
        plural="virtualmachines",
        name=vm
    )
    return vm_obj.get("spec", {}).get("template", {}).get("spec", {})

# ----------------------------
# Back up a VM
# ----------------------------
def backup_vm():
    pod_name = get_virt_launcher_pod(vm_name, namespace)
    backup_dir = os.path.join(nfs_path, vm_name)
    os.makedirs(backup_dir, exist_ok=True)

    # List the disk paths inside the launcher pod
    cmd_list_disks = f"kubectl exec -n {namespace} {pod_name} -- ls /var/run/kubevirt-private/vmi-disks/"
    disks = subprocess.check_output(cmd_list_disks, shell=True).decode().splitlines()

    for disk in disks:
        src_path = f"/var/run/kubevirt-private/vmi-disks/{disk}"
        dest_path = os.path.join(backup_dir, f"{disk}.qcow2")
        print(f"[backup] {disk} -> {dest_path}")
        # Convert to a temporary QCOW2 file inside the pod
        cmd_convert = f"kubectl exec -n {namespace} {pod_name} -- qemu-img convert -O qcow2 {src_path} /tmp/{disk}.qcow2"
        subprocess.run(cmd_convert, shell=True, check=True)
        # Copy it out to NFS
        cmd_cp = f"kubectl cp {namespace}/{pod_name}:/tmp/{disk}.qcow2 {dest_path}"
        subprocess.run(cmd_cp, shell=True, check=True)
        # Clean up the temporary file
        subprocess.run(f"kubectl exec -n {namespace} {pod_name} -- rm /tmp/{disk}.qcow2", shell=True)
    print(f"VM {vm_name} backed up to {backup_dir}")

# ----------------------------
# Restore a VM
# ----------------------------
def restore_vm():
    backup_dir = os.path.join(nfs_path, vm_name)
    if not os.path.exists(backup_dir):
        print(f"Backup directory {backup_dir} does not exist")
        sys.exit(1)

    # Fetch the original VM spec
    original_spec = get_vm_spec(vm_name, namespace)

    # Delete the original VM when overwriting it
    if restore_vm_name == vm_name:
        print(f"[restore] Deleting original VM {vm_name}; make sure it is powered off")
        subprocess.run(f"kubectl delete vm {vm_name} -n {namespace}", shell=True, check=True)

    # Create PVCs for the backed-up QCOW2 images
    disk_files = [f for f in os.listdir(backup_dir) if f.endswith(".qcow2")]
    for i, disk_file in enumerate(disk_files):
        pvc_name = f"{restore_vm_name}-disk{i}"
        pvc_yaml = {
            "apiVersion": "v1",
            "kind": "PersistentVolumeClaim",
            "metadata": {"name": pvc_name, "namespace": namespace},
            "spec": {
                "accessModes": ["ReadWriteOnce"],
                "resources": {"requests": {"storage": "100Gi"}},
                "storageClassName": "nfs-sc"
            }
        }
        pvc_file_path = f"/tmp/{pvc_name}.yaml"
        with open(pvc_file_path, "w") as f:
            yaml.dump(pvc_yaml, f)
        subprocess.run(f"kubectl apply -f {pvc_file_path}", shell=True, check=True)

    # Create the new VM from the original spec
    vm_yaml = {
        "apiVersion": "kubevirt.io/v1",
        "kind": "VirtualMachine",
        "metadata": {"name": restore_vm_name, "namespace": namespace},
        "spec": {"running": True, "template": {"spec": original_spec}}
    }
    vm_file = f"/tmp/{restore_vm_name}.yaml"
    with open(vm_file, "w") as f:
        yaml.dump(vm_yaml, f)
    subprocess.run(f"kubectl apply -f {vm_file}", shell=True, check=True)

    # Upload the QCOW2 images into the new virt-launcher pod
    pod_name = get_virt_launcher_pod(restore_vm_name, namespace)
    for i, disk_file in enumerate(disk_files):
        qcow_path = os.path.join(backup_dir, disk_file)
        pvc_disk_path = f"/var/run/kubevirt-private/vmi-disks/disk{i}"
        print(f"[restore] Uploading {qcow_path} -> {pvc_disk_path}")
        cmd_cp = f"kubectl cp {qcow_path} {namespace}/{pod_name}:{pvc_disk_path}"
        subprocess.run(cmd_cp, shell=True, check=True)

    print(f"VM {restore_vm_name} restored and ready to start")

# ----------------------------
# Main
# ----------------------------
if args.action == "backup":
    backup_vm()
elif args.action == "restore":
    restore_vm()
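Example invocations (the VM name and NFS path here are hypothetical; the flags are the ones defined by the argparse block above):

python3 vm_buckup_restore.py backup --vm demo-vm --namespace default --nfs-path /mnt/nfs-backup
python3 vm_buckup_restore.py restore --vm demo-vm --nfs-path /mnt/nfs-backup --restore-vm demo-vm-restored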