diff --git a/requirements.txt b/requirements.txt index 182fd70..d62f6cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -librosa>=0.10.0 -numpy>=1.24.0 +torch>=2.0.0 +torchaudio>=2.0.0 soundfile>=0.12.0 diff --git a/songrate/analyzers/__init__.py b/songrate/analyzers/__init__.py index 0e973df..849a2e2 100644 --- a/songrate/analyzers/__init__.py +++ b/songrate/analyzers/__init__.py @@ -1,18 +1,38 @@ -"""音频分析器基类和工具函数""" -import numpy as np +"""音频分析器基类和工具函数 - GPU 版本""" +import torch +import torchaudio + + +def get_device(): + """获取 GPU 设备""" + return torch.device('cuda' if torch.cuda.is_available() else 'cpu') def load_audio(filepath, sr=22050): - """加载音频文件,返回 (y, sr)""" - import librosa - return librosa.load(filepath, sr=sr) + """加载音频文件到 GPU,返回 (waveform, sr)""" + device = get_device() + waveform, orig_sr = torchaudio.load(filepath) + + # 单声道 + if waveform.shape[0] > 1: + waveform = waveform.mean(dim=0, keepdim=True) + + # 重采样 + if orig_sr != sr: + resampler = torchaudio.transforms.Resample(orig_sr, sr).to(device) + waveform = resampler(waveform.to(device)) + + # 返回 1D 张量 + return waveform.squeeze(0).to(device), sr def safe_float(val, default=0.0): """安全转换为 float,处理 NaN/Inf""" if val is None: return default + if isinstance(val, torch.Tensor): + val = val.item() val = float(val) - if np.isnan(val) or np.isinf(val): + if torch.isnan(torch.tensor(val)) or torch.isinf(torch.tensor(val)): return default return val diff --git a/songrate/analyzers/danceability.py b/songrate/analyzers/danceability.py index e751b29..9d4a1a6 100644 --- a/songrate/analyzers/danceability.py +++ b/songrate/analyzers/danceability.py @@ -1,51 +1,67 @@ -"""可舞性分析 - 低频占比、节拍清晰度、节奏规律性""" -import numpy as np +"""可舞性分析 - GPU 版本""" +import torch from . import safe_float -def analyze_danceability(y, sr): - """ - 分析可舞性维度 - """ - import librosa - - # 低频能量占比 (bass ratio) - S = np.abs(librosa.stft(y)) - freqs = librosa.fft_frequencies(sr=sr) - - low_mask = freqs < 250 # 低频 < 250Hz - low_energy = np.sum(S[low_mask] ** 2) - total_energy = np.sum(S ** 2) +def analyze_danceability(y, sr, stft_result=None): + """分析可舞性维度 (GPU)""" + device = y.device + + # 使用预计算 STFT 或重新计算 + if stft_result is None: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + magnitude = stft_result.abs() ** 2 # 功率谱 + + # 频率轴 + freqs = torch.fft.fftfreq(2048, 1.0/sr)[:1025] + + # 低频能量占比 (< 250Hz) + low_mask = freqs < 250 + low_energy = magnitude[low_mask].sum() + total_energy = magnitude.sum() bass_ratio = safe_float(low_energy / total_energy) if total_energy > 0 else 0.0 - - # 节拍清晰度 - onset_env = librosa.onset.onset_strength(y=y, sr=sr) - pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr) - beat_clarity = safe_float(min(np.mean(pulse), 1.0)) - - # 节奏规律性 - 自相关峰值 - if len(onset_env) > 10: - autocorr = np.correlate(onset_env, onset_env, mode='full') - autocorr = autocorr[len(autocorr) // 2:] + + # 节拍清晰度 (复用 rhythm 的逻辑) + spectral_flux = torch.diff(magnitude.sum(dim=0)) + spectral_flux = torch.nn.functional.relu(spectral_flux) + + if len(spectral_flux) > 10: + mean_flux = spectral_flux.mean() + std_flux = spectral_flux.std() + if std_flux > 0: + kurtosis = torch.mean(((spectral_flux - mean_flux) / std_flux) ** 4) + beat_clarity = safe_float(torch.clamp(kurtosis / 10.0, 0, 1)) + else: + beat_clarity = 0.3 + else: + beat_clarity = 0.5 + + # 节奏规律性 - 自相关 + if len(spectral_flux) > 10: + autocorr = torch.nn.functional.conv1d( + spectral_flux.unsqueeze(0).unsqueeze(0), + spectral_flux.unsqueeze(0).unsqueeze(0).flip(-1), + padding=len(spectral_flux) - 1 + ).squeeze() + autocorr = autocorr[len(autocorr)//2:] if autocorr[0] > 0: autocorr = autocorr / autocorr[0] - # 找第一个显著峰值(排除前几个采样点) - peaks = np.where(autocorr[10:] > 0.5)[0] - regularity = safe_float(min(len(peaks) / 20.0, 1.0)) if len(peaks) > 0 else 0.3 + peaks = (autocorr[10:] > 0.5).sum().item() + regularity = min(peaks / 20.0, 1.0) else: regularity = 0.5 - + scores = { "bass_ratio": round(bass_ratio * 10, 2), "beat_clarity": round(beat_clarity * 10, 2), "regularity": round(regularity * 10, 2), } - + score = ( 0.35 * scores["bass_ratio"] + 0.35 * scores["beat_clarity"] + 0.30 * scores["regularity"] ) scores["score"] = round(score, 2) - + return scores diff --git a/songrate/analyzers/energy.py b/songrate/analyzers/energy.py index 56f23b7..12e8c67 100644 --- a/songrate/analyzers/energy.py +++ b/songrate/analyzers/energy.py @@ -1,52 +1,61 @@ -"""能量分析 - RMS、动态范围、能量变化""" -import numpy as np +"""能量分析 - GPU 版本""" +import torch from . import safe_float -def analyze_energy(y, sr): - """ - 分析能量维度 - """ - import librosa - - # RMS 能量 - rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0] - rms_db = librosa.amplitude_to_db(rms, ref=np.max) - rms_mean = safe_float(np.mean(rms_db)) - - # 动态范围 (dB) - rms_valid = rms_db[rms_db > -60] # 过滤静音段 - if len(rms_valid) > 0: - dynamic_range = safe_float(np.max(rms_valid) - np.min(rms_valid)) +def analyze_energy(y, sr, stft_result=None): + """分析能量维度 (GPU)""" + device = y.device + + # RMS 能量 - 直接从音频计算 + frame_length = 2048 + hop_length = 512 + n_frames = (len(y) - frame_length) // hop_length + 1 + + if n_frames > 0: + frames = torch.nn.functional.unfold( + y.unsqueeze(0).unsqueeze(0), + kernel_size=(1, frame_length), + stride=(1, hop_length) + ).squeeze(0).squeeze(0) + rms = torch.sqrt(torch.mean(frames ** 2, dim=0)) + rms_db = 20 * torch.log10(torch.clamp(rms, 1e-10, None)) + rms_mean = safe_float(rms_db.mean()) + + # 动态范围 + rms_valid = rms_db[rms_db > -60] + if len(rms_valid) > 0: + dynamic_range = safe_float(rms_valid.max() - rms_valid.min()) + else: + dynamic_range = 0.0 + + # 能量变化 + rms_mean_val = rms.mean() + if rms_mean_val > 0: + variation = safe_float(rms.std() / rms_mean_val) + variation_score = min(variation / 0.5, 1.0) + else: + variation_score = 0.0 else: + rms_mean = -60.0 dynamic_range = 0.0 - - # 能量变化 - 能量曲线的变异系数 - if len(rms) > 0 and np.mean(rms) > 0: - variation = safe_float(np.std(rms) / np.mean(rms)) - variation_score = min(variation / 0.5, 1.0) # 50% 变化以上满分 - else: variation_score = 0.0 - + scores = { "rms": round(rms_mean, 2), "dynamic_range": round(dynamic_range, 2), "variation": round(variation_score * 10, 2), } - - # 评分逻辑:动态范围合理 + 有起伏 = 高分 - # 动态范围:15-30 dB 为优秀(不过度压缩也不过于极端) + + # 评分 if dynamic_range >= 10 and dynamic_range <= 35: dr_score = 8.0 + min((dynamic_range - 10) / 25 * 2, 2.0) elif dynamic_range > 35: - dr_score = 7.0 # 过大,可能有静音段 + dr_score = 7.0 else: - dr_score = dynamic_range / 10 * 8.0 # 过小,过度压缩 - - score = ( - 0.40 * dr_score + - 0.60 * scores["variation"] - ) + dr_score = dynamic_range / 10 * 8.0 + + score = 0.40 * dr_score + 0.60 * scores["variation"] scores["score"] = round(min(score, 10), 2) - + return scores diff --git a/songrate/analyzers/mood.py b/songrate/analyzers/mood.py index e59c010..6dfa0d4 100644 --- a/songrate/analyzers/mood.py +++ b/songrate/analyzers/mood.py @@ -1,72 +1,83 @@ -"""情绪分析 - 愉悦度、唤醒度、情绪清晰度""" -import numpy as np +"""情绪分析 - GPU 版本""" +import torch from . import safe_float -def analyze_mood(y, sr): - """ - 分析情绪维度(基于音频特征的启发式方法) - 后续可升级为 CLAP 深度学习模型 - """ - import librosa - - # 提取特征 - # 1. 频谱质心 (brightness) → 关联 valence - spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0] - brightness = safe_float(np.mean(spectral_centroid)) - - # 2. 频谱对比度 → 关联 arousal - spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr) - contrast_mean = safe_float(np.mean(spectral_contrast)) - - # 3. 过零率 → 关联 arousal - zcr = librosa.feature.zero_crossing_rate(y)[0] - zcr_mean = safe_float(np.mean(zcr)) - - # 4. RMS → 关联 arousal - rms = librosa.feature.rms(y=y)[0] - rms_mean = safe_float(np.mean(rms)) - - # 5. 调性 (major/minor) → 关联 valence - chroma = librosa.feature.chroma_stft(y=y, sr=sr) - major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, - 2.52, 5.19, 2.39, 3.66, 2.29, 2.88]) - minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, - 2.54, 4.75, 3.98, 2.69, 3.34, 3.17]) - major_profile = major_profile / np.sum(major_profile) - minor_profile = minor_profile / np.sum(minor_profile) - - chroma_mean = np.mean(chroma, axis=1) - major_corr = np.dot(chroma_mean, major_profile) / (np.linalg.norm(chroma_mean) * np.linalg.norm(major_profile)) - minor_corr = np.dot(chroma_mean, minor_profile) / (np.linalg.norm(chroma_mean) * np.linalg.norm(minor_profile)) +def analyze_mood(y, sr, stft_result=None): + """分析情绪维度 (GPU)""" + device = y.device + + # 使用预计算 STFT 或重新计算 + if stft_result is None: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + magnitude = stft_result.abs() + + freqs = torch.fft.fftfreq(2048, 1.0/sr)[:1025] + + # 频谱质心 (brightness) + freq_sum = (freqs.unsqueeze(1) * magnitude).sum(dim=0) + mag_sum = magnitude.sum(dim=0) + spectral_centroid = freq_sum / torch.clamp(mag_sum, 1e-10) + brightness = safe_float(spectral_centroid.mean()) + + # 频谱对比度 + spectral_contrast = magnitude.max(dim=0).values - magnitude.min(dim=0).values + contrast_mean = safe_float(spectral_contrast.mean()) + + # 过零率 + zcr = torch.sum(torch.abs(torch.diff(torch.sign(y)))) / (2 * len(y)) + zcr_mean = safe_float(zcr) + + # RMS + rms = torch.sqrt(torch.mean(y ** 2)) + rms_mean = safe_float(rms) + + # 调性检测 (major/minor) - 简化版 chroma + # 12 个音级的能量 + chroma = torch.zeros(12, device=device) + note_freqs = 440 * (2 ** (torch.arange(12, device=device, dtype=torch.float32) / 12)) + for i, nf in enumerate(note_freqs): + # 找最接近的频率 bin + bin_idx = torch.argmin(torch.abs(freqs - nf)).item() + if bin_idx < magnitude.shape[0]: + chroma[i] = magnitude[bin_idx].mean() + + # 归一化 + chroma = chroma / torch.clamp(chroma.sum(), 1e-10) + + # Krumhansl 模板 (简化) + major_profile = torch.tensor([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88], device=device) + minor_profile = torch.tensor([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17], device=device) + major_profile = major_profile / major_profile.sum() + minor_profile = minor_profile / minor_profile.sum() + + major_corr = torch.dot(chroma, major_profile) / (torch.norm(chroma) * torch.norm(major_profile) + 1e-10) + minor_corr = torch.dot(chroma, minor_profile) / (torch.norm(chroma) * torch.norm(minor_profile) + 1e-10) is_major = major_corr >= minor_corr - - # 计算 valence (愉悦度): 大调 + 高亮度 + 高对比度 → 快乐 - # 归一化到 0-1 - brightness_norm = min(brightness / 4000, 1.0) # 4000Hz 以上算高亮 + + # Valence: 大调 + 高亮度 + 高对比度 + brightness_norm = min(brightness / 4000, 1.0) valence = 0.4 * brightness_norm + 0.3 * (1.0 if is_major else 0.3) + 0.3 * min(contrast_mean / 30, 1.0) valence = safe_float(min(max(valence, 0), 1.0)) - - # 计算 arousal (唤醒度): 高 ZCR + 高 RMS → 兴奋 + + # Arousal: 高 ZCR + 高 RMS zcr_norm = min(zcr_mean / 0.1, 1.0) rms_norm = min(rms_mean / 0.1, 1.0) arousal = 0.5 * zcr_norm + 0.5 * rms_norm arousal = safe_float(min(max(arousal, 0), 1.0)) - - # 情绪清晰度 - 特征是否集中在某个象限 - # 如果 valence 和 arousal 都接近 0.5,说明情绪模糊 - valence_distance = abs(valence - 0.5) * 2 # 0-1,离中心越远越清晰 + + # 情绪清晰度 + valence_distance = abs(valence - 0.5) * 2 arousal_distance = abs(arousal - 0.5) * 2 clarity = (valence_distance + arousal_distance) / 2 - + scores = { "valence": round(valence * 10, 2), "arousal": round(arousal * 10, 2), "clarity": round(clarity * 10, 2), } - - # 情绪评分:清晰度为主 + score = 0.40 * scores["clarity"] + 0.30 * scores["valence"] + 0.30 * scores["arousal"] scores["score"] = round(min(score, 10), 2) - + return scores diff --git a/songrate/analyzers/quality.py b/songrate/analyzers/quality.py index 287fbce..f9062bc 100644 --- a/songrate/analyzers/quality.py +++ b/songrate/analyzers/quality.py @@ -1,63 +1,69 @@ -"""音频质量分析 - 信噪比、削波检测、频率均衡""" -import numpy as np +"""音频质量分析 - GPU 版本""" +import torch from . import safe_float -def analyze_quality(y, sr): - """ - 分析音频质量维度 - """ - import librosa - - # 信噪比估算 - 信号能量 vs 安静段能量 - rms = librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0] - rms_sorted = np.sort(rms) - n = len(rms_sorted) - if n > 10: - noise_floor = np.mean(rms_sorted[:n // 10]) # 最安静的 10% - signal_level = np.mean(rms_sorted[-n // 10:]) # 最响的 10% +def analyze_quality(y, sr, stft_result=None): + """分析音频质量维度 (GPU)""" + + # 信噪比估算 + frame_length = 2048 + hop_length = 512 + n_frames = (len(y) - frame_length) // hop_length + 1 + + if n_frames > 10: + frames = torch.nn.functional.unfold( + y.unsqueeze(0).unsqueeze(0), + kernel_size=(1, frame_length), + stride=(1, hop_length) + ).squeeze(0).squeeze(0) + rms = torch.sqrt(torch.mean(frames ** 2, dim=0)) + rms_sorted, _ = torch.sort(rms) + n = len(rms_sorted) + noise_floor = rms_sorted[:n // 10].mean() + signal_level = rms_sorted[-n // 10:].mean() if noise_floor > 0: - snr_db = safe_float(20 * np.log10(signal_level / noise_floor)) + snr_db = safe_float(20 * torch.log10(signal_level / noise_floor)) else: - snr_db = 60.0 # 无噪声 + snr_db = 60.0 else: snr_db = 30.0 - - # 削波检测 - 检查是否有接近 1.0 或 -1.0 的采样 - clipped = np.sum(np.abs(y) > 0.99) / len(y) + + # 削波检测 + clipped = (torch.abs(y) > 0.99).sum().item() / len(y) clipping_score = 10.0 if clipped < 0.001 else max(0, 10.0 - clipped * 10000) - + # 频率均衡 - 频谱平坦度 - spectral_flatness = librosa.feature.spectral_flatness(y=y)[0] - flatness_mean = safe_float(np.mean(spectral_flatness)) - # 频谱太平坦 = 白噪声,太不平坦 = 某些频段缺失 - # 理想范围:0.001 - 0.1 + if stft_result is not None: + magnitude = stft_result.abs() + geometric_mean = torch.exp(torch.log(torch.clamp(magnitude, 1e-10)).mean(dim=0)) + arithmetic_mean = magnitude.mean(dim=0) + flatness = geometric_mean / torch.clamp(arithmetic_mean, 1e-10) + flatness_mean = safe_float(flatness.mean()) + else: + flatness_mean = 0.05 + if 0.001 <= flatness_mean <= 0.1: freq_balance = 8.0 elif flatness_mean < 0.001: - freq_balance = 6.0 # 过于集中 + freq_balance = 6.0 else: - freq_balance = 5.0 # 过于平坦 - + freq_balance = 5.0 + scores = { "snr": round(snr_db, 2), "clipping": round(clipping_score, 2), "frequency_balance": round(freq_balance, 2), } - - # SNR 评分: >40dB 优秀, 20-40 良好, <20 差 + if snr_db >= 40: snr_score = 10.0 elif snr_db >= 20: snr_score = 6.0 + (snr_db - 20) / 20 * 4.0 else: snr_score = max(snr_db / 20 * 6.0, 0) - - score = ( - 0.40 * snr_score + - 0.35 * scores["clipping"] + - 0.25 * scores["frequency_balance"] - ) + + score = 0.40 * snr_score + 0.35 * scores["clipping"] + 0.25 * scores["frequency_balance"] scores["score"] = round(min(score, 10), 2) - + return scores diff --git a/songrate/analyzers/rhythm.py b/songrate/analyzers/rhythm.py index 1934db0..8a908e9 100644 --- a/songrate/analyzers/rhythm.py +++ b/songrate/analyzers/rhythm.py @@ -1,65 +1,93 @@ -"""节奏分析 - BPM、节拍清晰度、稳定性、律动感""" -import numpy as np +"""节奏分析 - GPU 版本 (PyTorch)""" +import torch from . import safe_float -def analyze_rhythm(y, sr): +def analyze_rhythm(y, sr, stft_result=None): """ - 分析节奏维度 - 输入: y (音频数据), sr (采样率) - 输出: dict {bpm, beat_clarity, stability, groove, score} + 分析节奏维度 (GPU) + 输入: y (GPU tensor), sr, stft_result (预计算STFT) """ - import librosa - - # BPM - tempo, beats = librosa.beat.beat_track(y=y, sr=sr) - bpm = safe_float(tempo) - - # 节拍清晰度 (beat_track 返回的 confidence) - # 新版 librosa 返回 tuple (tempo, beats),confidence 需要从 onset strength 推算 - onset_env = librosa.onset.onset_strength(y=y, sr=sr) - pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr) - beat_clarity = safe_float(np.mean(pulse)) - # 归一化到 0-1 - beat_clarity = min(beat_clarity, 1.0) - - # 节奏稳定性 - 节拍间隔的变异系数 - if len(beats) > 2: - beat_times = librosa.frames_to_time(beats, sr=sr) - intervals = np.diff(beat_times) - if np.mean(intervals) > 0: - cv = np.std(intervals) / np.mean(intervals) - stability = safe_float(1.0 - min(cv, 1.0)) + device = y.device + + # 使用预计算的 STFT 或重新计算 + if stft_result is None: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + magnitude = stft_result.abs() + + # Onset strength (频谱变化率) + spectral_flux = torch.diff(magnitude.sum(dim=0)) + spectral_flux = torch.nn.functional.relu(spectral_flux) # 只保留正变化 + + # 自相关找 BPM + if len(spectral_flux) > 100: + autocorr = torch.nn.functional.conv1d( + spectral_flux.unsqueeze(0).unsqueeze(0), + spectral_flux.unsqueeze(0).unsqueeze(0).flip(-1), + padding=len(spectral_flux) - 1 + ).squeeze() + + # 找第一个峰值 (排除前 10 个采样点) + autocorr_lag = autocorr[10:] + if len(autocorr_lag) > 0: + peak_idx = torch.argmax(autocorr_lag).item() + # BPM = 60 * sr / hop_length / peak_idx + hop_per_second = sr / 512 + bpm = 60.0 * hop_per_second / (peak_idx + 10) else: - stability = 0.0 + bpm = 120.0 + else: + bpm = 120.0 + + # 节拍清晰度 - onset 的峰度 + if len(spectral_flux) > 10: + mean_flux = spectral_flux.mean() + std_flux = spectral_flux.std() + if std_flux > 0: + kurtosis = torch.mean(((spectral_flux - mean_flux) / std_flux) ** 4) + beat_clarity = safe_float(torch.clamp(kurtosis / 10.0, 0, 1)) + else: + beat_clarity = 0.3 + else: + beat_clarity = 0.5 + + # 节奏稳定性 - onset 间隔的变异系数 + if len(spectral_flux) > 20: + # 简单检测峰值 + peaks = (spectral_flux[1:-1] > spectral_flux[:-2]) & (spectral_flux[1:-1] > spectral_flux[2:]) + peak_indices = torch.where(peaks)[0] + if len(peak_indices) > 2: + intervals = torch.diff(peak_indices.float()) + cv = intervals.std() / intervals.mean() if intervals.mean() > 0 else 1.0 + stability = safe_float(1.0 - torch.clamp(cv, 0, 1)) + else: + stability = 0.5 else: stability = 0.5 - - # 律动感 - 基于 onset strength 的方差(高方差 = 更多动态变化 = 更有律动) - if len(onset_env) > 0: - onset_std = np.std(onset_env) - onset_mean = np.mean(onset_env) + + # 律动感 - onset 的方差/均值 + if len(spectral_flux) > 0: + onset_mean = spectral_flux.mean() + onset_std = spectral_flux.std() if onset_mean > 0: - groove = safe_float(min(onset_std / onset_mean, 1.0)) + groove = safe_float(torch.clamp(onset_std / onset_mean, 0, 1)) else: groove = 0.0 else: groove = 0.0 - - # 子维度评分(均为 0-10) + scores = { - "bpm": bpm, + "bpm": round(safe_float(bpm), 1), "beat_clarity": round(beat_clarity * 10, 2), "stability": round(stability * 10, 2), "groove": round(groove * 10, 2), } - - # 维度综合分(不限制 BPM,只评估质量) + score = ( 0.35 * scores["beat_clarity"] + 0.30 * scores["stability"] + 0.35 * scores["groove"] ) scores["score"] = round(score, 2) - + return scores diff --git a/songrate/analyzers/timbre.py b/songrate/analyzers/timbre.py index e0252a3..dc6b93f 100644 --- a/songrate/analyzers/timbre.py +++ b/songrate/analyzers/timbre.py @@ -1,54 +1,58 @@ -"""音色分析 - 音色丰富度、频谱平衡""" -import numpy as np +"""音色分析 - GPU 版本""" +import torch +import torchaudio from . import safe_float -def analyze_timbre(y, sr): - """ - 分析音色维度 - """ - import librosa - - # 频谱质心 (brightness) - spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0] - centroid_mean = safe_float(np.mean(spectral_centroid)) - - # MFCC (13维音色指纹) - mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13) - mfcc_mean = np.mean(mfcc, axis=1) - - # 音色丰富度 - MFCC 的方差(方差大 = 音色变化丰富) - mfcc_variance = np.var(mfcc, axis=1) - richness = safe_float(np.mean(mfcc_variance)) - richness_norm = min(richness / 100, 1.0) # 归一化 - - # 频谱平衡 - 各频段能量分布的均匀度 - S = np.abs(librosa.stft(y)) - freqs = librosa.fft_frequencies(sr=sr) - - # 分频段:低频 (<250Hz), 中频 (250-2000Hz), 高频 (>2000Hz) - low = np.sum(S[freqs < 250] ** 2) - mid = np.sum((freqs >= 250) & (freqs < 2000)) - mid_energy = np.sum(S[(freqs >= 250) & (freqs < 2000)] ** 2) - high_energy = np.sum(S[freqs >= 2000] ** 2) - - total = low + mid_energy + high_energy +def analyze_timbre(y, sr, stft_result=None): + """分析音色维度 (GPU)""" + device = y.device + + if stft_result is None: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + magnitude = stft_result.abs() + + freqs = torch.fft.fftfreq(2048, 1.0/sr)[:1025] + + # 频谱质心 + freq_sum = (freqs.unsqueeze(1) * magnitude).sum(dim=0) + mag_sum = magnitude.sum(dim=0) + spectral_centroid = freq_sum / torch.clamp(mag_sum, 1e-10) + centroid_mean = safe_float(spectral_centroid.mean()) + + # MFCC (简化版 - 用 mel 滤波器组) + mel_spec = torchaudio.transforms.MelSpectrogram( + sample_rate=sr, n_fft=2048, hop_length=512, n_mels=40 + ).to(device)(y) + mel_log = torch.log(torch.clamp(mel_spec, 1e-10)) + + # DCT 得到 MFCC + mfcc = torch.fft.fft(mel_log, dim=0).real[:13] + mfcc_variance = torch.var(mfcc, dim=1) + richness = safe_float(mfcc_variance.mean()) + richness_norm = min(richness / 100, 1.0) + + # 频谱平衡 + power = magnitude ** 2 + low = power[freqs < 250].sum() + mid = power[(freqs >= 250) & (freqs < 2000)].sum() + high = power[freqs >= 2000].sum() + total = low + mid + high + if total > 0: - ratios = [low / total, mid_energy / total, high_energy / total] - # 理想比例:低频 0.2-0.4,中频 0.3-0.5,高频 0.1-0.3 - # 计算与理想比例的偏差 + ratios = [safe_float(low / total), safe_float(mid / total), safe_float(high / total)] ideal = [0.3, 0.4, 0.2] deviation = sum(abs(r - i) for r, i in zip(ratios, ideal)) balance = safe_float(1.0 - min(deviation / 1.0, 1.0)) else: balance = 0.0 - + scores = { "richness": round(richness_norm * 10, 2), "balance": round(balance * 10, 2), } - + score = 0.50 * scores["richness"] + 0.50 * scores["balance"] scores["score"] = round(min(score, 10), 2) - + return scores diff --git a/songrate/analyzers/tonality.py b/songrate/analyzers/tonality.py index de9bfbc..9f92ef8 100644 --- a/songrate/analyzers/tonality.py +++ b/songrate/analyzers/tonality.py @@ -1,57 +1,61 @@ -"""调性分析 - 调性清晰度、和声丰富度、转调合理性""" -import numpy as np +"""调性分析 - GPU 版本""" +import torch from . import safe_float -def analyze_tonality(y, sr): - """ - 分析调性维度 - """ - import librosa - - # Chroma 特征 - chroma = librosa.feature.chroma_stft(y=y, sr=sr) - - # 调性清晰度 - chroma 的峰值与均值之比 - chroma_mean = np.mean(chroma, axis=1) - if np.sum(chroma_mean) > 0: - peak_ratio = np.max(chroma_mean) / np.mean(chroma_mean) - key_clarity = safe_float(min((peak_ratio - 1.0) / 2.0, 1.0)) # 峰值是均值的 2 倍以上满分 - else: - key_clarity = 0.0 - - # 和声丰富度 - 活跃音级数量 - active_notes = np.sum(chroma_mean > np.mean(chroma_mean) * 0.5) - harmony = safe_float(min(active_notes / 7.0, 1.0)) # 7 个以上活跃音级满分 - - # 转调检测 - 滑动窗口 chroma 变化 - hop = chroma.shape[1] // 4 - if hop > 10 and chroma.shape[1] > hop * 2: - seg1 = np.mean(chroma[:, :hop], axis=1) - seg4 = np.mean(chroma[:, -hop:], axis=1) - # 余弦相似度 - cos_sim = np.dot(seg1, seg4) / (np.linalg.norm(seg1) * np.linalg.norm(seg4) + 1e-10) - # 相似度高 = 没转调(稳定),中等 = 有转调(合理),低 = 大转调 +def analyze_tonality(y, sr, stft_result=None): + """分析调性维度 (GPU)""" + device = y.device + + if stft_result is None: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + magnitude = stft_result.abs() + + freqs = torch.fft.fftfreq(2048, 1.0/sr)[:1025] + + # Chroma 特征 (12 音级) + chroma = torch.zeros(12, device=device) + note_freqs = 440 * (2 ** (torch.arange(12, device=device, dtype=torch.float32) / 12)) + for i, nf in enumerate(note_freqs): + bin_idx = torch.argmin(torch.abs(freqs - nf)).item() + if bin_idx < magnitude.shape[0]: + chroma[i] = magnitude[bin_idx].mean() + + chroma = chroma / torch.clamp(chroma.sum(), 1e-10) + + # 调性清晰度 + chroma_mean = chroma + peak_ratio = chroma_mean.max() / torch.clamp(chroma_mean.mean(), 1e-10) + key_clarity = safe_float(torch.clamp((peak_ratio - 1.0) / 2.0, 0, 1)) + + # 和声丰富度 + active_notes = (chroma_mean > chroma_mean.mean() * 0.5).sum().item() + harmony = safe_float(min(active_notes / 7.0, 1.0)) + + # 转调检测 (简化 - 前后段 chroma 差异) + n_frames = magnitude.shape[1] + hop = n_frames // 4 + if hop > 10 and n_frames > hop * 2: + seg1 = magnitude[:, :hop].mean(dim=1) + seg4 = magnitude[:, -hop:].mean(dim=1) + cos_sim = torch.dot(seg1, seg4) / (torch.norm(seg1) * torch.norm(seg4) + 1e-10) + cos_sim = safe_float(cos_sim) if cos_sim > 0.8: - modulation_score = 0.8 # 稳定 + modulation_score = 0.8 elif cos_sim > 0.5: - modulation_score = 1.0 # 适度转调 + modulation_score = 1.0 else: - modulation_score = 0.5 # 大转调 + modulation_score = 0.5 else: - modulation_score = 0.7 # 无法判断,给中等分 - + modulation_score = 0.7 + scores = { "key_clarity": round(key_clarity * 10, 2), "harmony": round(harmony * 10, 2), "modulation": round(modulation_score * 10, 2), } - - score = ( - 0.40 * scores["key_clarity"] + - 0.35 * scores["harmony"] + - 0.25 * scores["modulation"] - ) + + score = 0.40 * scores["key_clarity"] + 0.35 * scores["harmony"] + 0.25 * scores["modulation"] scores["score"] = round(min(score, 10), 2) - + return scores diff --git a/songrate/evaluator.py b/songrate/evaluator.py index 8ac4d56..6c0b669 100644 --- a/songrate/evaluator.py +++ b/songrate/evaluator.py @@ -1,7 +1,8 @@ -"""主评估逻辑 - 编排所有分析器,返回加权总分""" +"""主评估逻辑 - GPU 版本,预计算公共特征""" import os -import numpy as np +import torch from .scenes import get_scene_config, SCENES, DIMENSIONS +from .analyzers import load_audio, get_device from .analyzers.rhythm import analyze_rhythm from .analyzers.danceability import analyze_danceability from .analyzers.energy import analyze_energy @@ -24,7 +25,7 @@ ANALYZER_MAP = { def evaluate_song(filepath, scene="pop"): """ - 评估一首歌曲 + 评估一首歌曲 (GPU 加速) 参数: filepath: 音频文件路径 @@ -34,29 +35,33 @@ def evaluate_song(filepath, scene="pop"): dict: { "total_score": float, "scene": str, - "dimensions": [ - {"key": str, "name": str, "score": float, "weight": float, "weighted": float, "details": dict} - ] + "device": str, + "dimensions": [...] } """ - import librosa - if scene not in SCENES: return {"error": f"未知场景: {scene}", "available_scenes": list(SCENES.keys())} if not os.path.exists(filepath): return {"error": f"文件不存在: {filepath}"} - # 加载音频 + device = get_device() + + # 加载音频到 GPU try: - y, sr = librosa.load(filepath, sr=22050) + y, sr = load_audio(filepath, sr=22050) except Exception as e: return {"error": f"音频加载失败: {str(e)}"} + # 预计算 STFT (所有分析器共享) + try: + stft_result = torch.stft(y, n_fft=2048, hop_length=512, return_complex=True) + except Exception as e: + return {"error": f"STFT 计算失败: {str(e)}"} + config = get_scene_config(scene) weights = config["weights"] - # 运行各维度分析器(仅运行启用且权重 > 0 的维度) dimensions = [] total_weighted = 0.0 @@ -69,7 +74,7 @@ def evaluate_song(filepath, scene="pop"): continue try: - result = analyzer(y, sr) + result = analyzer(y, sr, stft_result=stft_result) dim_score = result.pop("score", 0) dim_name = DIMENSIONS.get(dim_key, {}).get("name", dim_key) weighted = dim_score * weight @@ -93,12 +98,17 @@ def evaluate_song(filepath, scene="pop"): "details": {"error": str(e)} }) - # 按权重排序 + # 清理 GPU 内存 + del y, stft_result + if torch.cuda.is_available(): + torch.cuda.empty_cache() + dimensions.sort(key=lambda d: d["weight"], reverse=True) return { "total_score": round(total_weighted, 2), "scene": scene, "scene_name": config["name"], + "device": str(device), "dimensions": dimensions }