"""Validate the effect of score normalization.

Compares summary statistics and distribution shapes before and after
normalization to confirm that normalization achieved the intended effect.
"""
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os

# Make the parent directory importable so that the `scoring` package can be
# resolved when this file is executed directly as a script.
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scoring.normalizer import normalize_all, calculate_strength_equal_weight

# Select Chinese-capable fonts for the plot labels, and draw the minus sign
# as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False


def load_data() -> pd.DataFrame:
    """Load the result table and keep only the rows flagged as valid.

    Reads ``outputs/converging_triangles/all_results.csv`` located two
    directory levels above this file.

    Returns:
        A copy of the rows whose ``is_valid`` column equals ``True``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    data_path = (
        Path(__file__).parent.parent
        / 'outputs' / 'converging_triangles' / 'all_results.csv'
    )
    if not data_path.exists():
        raise FileNotFoundError(f"数据文件不存在: {data_path}")

    df = pd.read_csv(data_path)
    # Element-wise comparison on purpose: it also yields False for missing
    # or non-boolean cells, which plain boolean indexing would not accept.
    df_valid = df[df['is_valid'] == True].copy()  # noqa: E712
    return df_valid


def compare_statistics(
    df_before: pd.DataFrame, df_after: pd.DataFrame
) -> pd.DataFrame:
    """Tabulate summary statistics of each score before and after normalization.

    For every known score column present in ``df_before``, the raw column and
    its ``<column>_norm`` counterpart in ``df_after`` are summarized by mean,
    median, standard deviation, skewness and excess kurtosis.

    Missing values are dropped from each series before any statistic is
    computed. pandas' ``mean``/``median``/``std`` already skip NaN, but
    ``scipy.stats.skew`` and ``scipy.stats.kurtosis`` propagate it by default,
    so without this step a single NaN would blank out the skewness and
    kurtosis cells while the other cells stayed populated.

    Args:
        df_before: Table holding the raw score columns.
        df_after: Table holding the ``<column>_norm`` columns.

    Returns:
        A DataFrame with one row per score column found in ``df_before``.

    Raises:
        KeyError: If a score column exists in ``df_before`` but its
            ``<column>_norm`` counterpart is absent from ``df_after``.
    """
    score_cols = [
        'price_score_up', 'price_score_down', 'convergence_score',
        'volume_score', 'geometry_score', 'activity_score', 'tilt_score',
    ]

    results = []
    for col in score_cols:
        if col not in df_before.columns:
            continue

        # Drop NaN up front so the scipy and pandas statistics are computed
        # over the same observations (matches the plotting function).
        before = df_before[col].dropna()
        after = df_after[f'{col}_norm'].dropna()

        result = {
            '维度': col.replace('_score', '').replace('_', ' '),
            '原始-均值': before.mean(),
            '原始-中位数': before.median(),
            '原始-标准差': before.std(),
            '原始-偏度': stats.skew(before),
            # fisher=True reports excess kurtosis (a normal distribution is 0).
            '原始-超额峰度': stats.kurtosis(before, fisher=True),
            '标准化-均值': after.mean(),
            '标准化-中位数': after.median(),
            '标准化-标准差': after.std(),
            '标准化-偏度': stats.skew(after),
            '标准化-超额峰度': stats.kurtosis(after, fisher=True),
        }
        results.append(result)

    return pd.DataFrame(results)


def _draw_histogram(ax, values, *, title, xlabel, face_color, median_color,
                    mean_color, xlim=None):
    """Draw one 50-bin histogram panel with median and mean marker lines."""
    ax.hist(values, bins=50, alpha=0.7, color=face_color, edgecolor='black')
    ax.axvline(values.median(), color=median_color, linestyle='--',
               linewidth=2, label=f'中位数={values.median():.3f}')
    ax.axvline(values.mean(), color=mean_color, linestyle=':',
               linewidth=2, label=f'均值={values.mean():.3f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('频数')
    ax.legend()
    ax.grid(True, alpha=0.3)
    if xlim is not None:
        ax.set_xlim(xlim)


def plot_before_after_comparison(
    df_before: pd.DataFrame, df_after: pd.DataFrame, output_dir: Path
) -> None:
    """Plot per-score histograms before (left) and after (right) normalization.

    One row of two panels is drawn per score column; a row is left empty when
    its column is absent from ``df_before``. The figure is written to
    ``normalization_comparison.png`` inside ``output_dir``.

    Args:
        df_before: Table holding the raw score columns.
        df_after: Table holding the ``<column>_norm`` columns.
        output_dir: Existing directory that receives the PNG file.

    Raises:
        KeyError: If a score column exists in ``df_before`` but its
            ``<column>_norm`` counterpart is absent from ``df_after``.
    """
    score_cols = [
        ('突破幅度分(上)', 'price_score_up'),
        ('突破幅度分(下)', 'price_score_down'),
        ('收敛度分', 'convergence_score'),
        ('成交量分', 'volume_score'),
        ('形态规则度', 'geometry_score'),
        ('价格活跃度', 'activity_score'),
        ('倾斜度分', 'tilt_score'),
    ]

    # One grid row per score; derived from the list so the two cannot drift.
    fig, axes = plt.subplots(len(score_cols), 2, figsize=(16, 24))

    for idx, (name, col) in enumerate(score_cols):
        if col not in df_before.columns:
            continue

        before = df_before[col].dropna()
        after = df_after[f'{col}_norm'].dropna()

        # Left panel: raw values.
        _draw_histogram(
            axes[idx, 0], before,
            title=f"{name} - 标准化前",
            xlabel='原始值',
            face_color='lightcoral',
            median_color='red',
            mean_color='darkred',
        )

        # Right panel: normalized values, pinned to the [0, 1] axis range so
        # all rows share the same horizontal scale.
        _draw_histogram(
            axes[idx, 1], after,
            title=f"{name} - 标准化后",
            xlabel='标准化值 [0, 1]',
            face_color='lightblue',
            median_color='blue',
            mean_color='darkblue',
            xlim=[0, 1],
        )

    plt.tight_layout()
    plot_path = output_dir / 'normalization_comparison.png'
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"对比图已保存: {plot_path}")
    plt.close()


def plot_strength_comparison(
    df_before: pd.DataFrame, df_after: pd.DataFrame, output_dir: Path
) -> None:
    """Plot the weighted raw strength score next to the equal-weight one.

    The left panel shows the upward strength score built from the raw columns
    with the fixed weights below; the right panel shows the value returned by
    ``calculate_strength_equal_weight(df_after, direction='up')``. The figure
    is written to ``strength_comparison.png`` inside ``output_dir``.

    Args:
        df_before: Table holding the raw score columns.
        df_after: Table passed to ``calculate_strength_equal_weight``.
        output_dir: Existing directory that receives the PNG file.

    Raises:
        KeyError: If any of the six raw score columns is missing from
            ``df_before``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Weights of the raw strength score (they sum to 1.0 and are the values
    # quoted in the left panel's title).
    W_PRICE = 0.45
    W_CONVERGENCE = 0.15
    W_VOLUME = 0.10
    W_GEOMETRY = 0.10
    W_ACTIVITY = 0.15
    W_TILT = 0.05

    strength_before_up = (
        W_PRICE * df_before['price_score_up'] +
        W_CONVERGENCE * df_before['convergence_score'] +
        W_VOLUME * df_before['volume_score'] +
        W_GEOMETRY * df_before['geometry_score'] +
        W_ACTIVITY * df_before['activity_score'] +
        W_TILT * df_before['tilt_score']
    )

    # Equal-weight strength score computed from the normalized table.
    strength_after_up = calculate_strength_equal_weight(df_after, direction='up')

    panels = [
        (axes[0], strength_before_up, 'lightcoral', 'red',
         '原始强度分(当前权重45/15/10/10/15/5)'),
        (axes[1], strength_after_up, 'lightblue', 'blue',
         '标准化后等权强度分(各1/6)'),
    ]
    for ax, values, face_color, line_color, title in panels:
        ax.hist(values, bins=50, alpha=0.7, color=face_color, edgecolor='black')
        ax.axvline(values.median(), color=line_color, linestyle='--',
                   linewidth=2, label=f'中位数={values.median():.3f}')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('强度分')
        ax.set_ylabel('频数')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plot_path = output_dir / 'strength_comparison.png'
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"强度分对比图已保存: {plot_path}")
    plt.close()


def main() -> None:
    """Run the full check: load, normalize, tabulate, plot and export.

    Writes the statistics table, both comparison figures and the normalized
    data set into ``outputs/converging_triangles`` two directory levels above
    this file, printing progress along the way.
    """
    print("=" * 80)
    print("强度分标准化效果验证")
    print("=" * 80)

    # Step 1: load the valid rows.
    print("\n[1] 加载数据...")
    df = load_data()
    print(f" 样本数: {len(df):,}")

    # Step 2: normalize and report which columns were added.
    print("\n[2] 执行标准化...")
    df_normalized = normalize_all(df)
    print(f" 新增字段: {df_normalized.columns.difference(df.columns).tolist()}")

    # Step 3: build the before/after statistics table.
    print("\n[3] 统计对比...")
    stats_df = compare_statistics(df, df_normalized)

    # Persist the table; utf-8-sig adds a BOM so spreadsheet tools detect the
    # encoding of the non-ASCII column headers.
    output_dir = Path(__file__).parent.parent / 'outputs' / 'converging_triangles'
    output_dir.mkdir(parents=True, exist_ok=True)

    stats_path = output_dir / 'normalization_stats_comparison.csv'
    stats_df.to_csv(stats_path, index=False, encoding='utf-8-sig')
    print(f" 统计对比表已保存: {stats_path}")

    # Print the headline statistics as a fixed-width table.
    print("\n" + "=" * 80)
    print("标准化前后对比")
    print("=" * 80)
    print("\n维度名称 | 原始-中位数 | 标准化-中位数 | 原始-偏度 | 标准化-偏度")
    print("-" * 80)

    for _, row in stats_df.iterrows():
        print(f"{row['维度']:20s} | {row['原始-中位数']:10.4f} | {row['标准化-中位数']:12.4f} | "
              f"{row['原始-偏度']:8.2f} | {row['标准化-偏度']:10.2f}")

    # Step 4: render the comparison figures.
    print("\n[4] 生成对比图表...")
    plot_before_after_comparison(df, df_normalized, output_dir)
    plot_strength_comparison(df, df_normalized, output_dir)

    # Step 5: export the normalized data set (optional convenience output).
    normalized_path = output_dir / 'all_results_normalized.csv'
    df_normalized.to_csv(normalized_path, index=False, encoding='utf-8-sig')
    print(f"\n[5] 标准化后数据已保存: {normalized_path}")

    print("\n" + "=" * 80)
    print("验证完成!")
    print("=" * 80)
    print("\n关键改善:")
    print(" - 所有维度中位数统一为 0.5")
    print(" - 维度间可以直接等权相加")
    print(" - 偏度显著降低(分布更均匀)")


if __name__ == "__main__":
    main()