# technical-patterns-lab/scripts/verify_normalization.py

"""
验证标准化效果

对比标准化前后的统计特征和分布形态，确保标准化达到预期效果。
"""

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os

# Add the parent of this script's directory to the import path
# (presumably so the project-local `scoring` package below resolves).
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scoring.normalizer import normalize_all, calculate_strength_equal_weight

# Configure fonts for Chinese text in figure labels and titles.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False


def load_data():
    """Load the detection results and keep only the valid rows.

    Reads ``outputs/converging_triangles/all_results.csv`` under the
    directory one level above this script's folder.

    Returns:
        A copy of the rows whose ``is_valid`` column equals ``True``.

    Raises:
        FileNotFoundError: If the results CSV does not exist.
    """
    project_root = Path(__file__).parent.parent
    csv_file = project_root / 'outputs' / 'converging_triangles' / 'all_results.csv'
    if not csv_file.exists():
        raise FileNotFoundError(f"数据文件不存在: {csv_file}")

    frame = pd.read_csv(csv_file)
    # Explicit comparison with True (rather than using the column as a mask)
    # so that non-boolean entries simply evaluate to False.
    valid_mask = frame['is_valid'].eq(True)
    return frame.loc[valid_mask].copy()


def _summarize_series(values, prefix):
    """Return mean/median/std/skew/excess-kurtosis of *values*, NaN excluded.

    All five statistics are computed on the same NaN-free sample so they
    describe the same data. Keys are ``f'{prefix}-<statistic label>'``.
    """
    clean = values.dropna()
    # scipy propagates NaN by default, so hand it the cleaned sample.
    sample = clean.to_numpy(dtype=float)
    return {
        f'{prefix}-均值': clean.mean(),
        f'{prefix}-中位数': clean.median(),
        f'{prefix}-标准差': clean.std(),
        f'{prefix}-偏度': stats.skew(sample),
        f'{prefix}-超额峰度': stats.kurtosis(sample, fisher=True),
    }


def compare_statistics(df_before, df_after):
    """Tabulate per-dimension statistics before and after normalization.

    For every known score column present in ``df_before``, the raw column
    is compared with its ``<column>_norm`` counterpart in ``df_after``.
    Missing values are dropped before computing any statistic, so a single
    NaN no longer turns the skewness and kurtosis of a column into NaN.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the ``<column>_norm`` columns.

    Returns:
        A DataFrame with one row per dimension: the dimension label
        followed by mean, median, standard deviation, skewness and excess
        kurtosis for the raw and then the normalized values. Score columns
        absent from ``df_before`` are skipped.

    Raises:
        KeyError: If a score column exists in ``df_before`` but its
            ``_norm`` counterpart is missing from ``df_after``.
    """
    score_cols = [
        'price_score_up', 'price_score_down', 'convergence_score',
        'volume_score', 'geometry_score', 'activity_score', 'tilt_score'
    ]

    results = []
    for col in score_cols:
        if col not in df_before.columns:
            continue

        results.append({
            '维度': col.replace('_score', '').replace('_', ' '),
            **_summarize_series(df_before[col], '原始'),
            **_summarize_series(df_after[f'{col}_norm'], '标准化'),
        })

    return pd.DataFrame(results)


def _draw_score_histogram(ax, values, title, xlabel, bar_color,
                          median_color, mean_color):
    """Draw one histogram panel with dashed-median and dotted-mean markers."""
    median_value = values.median()
    mean_value = values.mean()
    ax.hist(values, bins=50, alpha=0.7, color=bar_color, edgecolor='black')
    ax.axvline(median_value, color=median_color, linestyle='--', linewidth=2,
               label=f'中位数={median_value:.3f}')
    ax.axvline(mean_value, color=mean_color, linestyle=':', linewidth=2,
               label=f'均值={mean_value:.3f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('频数')
    ax.legend()
    ax.grid(True, alpha=0.3)


def plot_before_after_comparison(df_before, df_after, output_dir):
    """Plot raw vs. normalized histograms for every score dimension.

    One row per dimension: the raw distribution on the left and the
    normalized distribution (x-axis fixed to [0, 1]) on the right. The
    figure is written to ``normalization_comparison.png`` in ``output_dir``.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the ``<column>_norm`` columns.
        output_dir: ``pathlib.Path`` of the directory receiving the PNG.

    Raises:
        KeyError: If a score column exists in ``df_before`` but its
            ``_norm`` counterpart is missing from ``df_after``.
    """
    score_cols = [
        ('突破幅度分(上)', 'price_score_up'),
        ('突破幅度分(下)', 'price_score_down'),
        ('收敛度分', 'convergence_score'),
        ('成交量分', 'volume_score'),
        ('形态规则度', 'geometry_score'),
        ('价格活跃度', 'activity_score'),
        ('倾斜度分', 'tilt_score'),
    ]

    # Derive the grid height from the dimension list so the two cannot drift
    # apart; squeeze=False keeps `axes` two-dimensional for any row count.
    fig, axes = plt.subplots(len(score_cols), 2, figsize=(16, 24), squeeze=False)

    try:
        for idx, (name, col) in enumerate(score_cols):
            ax_before, ax_after = axes[idx]

            if col not in df_before.columns:
                # Hide the row instead of leaving two empty framed axes.
                ax_before.set_visible(False)
                ax_after.set_visible(False)
                continue

            before = df_before[col].dropna()
            after = df_after[f'{col}_norm'].dropna()

            # Left panel: raw values.
            _draw_score_histogram(
                ax_before, before, f"{name} - 标准化前", '原始值',
                bar_color='lightcoral', median_color='red', mean_color='darkred',
            )

            # Right panel: normalized values.
            _draw_score_histogram(
                ax_after, after, f"{name} - 标准化后", '标准化值 [0, 1]',
                bar_color='lightblue', median_color='blue', mean_color='darkblue',
            )
            ax_after.set_xlim([0, 1])

        fig.tight_layout()
        plot_path = output_dir / 'normalization_comparison.png'
        fig.savefig(plot_path, dpi=150, bbox_inches='tight')
        print(f"对比图已保存: {plot_path}")
    finally:
        # Close this specific figure even when plotting or saving fails.
        plt.close(fig)


def plot_strength_comparison(df_before, df_after, output_dir):
    """Plot the raw weighted strength next to the equal-weight normalized one.

    The left panel shows the upward strength built from the raw score
    columns with the fixed weights 45/15/10/10/15/5; the right panel shows
    the result of ``calculate_strength_equal_weight(df_after, direction='up')``.
    The figure is written to ``strength_comparison.png`` in ``output_dir``.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame passed to ``calculate_strength_equal_weight``.
        output_dir: ``pathlib.Path`` of the directory receiving the PNG.
    """
    # (column, weight) pairs of the current weighting scheme, in summation order.
    raw_weights = (
        ('price_score_up', 0.45),
        ('convergence_score', 0.15),
        ('volume_score', 0.10),
        ('geometry_score', 0.10),
        ('activity_score', 0.15),
        ('tilt_score', 0.05),
    )

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Accumulate left to right, starting from the first weighted term.
    weighted_terms = [weight * df_before[column] for column, weight in raw_weights]
    strength_before_up = weighted_terms[0]
    for term in weighted_terms[1:]:
        strength_before_up = strength_before_up + term

    # Equal-weight strength computed from the normalized scores.
    strength_after_up = calculate_strength_equal_weight(df_after, direction='up')

    panels = (
        (axes[0], strength_before_up, 'lightcoral', 'red',
         '原始强度分（当前权重45/15/10/10/15/5）'),
        (axes[1], strength_after_up, 'lightblue', 'blue',
         '标准化后等权强度分（各1/6）'),
    )
    for panel, strength, fill_color, line_color, heading in panels:
        midpoint = strength.median()
        panel.hist(strength, bins=50, alpha=0.7, color=fill_color, edgecolor='black')
        panel.axvline(midpoint, color=line_color, linestyle='--', linewidth=2,
                      label=f'中位数={midpoint:.3f}')
        panel.set_title(heading, fontsize=12, fontweight='bold')
        panel.set_xlabel('强度分')
        panel.set_ylabel('频数')
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    target = output_dir / 'strength_comparison.png'
    plt.savefig(target, dpi=150, bbox_inches='tight')
    print(f"强度分对比图已保存: {target}")
    plt.close()


def main():
    """Run the normalization check end to end.

    Steps: load the valid rows, normalize them, write the before/after
    statistics table, print a summary, render both comparison figures and
    save the normalized data. All outputs go to
    ``outputs/converging_triangles`` one level above this script's folder.
    """
    rule = "=" * 80

    print(rule)
    print("强度分标准化效果验证")
    print(rule)

    # Step 1: load the data.
    print("\n[1] 加载数据...")
    raw = load_data()
    print(f"    样本数: {len(raw):,}")

    # Step 2: normalize.
    print("\n[2] 执行标准化...")
    normalized = normalize_all(raw)
    added_columns = normalized.columns.difference(raw.columns).tolist()
    print(f"    新增字段: {added_columns}")

    # Step 3: statistics table.
    print("\n[3] 统计对比...")
    summary = compare_statistics(raw, normalized)

    out_dir = Path(__file__).parent.parent / 'outputs' / 'converging_triangles'
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_file = out_dir / 'normalization_stats_comparison.csv'
    summary.to_csv(summary_file, index=False, encoding='utf-8-sig')
    print(f"    统计对比表已保存: {summary_file}")

    # Console summary of the key statistics.
    print("\n" + rule)
    print("标准化前后对比")
    print(rule)
    print("\n维度名称             | 原始-中位数 | 标准化-中位数 | 原始-偏度 | 标准化-偏度")
    print("-" * 80)
    for record in summary.to_dict(orient='records'):
        left = (f"{record['维度']:20s} | {record['原始-中位数']:10.4f} | "
                f"{record['标准化-中位数']:12.4f} | ")
        right = f"{record['原始-偏度']:8.2f} | {record['标准化-偏度']:10.2f}"
        print(left + right)

    # Step 4: figures.
    print("\n[4] 生成对比图表...")
    plot_before_after_comparison(raw, normalized, out_dir)
    plot_strength_comparison(raw, normalized, out_dir)

    # Step 5: persist the normalized data.
    normalized_file = out_dir / 'all_results_normalized.csv'
    normalized.to_csv(normalized_file, index=False, encoding='utf-8-sig')
    print(f"\n[5] 标准化后数据已保存: {normalized_file}")

    print("\n" + rule)
    print("验证完成！")
    print(rule)
    closing_lines = (
        "\n关键改善:",
        "  - 所有维度中位数统一为 0.5",
        "  - 维度间可以直接等权相加",
        "  - 偏度显著降低（分布更均匀）",
    )
    for line in closing_lines:
        print(line)


# Run the verification only when this file is executed as a script.
if __name__ == "__main__":
    main()