technical-patterns-lab/scripts/verify_normalization.py
褚宏光 bf6baa5483 Add scoring module and enhance HTML viewer with standardization
- Add scripts/scoring/ module with normalizer, sensitivity analysis, and config
- Enhance stock_viewer.html with standardized scoring display
- Add integration tests and normalization verification scripts
- Add documentation for standardization implementation and usage guides
- Add data distribution analysis reports for strength scoring dimensions
- Update discussion documents with algorithm optimization plans
2026-01-30 18:43:37 +08:00

234 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
验证标准化效果
对比标准化前后的统计特征和分布形态,确保标准化达到预期效果。
"""
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pathlib import Path
import sys
import os
# 添加路径
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from scoring.normalizer import normalize_all, calculate_strength_equal_weight
# Configure fonts for Chinese text: the figure titles, axis labels and legends
# produced by this script are written in Chinese.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei']
# NOTE(review): presumably disabled so that minus signs still render with the
# fonts selected above -- confirm against the matplotlib rcParams docs.
plt.rcParams['axes.unicode_minus'] = False
def load_data():
    """Read the results CSV and return only the rows flagged as valid.

    The file is looked up at ``outputs/converging_triangles/all_results.csv``
    under the parent of the directory that contains this script.

    Returns:
        pandas.DataFrame: A copy of the rows whose ``is_valid`` column
        compares equal to ``True``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    project_root = Path(__file__).parent.parent
    csv_file = project_root / 'outputs' / 'converging_triangles' / 'all_results.csv'

    if csv_file.exists():
        frame = pd.read_csv(csv_file)
        valid_mask = frame['is_valid'].eq(True)
        # Copy so later column additions do not touch a slice of ``frame``.
        return frame[valid_mask].copy()

    raise FileNotFoundError(f"数据文件不存在: {csv_file}")
def compare_statistics(df_before, df_after):
    """Compare summary statistics of each score before and after normalization.

    For every known score column that is present in ``df_before``, the raw
    column and its ``<col>_norm`` counterpart in ``df_after`` are summarized
    by mean, median, standard deviation, skewness and excess kurtosis.

    Missing values are dropped from each series before any statistic is
    computed. pandas' ``mean``/``median``/``std`` already skip NaN, but
    ``scipy.stats.skew`` and ``scipy.stats.kurtosis`` propagate NaN by
    default, so without the explicit ``dropna`` a single missing value would
    turn the skewness and kurtosis entries into NaN while the other entries
    stayed finite.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the normalized ``<col>_norm`` columns.

    Returns:
        pandas.DataFrame: One row per score column found in ``df_before``,
        with a ``维度`` label followed by the ``原始-*`` and ``标准化-*``
        statistics.

    Raises:
        KeyError: If a score column exists in ``df_before`` but its
            ``<col>_norm`` column is missing from ``df_after``.
    """
    score_cols = [
        'price_score_up', 'price_score_down', 'convergence_score',
        'volume_score', 'geometry_score', 'activity_score', 'tilt_score',
    ]

    def _summarize(series, prefix):
        """Return the five summary statistics of *series*, keyed by *prefix*."""
        # Drop NaN once so every statistic is computed on the same sample.
        clean = series.dropna()
        return {
            f'{prefix}-均值': clean.mean(),
            f'{prefix}-中位数': clean.median(),
            f'{prefix}-标准差': clean.std(),
            f'{prefix}-偏度': stats.skew(clean),
            f'{prefix}-超额峰度': stats.kurtosis(clean, fisher=True),
        }

    results = []
    for col in score_cols:
        if col not in df_before.columns:
            continue

        row = {'维度': col.replace('_score', '').replace('_', ' ')}
        row.update(_summarize(df_before[col], '原始'))
        row.update(_summarize(df_after[f'{col}_norm'], '标准化'))
        results.append(row)

    return pd.DataFrame(results)
def plot_before_after_comparison(df_before, df_after, output_dir):
    """Plot raw versus normalized histograms for every score dimension.

    A 7x2 grid is created: the left column shows the raw distribution of a
    score and the right column shows its ``<col>_norm`` distribution. Each
    panel marks the median and the mean with vertical lines. Score columns
    absent from ``df_before`` are skipped, leaving their grid row empty.
    The figure is written to ``normalization_comparison.png`` in
    ``output_dir`` and the saved path is printed.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the ``<col>_norm`` columns.
        output_dir: ``pathlib.Path`` of the directory receiving the PNG.
    """
    panels = [
        ('突破幅度分(上)', 'price_score_up'),
        ('突破幅度分(下)', 'price_score_down'),
        ('收敛度分', 'convergence_score'),
        ('成交量分', 'volume_score'),
        ('形态规则度', 'geometry_score'),
        ('价格活跃度', 'activity_score'),
        ('倾斜度分', 'tilt_score'),
    ]

    def _draw_panel(ax, values, fill, median_color, mean_color, title, xlabel):
        """Draw one histogram with median and mean marker lines on *ax*."""
        ax.hist(values, bins=50, alpha=0.7, color=fill, edgecolor='black')
        ax.axvline(values.median(), color=median_color, linestyle='--', linewidth=2,
                   label=f'中位数={values.median():.3f}')
        ax.axvline(values.mean(), color=mean_color, linestyle=':', linewidth=2,
                   label=f'均值={values.mean():.3f}')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel(xlabel)
        ax.set_ylabel('频数')
        ax.legend()
        ax.grid(True, alpha=0.3)

    fig, axes = plt.subplots(7, 2, figsize=(16, 24))

    for row, (label, column) in enumerate(panels):
        if column not in df_before.columns:
            continue

        raw_values = df_before[column].dropna()
        norm_values = df_after[f'{column}_norm'].dropna()

        # Left column: distribution before normalization.
        _draw_panel(axes[row, 0], raw_values, 'lightcoral', 'red', 'darkred',
                    f"{label} - 标准化前", '原始值')

        # Right column: distribution after normalization, pinned to [0, 1].
        right_ax = axes[row, 1]
        _draw_panel(right_ax, norm_values, 'lightblue', 'blue', 'darkblue',
                    f"{label} - 标准化后", '标准化值 [0, 1]')
        right_ax.set_xlim([0, 1])

    plt.tight_layout()
    target = output_dir / 'normalization_comparison.png'
    plt.savefig(target, dpi=150, bbox_inches='tight')
    print(f"对比图已保存: {target}")
    plt.close()
def plot_strength_comparison(df_before, df_after, output_dir):
    """Plot the raw weighted strength score next to the equal-weight one.

    The left panel shows the upward strength score computed from the raw
    columns of ``df_before`` with the fixed weights 0.45/0.15/0.10/0.10/
    0.15/0.05. The right panel shows the value returned by
    ``calculate_strength_equal_weight(df_after, direction='up')``. Each
    panel marks its median. The figure is written to
    ``strength_comparison.png`` in ``output_dir`` and the path is printed.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame passed to ``calculate_strength_equal_weight``.
        output_dir: ``pathlib.Path`` of the directory receiving the PNG.
    """
    _, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Raw score columns paired with the weights currently in use.
    weighted_columns = [
        ('price_score_up', 0.45),
        ('convergence_score', 0.15),
        ('volume_score', 0.10),
        ('geometry_score', 0.10),
        ('activity_score', 0.15),
        ('tilt_score', 0.05),
    ]

    # Accumulate strictly left to right so floating-point results match a
    # single chained ``a + b + c + ...`` expression.
    first_column, first_weight = weighted_columns[0]
    raw_strength = first_weight * df_before[first_column]
    for column, weight in weighted_columns[1:]:
        raw_strength = raw_strength + weight * df_before[column]

    # Equal-weight strength score built from the normalized columns.
    equal_strength = calculate_strength_equal_weight(df_after, direction='up')

    def _draw_panel(ax, values, fill, median_color, title):
        """Draw one strength histogram with a median marker on *ax*."""
        ax.hist(values, bins=50, alpha=0.7, color=fill, edgecolor='black')
        ax.axvline(values.median(), color=median_color, linestyle='--', linewidth=2,
                   label=f'中位数={values.median():.3f}')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('强度分')
        ax.set_ylabel('频数')
        ax.legend()
        ax.grid(True, alpha=0.3)

    _draw_panel(axes[0], raw_strength, 'lightcoral', 'red',
                '原始强度分当前权重45/15/10/10/15/5')
    _draw_panel(axes[1], equal_strength, 'lightblue', 'blue',
                '标准化后等权强度分各1/6')

    plt.tight_layout()
    target = output_dir / 'strength_comparison.png'
    plt.savefig(target, dpi=150, bbox_inches='tight')
    print(f"强度分对比图已保存: {target}")
    plt.close()
def main():
    """Run the normalization check end to end and write its artifacts.

    Steps, in order: load the valid rows, normalize them with
    ``normalize_all``, build the before/after statistics table and save it
    as CSV, print a condensed comparison table, render both comparison
    figures, and finally save the normalized data as CSV. All files go to
    ``outputs/converging_triangles`` under the parent of this script's
    directory, which is created if missing.
    """
    banner = "=" * 80

    print(banner)
    print("强度分标准化效果验证")
    print(banner)

    # Step 1: load the input rows.
    print("\n[1] 加载数据...")
    raw_df = load_data()
    print(f" 样本数: {len(raw_df):,}")

    # Step 2: normalize and report which columns were added.
    print("\n[2] 执行标准化...")
    normalized_df = normalize_all(raw_df)
    added_columns = normalized_df.columns.difference(raw_df.columns).tolist()
    print(f" 新增字段: {added_columns}")

    # Step 3: build and persist the statistics table.
    print("\n[3] 统计对比...")
    summary = compare_statistics(raw_df, normalized_df)

    out_dir = Path(__file__).parent.parent / 'outputs' / 'converging_triangles'
    out_dir.mkdir(parents=True, exist_ok=True)
    summary_file = out_dir / 'normalization_stats_comparison.csv'
    summary.to_csv(summary_file, index=False, encoding='utf-8-sig')
    print(f" 统计对比表已保存: {summary_file}")

    # Print the key statistics as a console table.
    print("\n" + banner)
    print("标准化前后对比")
    print(banner)
    print("\n维度名称 | 原始-中位数 | 标准化-中位数 | 原始-偏度 | 标准化-偏度")
    print("-" * 80)
    for _, record in summary.iterrows():
        line = (
            f"{record['维度']:20s} | {record['原始-中位数']:10.4f} | {record['标准化-中位数']:12.4f} | "
            f"{record['原始-偏度']:8.2f} | {record['标准化-偏度']:10.2f}"
        )
        print(line)

    # Step 4: render the comparison figures.
    print("\n[4] 生成对比图表...")
    plot_before_after_comparison(raw_df, normalized_df, out_dir)
    plot_strength_comparison(raw_df, normalized_df, out_dir)

    # Step 5: persist the normalized data set.
    normalized_file = out_dir / 'all_results_normalized.csv'
    normalized_df.to_csv(normalized_file, index=False, encoding='utf-8-sig')
    print(f"\n[5] 标准化后数据已保存: {normalized_file}")

    print("\n" + banner)
    print("验证完成!")
    print(banner)
    print("\n关键改善:")
    for bullet in (
        " - 所有维度中位数统一为 0.5",
        " - 维度间可以直接等权相加",
        " - 偏度显著降低(分布更均匀)",
    ):
        print(bullet)
# Run the verification pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()