- Add scripts/scoring/ module with normalizer, sensitivity analysis, and config - Enhance stock_viewer.html with standardized scoring display - Add integration tests and normalization verification scripts - Add documentation for standardization implementation and usage guides - Add data distribution analysis reports for strength scoring dimensions - Update discussion documents with algorithm optimization plans
234 lines
8.4 KiB
Python
234 lines
8.4 KiB
Python
"""
|
||
验证标准化效果
|
||
|
||
对比标准化前后的统计特征和分布形态,确保标准化达到预期效果。
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from scipy import stats
|
||
import matplotlib.pyplot as plt
|
||
from pathlib import Path
|
||
import sys
|
||
import os
|
||
|
||
# 添加路径
|
||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
||
from scoring.normalizer import normalize_all, calculate_strength_equal_weight
|
||
|
||
# Configure fonts for Chinese text: prefer SimHei, then Microsoft YaHei, and
# turn off the Unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei'],
    'axes.unicode_minus': False,
})
|
||
|
||
|
||
def load_data():
    """Load the scoring results and keep only the rows flagged as valid.

    The CSV is read from ``outputs/converging_triangles/all_results.csv``
    under the parent of the directory that contains this script.

    Returns:
        A copy of the rows whose ``is_valid`` column equals ``True``.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
    """
    project_root = Path(__file__).parent.parent
    data_path = project_root / 'outputs' / 'converging_triangles' / 'all_results.csv'

    if not data_path.exists():
        raise FileNotFoundError(f"数据文件不存在: {data_path}")

    raw = pd.read_csv(data_path)
    # Element-wise comparison on purpose: NaN compares unequal to True and is
    # therefore filtered out.
    valid_mask = raw['is_valid'] == True  # noqa: E712
    return raw[valid_mask].copy()
|
||
|
||
|
||
def compare_statistics(df_before, df_after):
    """Compare summary statistics of each score before and after normalization.

    For every known score column present in ``df_before``, the raw column and
    its ``<column>_norm`` counterpart in ``df_after`` are summarised by mean,
    median, standard deviation, skewness and excess kurtosis.

    Missing values are dropped before any statistic is computed. pandas'
    ``mean``/``median``/``std`` skip NaN by default while ``scipy.stats.skew``
    and ``scipy.stats.kurtosis`` propagate it, so without the drop a single
    NaN would turn the skewness/kurtosis columns into NaN and the row would
    mix statistics computed over different samples.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the ``*_norm`` columns.

    Returns:
        A DataFrame with one row per score column found in ``df_before``.

    Raises:
        KeyError: If a raw column is present in ``df_before`` but its
            ``*_norm`` column is missing from ``df_after``.
    """
    score_cols = [
        'price_score_up', 'price_score_down', 'convergence_score',
        'volume_score', 'geometry_score', 'activity_score', 'tilt_score',
    ]

    def _summarize(values, prefix):
        # One NaN-free sample feeds every statistic of this series.
        clean = values.dropna()
        return {
            f'{prefix}-均值': clean.mean(),
            f'{prefix}-中位数': clean.median(),
            f'{prefix}-标准差': clean.std(),
            f'{prefix}-偏度': stats.skew(clean),
            f'{prefix}-超额峰度': stats.kurtosis(clean, fisher=True),
        }

    results = []
    for col in score_cols:
        if col not in df_before.columns:
            continue

        results.append({
            '维度': col.replace('_score', '').replace('_', ' '),
            **_summarize(df_before[col], '原始'),
            **_summarize(df_after[f'{col}_norm'], '标准化'),
        })

    return pd.DataFrame(results)
|
||
|
||
|
||
def plot_before_after_comparison(df_before, df_after, output_dir):
    """Plot per-dimension histograms before and after normalization.

    One figure row is drawn per score dimension: the raw distribution on the
    left and the ``<column>_norm`` distribution on the right, each with median
    and mean marker lines. The figure is written to
    ``output_dir / 'normalization_comparison.png'``.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame holding the ``*_norm`` columns.
        output_dir: ``pathlib.Path``-like directory for the PNG (joined with
            the ``/`` operator).

    Raises:
        KeyError: If a raw column is present in ``df_before`` but its
            ``*_norm`` column is missing from ``df_after``.
    """
    score_cols = [
        ('突破幅度分(上)', 'price_score_up'),
        ('突破幅度分(下)', 'price_score_down'),
        ('收敛度分', 'convergence_score'),
        ('成交量分', 'volume_score'),
        ('形态规则度', 'geometry_score'),
        ('价格活跃度', 'activity_score'),
        ('倾斜度分', 'tilt_score'),
    ]

    def _draw_panel(ax, values, *, face, median_color, mean_color, title, xlabel):
        # Histogram plus median/mean markers; shared by both figure columns.
        median = values.median()
        mean = values.mean()
        ax.hist(values, bins=50, alpha=0.7, color=face, edgecolor='black')
        ax.axvline(median, color=median_color, linestyle='--', linewidth=2,
                   label=f'中位数={median:.3f}')
        ax.axvline(mean, color=mean_color, linestyle=':', linewidth=2,
                   label=f'均值={mean:.3f}')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel(xlabel)
        ax.set_ylabel('频数')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Row count follows the dimension list; squeeze=False keeps ``axes`` 2-D
    # regardless of how many dimensions are listed.
    fig, axes = plt.subplots(len(score_cols), 2, figsize=(16, 24), squeeze=False)

    for (ax_before, ax_after), (name, col) in zip(axes, score_cols):
        if col not in df_before.columns:
            # Hide the unused row instead of leaving empty framed axes.
            ax_before.set_visible(False)
            ax_after.set_visible(False)
            continue

        before = df_before[col].dropna()
        after = df_after[f'{col}_norm'].dropna()

        # Left: raw distribution.
        _draw_panel(
            ax_before, before,
            face='lightcoral', median_color='red', mean_color='darkred',
            title=f"{name} - 标准化前", xlabel='原始值',
        )

        # Right: normalized distribution, pinned to the [0, 1] range.
        _draw_panel(
            ax_after, after,
            face='lightblue', median_color='blue', mean_color='darkblue',
            title=f"{name} - 标准化后", xlabel='标准化值 [0, 1]',
        )
        ax_after.set_xlim([0, 1])

    fig.tight_layout()
    plot_path = output_dir / 'normalization_comparison.png'
    fig.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"对比图已保存: {plot_path}")
    plt.close(fig)
|
||
|
||
|
||
def plot_strength_comparison(df_before, df_after, output_dir):
    """Plot the raw weighted strength score next to the equal-weight one.

    The left panel shows the "up" strength computed from the raw score columns
    of ``df_before`` with the fixed weights below; the right panel shows the
    value returned by ``calculate_strength_equal_weight(df_after,
    direction='up')``. The figure is written to
    ``output_dir / 'strength_comparison.png'``.

    Args:
        df_before: DataFrame holding the raw score columns.
        df_after: DataFrame passed to ``calculate_strength_equal_weight``.
        output_dir: ``pathlib.Path``-like directory for the PNG (joined with
            the ``/`` operator).
    """
    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(16, 6))

    # Current weights for the raw strength score, in summation order.
    weighted_columns = [
        (0.45, 'price_score_up'),
        (0.15, 'convergence_score'),
        (0.10, 'volume_score'),
        (0.10, 'geometry_score'),
        (0.15, 'activity_score'),
        (0.05, 'tilt_score'),
    ]
    terms = [weight * df_before[column] for weight, column in weighted_columns]
    strength_before_up = terms[0]
    for term in terms[1:]:
        strength_before_up = strength_before_up + term

    # Equal-weight strength score on the normalized data.
    strength_after_up = calculate_strength_equal_weight(df_after, direction='up')

    panels = [
        (left_ax, strength_before_up, 'lightcoral', 'red',
         '原始强度分(当前权重45/15/10/10/15/5)'),
        (right_ax, strength_after_up, 'lightblue', 'blue',
         '标准化后等权强度分(各1/6)'),
    ]
    for ax, values, face, line_color, title in panels:
        ax.hist(values, bins=50, alpha=0.7, color=face, edgecolor='black')
        ax.axvline(values.median(), color=line_color, linestyle='--', linewidth=2,
                   label=f'中位数={values.median():.3f}')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('强度分')
        ax.set_ylabel('频数')
        ax.legend()
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plot_path = output_dir / 'strength_comparison.png'
    fig.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"强度分对比图已保存: {plot_path}")
    plt.close(fig)
|
||
|
||
|
||
def main():
    """Run the normalization verification end to end.

    Steps, in order: load the valid rows, normalize them with
    ``normalize_all``, write the before/after statistics table, print a short
    comparison, render both comparison figures, and save the normalized data.
    All outputs go to ``outputs/converging_triangles`` under the parent of
    this script's directory.
    """
    banner = "=" * 80

    print(banner)
    print("强度分标准化效果验证")
    print(banner)

    # Step 1: load data.
    print("\n[1] 加载数据...")
    df = load_data()
    print(f" 样本数: {len(df):,}")

    # Step 2: normalize.
    print("\n[2] 执行标准化...")
    df_normalized = normalize_all(df)
    added_columns = df_normalized.columns.difference(df.columns).tolist()
    print(f" 新增字段: {added_columns}")

    # Step 3: statistics comparison.
    print("\n[3] 统计对比...")
    stats_df = compare_statistics(df, df_normalized)

    # Persist the statistics table.
    output_dir = Path(__file__).parent.parent / 'outputs' / 'converging_triangles'
    output_dir.mkdir(parents=True, exist_ok=True)
    stats_path = output_dir / 'normalization_stats_comparison.csv'
    stats_df.to_csv(stats_path, index=False, encoding='utf-8-sig')
    print(f" 统计对比表已保存: {stats_path}")

    # Print the key statistics.
    print("\n" + banner)
    print("标准化前后对比")
    print(banner)
    print("\n维度名称 | 原始-中位数 | 标准化-中位数 | 原始-偏度 | 标准化-偏度")
    print("-" * 80)
    for _index, record in stats_df.iterrows():
        line = (
            f"{record['维度']:20s} | {record['原始-中位数']:10.4f} | {record['标准化-中位数']:12.4f} | "
            f"{record['原始-偏度']:8.2f} | {record['标准化-偏度']:10.2f}"
        )
        print(line)

    # Step 4: figures.
    print("\n[4] 生成对比图表...")
    plot_before_after_comparison(df, df_normalized, output_dir)
    plot_strength_comparison(df, df_normalized, output_dir)

    # Step 5: save the normalized data (optional output).
    normalized_path = output_dir / 'all_results_normalized.csv'
    df_normalized.to_csv(normalized_path, index=False, encoding='utf-8-sig')
    print(f"\n[5] 标准化后数据已保存: {normalized_path}")

    print("\n" + banner)
    print("验证完成!")
    print(banner)
    print("\n关键改善:")
    for bullet in (
        " - 所有维度中位数统一为 0.5",
        " - 维度间可以直接等权相加",
        " - 偏度显著降低(分布更均匀)",
    ):
        print(bullet)
|
||
|
||
|
||
# Run the verification pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|