- Add scripts/scoring/ module with normalizer, sensitivity analysis, and config - Enhance stock_viewer.html with standardized scoring display - Add integration tests and normalization verification scripts - Add documentation for standardization implementation and usage guides - Add data distribution analysis reports for strength scoring dimensions - Update discussion documents with algorithm optimization plans
303 lines
9.7 KiB
Python
303 lines
9.7 KiB
Python
"""
|
||
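Converging-triangle data distribution analysis - six strength-score dimensions.

Evaluates, for each dimension: mean, normality, and heavy-tail characteristics.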
|
||
"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from scipy import stats
|
||
import matplotlib.pyplot as plt
|
||
from pathlib import Path
|
||
|
||
# Use fonts that contain CJK glyphs and keep the minus sign renderable.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei'],
    'axes.unicode_minus': False,
})

# Load the detection results; the CSV sits under outputs/converging_triangles,
# two directory levels above the folder that contains this script.
_base_dir = Path(__file__).parent.parent.parent
data_path = _base_dir / 'outputs' / 'converging_triangles' / 'all_results.csv'
df = pd.read_csv(data_path)

# Report header and a quick overview of the loaded data.
print("=" * 80, "收敛三角形数据分布分析报告 - 强度分六维度", "=" * 80, sep="\n")
print(f"\n数据总量: {len(df)} 条记录")
print(f"有效三角形: {df['is_valid'].sum()} 条")
print(f"数据时间范围: {df['date'].min()} - {df['date'].max()}")

# Keep only rows flagged as valid triangles. The comparison is element-wise
# on purpose (it is not an identity check), and the copy detaches the result
# from the original frame.
valid_mask = df['is_valid'] == True  # noqa: E712
df_valid = df.loc[valid_mask].copy()

# Report label -> CSV column for each strength-score component to analyse.
# The breakout score is split into an upward and a downward column, which is
# why seven columns are listed.
dimensions = {
    '1. 突破幅度分(向上)': 'price_score_up',
    '2. 突破幅度分(向下)': 'price_score_down',
    '3. 收敛度分': 'convergence_score',
    '4. 成交量分': 'volume_score',
    '5. 形态规则度': 'geometry_score',
    '6. 价格活跃度': 'activity_score',
    '7. 倾斜度分': 'tilt_score',
}
|
||
|
||
def calculate_kurtosis_category(kurt):
    """Classify a (non-excess) kurtosis value relative to the normal value of 3.

    Args:
        kurt: Pearson kurtosis, i.e. a normal distribution scores 3.

    Returns:
        A label string: heavy-tailed when ``kurt > 3``, light-tailed when
        ``kurt < 3`` (both include the excess kurtosis ``kurt - 3`` formatted
        to two decimals), and the "normal" label in every other case.
    """
    # Guard clauses: the two strict comparisons are checked first, anything
    # that satisfies neither falls through to the "normal" label.
    if kurt > 3:
        return f"厚尾 (超额峰度={kurt-3:.2f})"
    if kurt < 3:
        return f"薄尾 (超额峰度={kurt-3:.2f})"
    return "正态"
|
||
|
||
def test_normality(data, alpha=0.05):
    """Run a normality test on *data* and report whether it looks normal.

    Samples with fewer than 5000 observations are tested with Shapiro-Wilk;
    larger samples use a Kolmogorov-Smirnov test against a normal
    distribution parameterised by the sample's own mean and standard
    deviation.

    Args:
        data: Numeric sample without missing values. The KS branch calls
            ``data.mean()`` and ``data.std()``, so large samples must be a
            pandas Series or numpy array.
        alpha: Significance level; the sample is considered normal when the
            p-value is strictly greater than it.

    Returns:
        Tuple ``(test_name, statistic, p_value, is_normal)``. For degenerate
        samples (fewer than 3 observations, or all values identical) the
        statistic and p-value are ``nan`` and ``is_normal`` is ``False``
        instead of raising or reporting a meaningless "normal" verdict.
    """
    n_obs = len(data)
    test_name = "Shapiro-Wilk" if n_obs < 5000 else "Kolmogorov-Smirnov"

    # Shapiro-Wilk needs at least 3 points, and a zero-range sample makes
    # both tests uninformative, so such inputs are reported as "not testable".
    if n_obs < 3 or np.ptp(np.asarray(data, dtype=float)) == 0:
        nan = float("nan")
        return test_name, nan, nan, False

    if n_obs < 5000:
        stat, p_value = stats.shapiro(data)
    else:
        # NOTE: loc/scale are estimated from the same sample, so the KS
        # p-value is only approximate (it tends to be too large).
        stat, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std()))

    is_normal = bool(p_value > alpha)
    return test_name, stat, p_value, is_normal


# The name starts with "test_", so mark it explicitly as a helper to keep
# pytest from collecting it as a test case when this module is imported.
test_normality.__test__ = False
|
||
|
||
print("\n" + "=" * 80)
print("强度分六维度统计分析")
print("=" * 80)

# One summary dict per analysed dimension; turned into a DataFrame later.
results = []

for dim_name, col_name in dimensions.items():
    # Skip dimensions whose column is absent or has no non-null values.
    if col_name not in df_valid.columns:
        continue
    data = df_valid[col_name].dropna()
    if len(data) == 0:
        continue

    # Location and spread.
    mean_val, std_val, median_val = data.mean(), data.std(), data.median()
    min_val, max_val = data.min(), data.max()
    q25, q75 = data.quantile(0.25), data.quantile(0.75)

    # Shape: skewness and Pearson kurtosis (normal = 3), plus the excess.
    skewness = stats.skew(data)
    kurtosis = stats.kurtosis(data, fisher=False)
    excess_kurtosis = kurtosis - 3

    # Normality test (test choice depends on the sample size).
    test_name, test_stat, p_value, is_normal = test_normality(data)

    # Tail mass: percentage of observations beyond mean +/- 3 standard
    # deviations on each side.
    tail_threshold = 3
    lower_cut = mean_val - tail_threshold * std_val
    upper_cut = mean_val + tail_threshold * std_val
    left_tail = (data < lower_cut).sum() / len(data) * 100
    right_tail = (data > upper_cut).sum() / len(data) * 100
    total_tail = left_tail + right_tail

    # 0.27 (%) is the two-sided mass a normal distribution has outside
    # 3 sigma; the ratio expresses how much heavier the observed tails are.
    tail_ratio = total_tail / 0.27 if total_tail > 0 else 0

    results.append({
        '维度': dim_name,
        '样本量': len(data),
        '均值': mean_val,
        '标准差': std_val,
        '中位数': median_val,
        '最小值': min_val,
        '最大值': max_val,
        'Q25': q25,
        'Q75': q75,
        '偏度': skewness,
        '峰度': kurtosis,
        '超额峰度': excess_kurtosis,
        '正态检验': test_name,
        '检验统计量': test_stat,
        'P值': p_value,
        '是否正态': is_normal,
        '左尾(3σ)%': left_tail,
        '右尾(3σ)%': right_tail,
        '尾部倍数': tail_ratio,
    })

    # Console report for this dimension.
    print(f"\n【{dim_name}】 ({col_name})")
    print(f" 样本量: {len(data):,}")
    print(f" 均值: {mean_val:.4f} | 中位数: {median_val:.4f} | 标准差: {std_val:.4f}")
    print(f" 范围: [{min_val:.4f}, {max_val:.4f}]")
    print(f" 四分位: Q25={q25:.4f}, Q75={q75:.4f}")
    print(f" 偏度: {skewness:.4f} {'(右偏)' if skewness > 0 else '(左偏)' if skewness < 0 else '(对称)'}")
    print(f" 峰度: {kurtosis:.4f} (超额峰度={excess_kurtosis:.4f}) {calculate_kurtosis_category(kurtosis)}")
    print(f" 正态性: {test_name}检验 p={p_value:.6f} {'[正态分布]' if is_normal else '[非正态分布]'}")
    print(f" 尾部: 3σ外占比={total_tail:.4f}% (左={left_tail:.4f}%, 右={right_tail:.4f}%)")
    print(f" 相对正态分布尾部放大 {tail_ratio:.2f} 倍")
|
||
|
||
# Write the per-dimension statistics to a CSV next to this script.
# 'utf-8-sig' prefixes the file with a byte-order mark.
output_path = Path(__file__).parent / 'distribution_analysis_强度分六维度.csv'
results_df = pd.DataFrame(results)
results_df.to_csv(output_path, encoding='utf-8-sig', index=False)
print(f"\n详细结果已保存至: {output_path}")
|
||
|
||
# Visualisation: histogram + fitted normal curve + KDE for every dimension.
print("\n" + "=" * 80)
print("生成可视化图表...")
print("=" * 80)

# (panel title, CSV column) for every strength-score column to plot; both
# breakout directions are included.
key_dims = [
    ('突破幅度分(向上)', 'price_score_up'),
    ('突破幅度分(向下)', 'price_score_down'),
    ('收敛度分', 'convergence_score'),
    ('成交量分', 'volume_score'),
    ('形态规则度', 'geometry_score'),
    ('价格活跃度', 'activity_score'),
    ('倾斜度分', 'tilt_score'),
]

# 3x3 grid; the seven dimensions occupy the first seven panels.
fig, axes = plt.subplots(3, 3, figsize=(18, 14))
axes = axes.flatten()

for idx, (dim_name, col_name) in enumerate(key_dims):
    ax = axes[idx]

    data = df_valid[col_name].dropna() if col_name in df_valid.columns else None
    if data is None or data.empty:
        # Nothing to draw for this dimension: hide the panel instead of
        # leaving an empty frame in the figure.
        ax.set_visible(False)
        continue

    # Density-normalised histogram.
    ax.hist(data, bins=50, density=True, alpha=0.6, color='skyblue', edgecolor='black')

    # Normal curve with the sample mean / standard deviation.
    mu, sigma = data.mean(), data.std()
    x = np.linspace(data.min(), data.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2, label='正态分布拟合')

    # Kernel density estimate. This stays best-effort, but only the errors
    # gaussian_kde is expected to raise for unusable samples (too few points,
    # singular covariance) are tolerated, and the skip is reported.
    try:
        kde = stats.gaussian_kde(data)
        ax.plot(x, kde(x), 'g--', lw=2, label='核密度估计')
    except (ValueError, np.linalg.LinAlgError) as exc:
        print(f"  核密度估计已跳过 ({dim_name}): {exc}")

    # Compute the title statistics from this panel's own data. Looking them
    # up by a label prefix would pick the upward-breakout row for the
    # downward-breakout panel, because both labels share the same prefix.
    skew_val = stats.skew(data)
    excess_kurt = stats.kurtosis(data, fisher=False) - 3

    ax.set_title(f"{dim_name}\n偏度={skew_val:.2f}, 超额峰度={excess_kurt:.2f}",
                 fontsize=11, fontweight='bold')
    ax.set_xlabel('值', fontsize=10)
    ax.set_ylabel('密度', fontsize=10)
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    # Vertical markers: mean (red) and median (orange).
    ax.axvline(mu, color='red', linestyle='--', linewidth=1, alpha=0.7)
    ax.axvline(data.median(), color='orange', linestyle='--', linewidth=1, alpha=0.7)

# Hide the grid cells that have no dimension assigned.
for idx in range(len(key_dims), len(axes)):
    axes[idx].set_visible(False)

plt.tight_layout()
plot_path = Path(__file__).parent / 'distribution_plots_强度分六维度.png'
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
print(f"分布图已保存至: {plot_path}")
plt.close()
|
||
|
||
# Normal Q-Q plots, one panel per dimension on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 14))
axes = axes.flatten()

for ax, (dim_name, col_name) in zip(axes, key_dims):
    # Dimensions whose column is missing keep an untouched panel.
    if col_name in df_valid.columns:
        data = df_valid[col_name].dropna()
        stats.probplot(data, dist="norm", plot=ax)
        ax.set_title(f"{dim_name} - Q-Q图", fontsize=11, fontweight='bold')
        ax.grid(True, alpha=0.3)

# Hide the grid cells beyond the last dimension.
for spare_ax in axes[len(key_dims):]:
    spare_ax.set_visible(False)

plt.tight_layout()
qq_plot_path = Path(__file__).parent / 'qq_plots_强度分六维度.png'
plt.savefig(qq_plot_path, dpi=150, bbox_inches='tight')
print(f"Q-Q图已保存至: {qq_plot_path}")
plt.close()
|
||
|
||
# Box plots, one panel per dimension on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, (dim_name, col_name) in zip(axes, key_dims):
    # Dimensions whose column is missing keep an untouched panel.
    if col_name in df_valid.columns:
        data = df_valid[col_name].dropna()

        # patch_artist=True makes the box a filled patch so it can be coloured.
        box_artists = ax.boxplot(data, vert=True, patch_artist=True)
        box_artists['boxes'][0].set_facecolor('lightblue')

        ax.set_title(f"{dim_name}", fontsize=11, fontweight='bold')
        ax.set_ylabel('值', fontsize=10)
        ax.grid(True, alpha=0.3, axis='y')

# Hide the grid cells beyond the last dimension.
for spare_ax in axes[len(key_dims):]:
    spare_ax.set_visible(False)

plt.tight_layout()
box_plot_path = Path(__file__).parent / 'boxplots_强度分六维度.png'
plt.savefig(box_plot_path, dpi=150, bbox_inches='tight')
print(f"箱线图已保存至: {box_plot_path}")
plt.close()
|
||
|
||
# Summary report across all analysed dimensions.
print("\n" + "=" * 80)
print("分析总结")
print("=" * 80)

# 1. Normality: how many dimensions passed the normality test.
normal_count = results_df['是否正态'].sum()
non_normal_count = len(results_df) - normal_count

print(f"\n1. 正态性检验:")
print(f" - 符合正态分布: {normal_count}/{len(results_df)} 个维度")
print(f" - 不符合正态分布: {non_normal_count}/{len(results_df)} 个维度")

# 2. Skewness: |skew| <= 0.5 is counted as symmetric, everything outside
# that band as right- or left-skewed.
skew_col = results_df['偏度']
right_skewed = (skew_col > 0.5).sum()
left_skewed = (skew_col < -0.5).sum()
symmetric = len(results_df) - right_skewed - left_skewed

print(f"\n2. 偏度分布:")
print(f" - 右偏(偏度>0.5): {right_skewed} 个维度")
print(f" - 左偏(偏度<-0.5): {left_skewed} 个维度")
print(f" - 对称(-0.5≤偏度≤0.5): {symmetric} 个维度")

# 3. Kurtosis: sign of the excess kurtosis separates heavy from light tails.
excess_col = results_df['超额峰度']
heavy_tail = (excess_col > 0).sum()
light_tail = (excess_col < 0).sum()

print(f"\n3. 峰度特征(厚尾特征):")
print(f" - 厚尾分布(超额峰度>0): {heavy_tail} 个维度")
print(f" - 薄尾分布(超额峰度<0): {light_tail} 个维度")

# 4. The (up to) five dimensions with the largest excess kurtosis.
top_heavy_tails = results_df[['维度', '超额峰度', '尾部倍数']].nlargest(5, '超额峰度')
print(f"\n4. 最显著的厚尾维度(Top 5):")
for _, row in top_heavy_tails.iterrows():
    print(f" - {row['维度']}: 超额峰度={row['超额峰度']:.2f}, 尾部放大{row['尾部倍数']:.1f}倍")

print("\n" + "=" * 80)
print("分析完成!")
print("=" * 80)
|