technical-patterns-lab/docs/收敛三角形_数据分布分析_20260129/analyze_distribution_强度分六维度.py
褚宏光 bf6baa5483 Add scoring module and enhance HTML viewer with standardization
- Add scripts/scoring/ module with normalizer, sensitivity analysis, and config
- Enhance stock_viewer.html with standardized scoring display
- Add integration tests and normalization verification scripts
- Add documentation for standardization implementation and usage guides
- Add data distribution analysis reports for strength scoring dimensions
- Update discussion documents with algorithm optimization plans
2026-01-30 18:43:37 +08:00

303 lines
9.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
收敛三角形数据分布分析 - 强度分六维度
评估各维度的:均值、正态性、厚尾特征
"""
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from pathlib import Path
# Use fonts that can render CJK glyphs and keep the minus sign drawable with them.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Microsoft YaHei'],
    'axes.unicode_minus': False,
})

# Load the results table; it lives three directory levels above this script,
# under outputs/converging_triangles/.
base_dir = Path(__file__).parent.parent.parent
data_path = base_dir / 'outputs' / 'converging_triangles' / 'all_results.csv'
df = pd.read_csv(data_path)
# Report header followed by a quick overview of the raw data set.
banner = "=" * 80
print(banner)
print("收敛三角形数据分布分析报告 - 强度分六维度")
print(banner)

total_rows = len(df)
valid_rows = df['is_valid'].sum()
first_date, last_date = df['date'].min(), df['date'].max()
print(f"\n数据总量: {total_rows} 条记录")
print(f"有效三角形: {valid_rows}")
print(f"数据时间范围: {first_date} - {last_date}")

# Keep only the rows flagged as valid; work on a copy so ``df`` is never modified.
df_valid = df.loc[df['is_valid'].eq(True)].copy()
# Strength-score dimensions to analyse, as a mapping of
# display label -> column name looked up in the results table.
# NOTE(review): the report titles say "six dimensions" but this mapping has
# seven entries — confirm whether the up/down breakout scores are meant to
# count as a single dimension.
dimensions = {
    '1. 突破幅度分(向上)': 'price_score_up',
    '2. 突破幅度分(向下)': 'price_score_down',
    '3. 收敛度分': 'convergence_score',
    '4. 成交量分': 'volume_score',
    '5. 形态规则度': 'geometry_score',
    '6. 价格活跃度': 'activity_score',
    '7. 倾斜度分': 'tilt_score',
}
def calculate_kurtosis_category(kurt):
    """Classify a Pearson kurtosis value as heavy-tailed, light-tailed or normal.

    Args:
        kurt: Pearson (non-excess) kurtosis, for which the reference value is 3.

    Returns:
        A display label. Values above 3 are reported as heavy-tailed and values
        below 3 as light-tailed, each with the excess kurtosis (``kurt - 3``)
        formatted to two decimals; exactly 3 is reported as normal. A NaN
        input yields an explicit "cannot be determined" label.
    """
    if np.isnan(kurt):
        # NaN compares False against everything, so without this guard it
        # would fall through both comparisons and be labelled "normal".
        return "无法判定 (峰度为NaN)"
    if kurt > 3:
        return f"厚尾 (超额峰度={kurt-3:.2f})"
    if kurt < 3:
        return f"薄尾 (超额峰度={kurt-3:.2f})"
    return "正态"
def test_normality(data, alpha=0.05):
    """Run a normality test on *data* and report whether it looks normal.

    Samples with fewer than 5000 observations are tested with Shapiro-Wilk;
    larger samples use a one-sample Kolmogorov-Smirnov test against a normal
    distribution parameterised by the sample's own mean and standard deviation.

    Args:
        data: One-dimensional sample supporting ``len()``, ``.mean()`` and
            ``.std()`` (for example a pandas Series or a NumPy array).
        alpha: Significance level; the sample is considered normal when the
            p-value is strictly greater than this value.

    Returns:
        A tuple ``(test_name, statistic, p_value, is_normal)``. For samples
        with fewer than three observations no test can be run, so the
        statistic and p-value are NaN and ``is_normal`` is False.
    """
    if len(data) < 3:
        # Shapiro-Wilk is not defined for fewer than three observations;
        # report "undetermined" instead of letting the whole run fail.
        return "Shapiro-Wilk", float("nan"), float("nan"), False

    if len(data) < 5000:
        stat, p_value = stats.shapiro(data)
        test_name = "Shapiro-Wilk"
    else:
        # NOTE: the normal parameters are estimated from the same sample, so
        # the KS p-value is conservative (no Lilliefors correction is applied).
        stat, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std()))
        test_name = "Kolmogorov-Smirnov"
    is_normal = p_value > alpha
    return test_name, stat, p_value, is_normal


# The name matches pytest's ``test_*`` pattern; opt out explicitly so that
# importing this helper into a test module does not get it collected as a test.
test_normality.__test__ = False
# Per-dimension descriptive statistics, normality test and 3-sigma tail analysis.
print("\n" + "=" * 80)
print("强度分六维度统计分析")
print("=" * 80)

# Share of a normal distribution lying beyond +/-3 standard deviations, in
# percent; the baseline for expressing how much heavier the observed tails are.
NORMAL_3SIGMA_TAIL_PCT = 0.27

results = []
for dim_name, col_name in dimensions.items():
    # Skip dimensions that are missing from the input or have no usable values.
    if col_name not in df_valid.columns:
        continue
    data = df_valid[col_name].dropna()
    if len(data) == 0:
        continue

    # Basic descriptive statistics
    mean_val = data.mean()
    std_val = data.std()
    median_val = data.median()
    min_val = data.min()
    max_val = data.max()
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)

    # Skewness and Pearson kurtosis (fisher=False, so the normal reference is 3)
    skewness = stats.skew(data)
    kurtosis = stats.kurtosis(data, fisher=False)
    excess_kurtosis = kurtosis - 3

    # Normality test
    test_name, test_stat, p_value, is_normal = test_normality(data)

    # Tail analysis: percentage of observations further than 3 standard
    # deviations from the mean, reusing the statistics computed above.
    tail_threshold = 3
    lower_bound = mean_val - tail_threshold * std_val
    upper_bound = mean_val + tail_threshold * std_val
    left_tail = (data < lower_bound).sum() / len(data) * 100
    right_tail = (data > upper_bound).sum() / len(data) * 100
    total_tail = left_tail + right_tail
    tail_ratio = total_tail / NORMAL_3SIGMA_TAIL_PCT if total_tail > 0 else 0

    result = {
        '维度': dim_name,
        '样本量': len(data),
        '均值': mean_val,
        '标准差': std_val,
        '中位数': median_val,
        '最小值': min_val,
        '最大值': max_val,
        'Q25': q25,
        'Q75': q75,
        '偏度': skewness,
        '峰度': kurtosis,
        '超额峰度': excess_kurtosis,
        '正态检验': test_name,
        '检验统计量': test_stat,
        'P值': p_value,
        '是否正态': is_normal,
        '左尾(3σ)%': left_tail,
        '右尾(3σ)%': right_tail,
        '尾部倍数': tail_ratio,
    }
    results.append(result)

    if skewness > 0:
        skew_label = '(右偏)'
    elif skewness < 0:
        skew_label = '(左偏)'
    else:
        skew_label = '(对称)'
    normality_label = '[正态分布]' if is_normal else '[非正态分布]'

    # The heading previously printed a closing bracket without its opening one.
    print(f"\n【{dim_name}】 ({col_name})")
    print(f" 样本量: {len(data):,}")
    print(f" 均值: {mean_val:.4f} | 中位数: {median_val:.4f} | 标准差: {std_val:.4f}")
    print(f" 范围: [{min_val:.4f}, {max_val:.4f}]")
    print(f" 四分位: Q25={q25:.4f}, Q75={q75:.4f}")
    print(f" 偏度: {skewness:.4f} {skew_label}")
    print(f" 峰度: {kurtosis:.4f} (超额峰度={excess_kurtosis:.4f}) {calculate_kurtosis_category(kurtosis)}")
    print(f" 正态性: {test_name}检验 p={p_value:.6f} {normality_label}")
    print(f" 尾部: 3σ外占比={total_tail:.4f}% (左={left_tail:.4f}%, 右={right_tail:.4f}%)")
    print(f" 相对正态分布尾部放大 {tail_ratio:.2f}")
# Persist the per-dimension statistics next to this script
# (written as UTF-8 with a byte-order mark, without the index column).
results_df = pd.DataFrame(results)
output_path = Path(__file__).with_name('distribution_analysis_强度分六维度.csv')
results_df.to_csv(output_path, encoding='utf-8-sig', index=False)
print(f"\n详细结果已保存至: {output_path}")
# Distribution plots: histogram, fitted normal curve and kernel density
# estimate for every dimension.
print("\n" + "=" * 80)
print("生成可视化图表...")
print("=" * 80)

# All seven strength-score dimensions are plotted (both the upward and the
# downward breakout score are included).
key_dims = [
    ('突破幅度分(向上)', 'price_score_up'),
    ('突破幅度分(向下)', 'price_score_down'),
    ('收敛度分', 'convergence_score'),
    ('成交量分', 'volume_score'),
    ('形态规则度', 'geometry_score'),
    ('价格活跃度', 'activity_score'),
    ('倾斜度分', 'tilt_score'),
]

# 3x3 grid: seven panels are used, the remaining ones are hidden further down.
fig, axes = plt.subplots(3, 3, figsize=(18, 14))
axes = axes.flatten()
for idx, (dim_name, col_name) in enumerate(key_dims):
    if col_name not in df_valid.columns:
        continue
    data = df_valid[col_name].dropna()
    if data.empty:
        # Nothing to draw, and there is no statistics row for an empty dimension.
        continue
    ax = axes[idx]

    # Histogram
    ax.hist(data, bins=50, density=True, alpha=0.6, color='skyblue', edgecolor='black')

    # Normal curve fitted with the sample mean and standard deviation
    mu, sigma = data.mean(), data.std()
    x = np.linspace(data.min(), data.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2, label='正态分布拟合')

    # Kernel density estimate. This stays best-effort, but only the failures
    # expected for degenerate samples are tolerated, and they are reported
    # instead of being swallowed silently.
    try:
        kde = stats.gaussian_kde(data)
    except (ValueError, np.linalg.LinAlgError) as exc:
        print(f" [警告] {dim_name} 核密度估计失败, 已跳过: {exc}")
    else:
        ax.plot(x, kde(x), 'g--', lw=2, label='核密度估计')

    # Fetch the precomputed statistics by the full label. Matching only the
    # text before "(" made the downward-breakout panel pick up the
    # upward-breakout row, because both labels share that prefix.
    matches = results_df[results_df['维度'].str.endswith(dim_name)]
    if matches.empty:
        ax.set_title(dim_name, fontsize=11, fontweight='bold')
    else:
        result = matches.iloc[0]
        ax.set_title(f"{dim_name}\n偏度={result['偏度']:.2f}, 超额峰度={result['超额峰度']:.2f}",
                     fontsize=11, fontweight='bold')
    ax.set_xlabel('', fontsize=10)
    ax.set_ylabel('密度', fontsize=10)
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

    # Mark the mean (red) and the median (orange)
    ax.axvline(mu, color='red', linestyle='--', linewidth=1, alpha=0.7)
    ax.axvline(data.median(), color='orange', linestyle='--', linewidth=1, alpha=0.7)

# Hide the unused panels
for idx in range(len(key_dims), len(axes)):
    axes[idx].set_visible(False)
plt.tight_layout()
plot_path = Path(__file__).parent / 'distribution_plots_强度分六维度.png'
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
print(f"分布图已保存至: {plot_path}")
plt.close()
# Normal Q-Q plots, one panel per dimension on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 14))
axes = axes.flatten()
for ax, (label, column) in zip(axes, key_dims):
    # Dimensions missing from the data leave their panel untouched.
    if column in df_valid.columns:
        data = df_valid[column].dropna()
        stats.probplot(data, dist="norm", plot=ax)
        ax.set_title(f"{label} - Q-Q图", fontsize=11, fontweight='bold')
        ax.grid(True, alpha=0.3)

# Hide the grid cells that have no dimension assigned.
for spare_ax in axes[len(key_dims):]:
    spare_ax.set_visible(False)

plt.tight_layout()
qq_plot_path = Path(__file__).with_name('qq_plots_强度分六维度.png')
plt.savefig(qq_plot_path, dpi=150, bbox_inches='tight')
print(f"Q-Q图已保存至: {qq_plot_path}")
plt.close()
# Box plots, one panel per dimension on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()
for idx, (dim_name, col_name) in enumerate(key_dims):
    if col_name not in df_valid.columns:
        continue
    data = df_valid[col_name].dropna()
    ax = axes[idx]
    # Vertical orientation is the default, so ``vert=True`` is not passed:
    # recent Matplotlib releases deprecate that keyword in favour of
    # ``orientation``, and omitting it works on old and new versions alike.
    bp = ax.boxplot(data, patch_artist=True)
    bp['boxes'][0].set_facecolor('lightblue')
    ax.set_title(f"{dim_name}", fontsize=11, fontweight='bold')
    ax.set_ylabel('', fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')

# Hide the unused panels
for idx in range(len(key_dims), len(axes)):
    axes[idx].set_visible(False)
plt.tight_layout()
box_plot_path = Path(__file__).parent / 'boxplots_强度分六维度.png'
plt.savefig(box_plot_path, dpi=150, bbox_inches='tight')
print(f"箱线图已保存至: {box_plot_path}")
plt.close()
# Closing summary of the findings.
rule = "=" * 80
print("\n" + rule)
print("分析总结")
print(rule)

dimension_total = len(results_df)

# 1) Outcome of the normality tests
normal_count = results_df['是否正态'].sum()
non_normal_count = dimension_total - normal_count
print("\n1. 正态性检验:")
print(f" - 符合正态分布: {normal_count}/{dimension_total} 个维度")
print(f" - 不符合正态分布: {non_normal_count}/{dimension_total} 个维度")

# 2) Skewness buckets, using +/-0.5 as the cut-off for "noticeably skewed"
skew_values = results_df['偏度']
right_skewed = skew_values.gt(0.5).sum()
left_skewed = skew_values.lt(-0.5).sum()
symmetric = dimension_total - right_skewed - left_skewed
print("\n2. 偏度分布:")
print(f" - 右偏(偏度>0.5): {right_skewed} 个维度")
print(f" - 左偏(偏度<-0.5): {left_skewed} 个维度")
print(f" - 对称(-0.5≤偏度≤0.5): {symmetric} 个维度")

# 3) Sign of the excess kurtosis
excess_values = results_df['超额峰度']
heavy_tail = excess_values.gt(0).sum()
light_tail = excess_values.lt(0).sum()
print("\n3. 峰度特征(厚尾特征):")
print(f" - 厚尾分布(超额峰度>0): {heavy_tail} 个维度")
print(f" - 薄尾分布(超额峰度<0): {light_tail} 个维度")

# 4) The five dimensions with the largest excess kurtosis
top_heavy_tails = results_df.nlargest(5, '超额峰度')[['维度', '超额峰度', '尾部倍数']]
print("\n4. 最显著的厚尾维度(Top 5):")
for label, excess, ratio in zip(
    top_heavy_tails['维度'], top_heavy_tails['超额峰度'], top_heavy_tails['尾部倍数']
):
    print(f" - {label}: 超额峰度={excess:.2f}, 尾部放大{ratio:.1f}")

print("\n" + rule)
print("分析完成!")
print(rule)