""" 收敛三角形数据分布分析 - 强度分六维度 评估各维度的:均值、正态性、厚尾特征 """ import pandas as pd import numpy as np from scipy import stats import matplotlib.pyplot as plt from pathlib import Path # 设置中文字体 plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei'] plt.rcParams['axes.unicode_minus'] = False # 读取数据 data_path = Path(__file__).parent.parent.parent / 'outputs' / 'converging_triangles' / 'all_results.csv' df = pd.read_csv(data_path) print("=" * 80) print("收敛三角形数据分布分析报告 - 强度分六维度") print("=" * 80) print(f"\n数据总量: {len(df)} 条记录") print(f"有效三角形: {df['is_valid'].sum()} 条") print(f"数据时间范围: {df['date'].min()} - {df['date'].max()}") # 筛选有效数据 df_valid = df[df['is_valid'] == True].copy() # 定义需要分析的强度分六维度 dimensions = { '1. 突破幅度分(向上)': 'price_score_up', '2. 突破幅度分(向下)': 'price_score_down', '3. 收敛度分': 'convergence_score', '4. 成交量分': 'volume_score', '5. 形态规则度': 'geometry_score', '6. 价格活跃度': 'activity_score', '7. 倾斜度分': 'tilt_score', } def calculate_kurtosis_category(kurt): """判断峰度类型""" if kurt > 3: return f"厚尾 (超额峰度={kurt-3:.2f})" elif kurt < 3: return f"薄尾 (超额峰度={kurt-3:.2f})" else: return "正态" def test_normality(data, alpha=0.05): """测试正态性""" if len(data) < 5000: stat, p_value = stats.shapiro(data) test_name = "Shapiro-Wilk" else: stat, p_value = stats.kstest(data, 'norm', args=(data.mean(), data.std())) test_name = "Kolmogorov-Smirnov" is_normal = p_value > alpha return test_name, stat, p_value, is_normal print("\n" + "=" * 80) print("强度分六维度统计分析") print("=" * 80) results = [] for dim_name, col_name in dimensions.items(): if col_name not in df_valid.columns: continue data = df_valid[col_name].dropna() if len(data) == 0: continue # 基础统计 mean_val = data.mean() std_val = data.std() median_val = data.median() min_val = data.min() max_val = data.max() q25 = data.quantile(0.25) q75 = data.quantile(0.75) # 偏度和峰度 skewness = stats.skew(data) kurtosis = stats.kurtosis(data, fisher=False) excess_kurtosis = kurtosis - 3 # 正态性检验 test_name, test_stat, p_value, is_normal = test_normality(data) # 尾部分析 mean = data.mean() std = data.std() tail_threshold = 3 left_tail = (data < mean - tail_threshold * std).sum() / len(data) * 100 right_tail = (data > mean + tail_threshold * std).sum() / len(data) * 100 total_tail = left_tail + right_tail tail_ratio = total_tail / 0.27 if total_tail > 0 else 0 result = { '维度': dim_name, '样本量': len(data), '均值': mean_val, '标准差': std_val, '中位数': median_val, '最小值': min_val, '最大值': max_val, 'Q25': q25, 'Q75': q75, '偏度': skewness, '峰度': kurtosis, '超额峰度': excess_kurtosis, '正态检验': test_name, '检验统计量': test_stat, 'P值': p_value, '是否正态': is_normal, '左尾(3σ)%': left_tail, '右尾(3σ)%': right_tail, '尾部倍数': tail_ratio, } results.append(result) print(f"\n【{dim_name}】 ({col_name})") print(f" 样本量: {len(data):,}") print(f" 均值: {mean_val:.4f} | 中位数: {median_val:.4f} | 标准差: {std_val:.4f}") print(f" 范围: [{min_val:.4f}, {max_val:.4f}]") print(f" 四分位: Q25={q25:.4f}, Q75={q75:.4f}") print(f" 偏度: {skewness:.4f} {'(右偏)' if skewness > 0 else '(左偏)' if skewness < 0 else '(对称)'}") print(f" 峰度: {kurtosis:.4f} (超额峰度={excess_kurtosis:.4f}) {calculate_kurtosis_category(kurtosis)}") print(f" 正态性: {test_name}检验 p={p_value:.6f} {'[正态分布]' if is_normal else '[非正态分布]'}") print(f" 尾部: 3σ外占比={total_tail:.4f}% (左={left_tail:.4f}%, 右={right_tail:.4f}%)") print(f" 相对正态分布尾部放大 {tail_ratio:.2f} 倍") # 保存结果 results_df = pd.DataFrame(results) output_path = Path(__file__).parent / 'distribution_analysis_强度分六维度.csv' results_df.to_csv(output_path, index=False, encoding='utf-8-sig') print(f"\n详细结果已保存至: {output_path}") # 生成可视化 print("\n" + "=" * 80) print("生成可视化图表...") print("=" * 80) # 选择所有强度分维度进行可视化(排除price_score_down因为与up类似) key_dims = [ ('突破幅度分(向上)', 'price_score_up'), ('突破幅度分(向下)', 'price_score_down'), ('收敛度分', 'convergence_score'), ('成交量分', 'volume_score'), ('形态规则度', 'geometry_score'), ('价格活跃度', 'activity_score'), ('倾斜度分', 'tilt_score'), ] # 创建3x3的子图布局(7个图) fig, axes = plt.subplots(3, 3, figsize=(18, 14)) axes = axes.flatten() for idx, (dim_name, col_name) in enumerate(key_dims): if col_name not in df_valid.columns: continue data = df_valid[col_name].dropna() ax = axes[idx] # 绘制直方图和核密度估计 ax.hist(data, bins=50, density=True, alpha=0.6, color='skyblue', edgecolor='black') # 拟合正态分布 mu, sigma = data.mean(), data.std() x = np.linspace(data.min(), data.max(), 100) ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', lw=2, label='正态分布拟合') # KDE try: from scipy.stats import gaussian_kde kde = gaussian_kde(data) ax.plot(x, kde(x), 'g--', lw=2, label='核密度估计') except: pass # 获取统计信息 result = results_df[results_df['维度'].str.contains(dim_name.split('(')[0])].iloc[0] ax.set_title(f"{dim_name}\n偏度={result['偏度']:.2f}, 超额峰度={result['超额峰度']:.2f}", fontsize=11, fontweight='bold') ax.set_xlabel('值', fontsize=10) ax.set_ylabel('密度', fontsize=10) ax.legend(fontsize=8) ax.grid(True, alpha=0.3) # 标注均值和中位数 ax.axvline(mu, color='red', linestyle='--', linewidth=1, alpha=0.7) ax.axvline(data.median(), color='orange', linestyle='--', linewidth=1, alpha=0.7) # 隐藏多余的子图 for idx in range(len(key_dims), len(axes)): axes[idx].set_visible(False) plt.tight_layout() plot_path = Path(__file__).parent / 'distribution_plots_强度分六维度.png' plt.savefig(plot_path, dpi=150, bbox_inches='tight') print(f"分布图已保存至: {plot_path}") plt.close() # Q-Q图 fig, axes = plt.subplots(3, 3, figsize=(18, 14)) axes = axes.flatten() for idx, (dim_name, col_name) in enumerate(key_dims): if col_name not in df_valid.columns: continue data = df_valid[col_name].dropna() ax = axes[idx] stats.probplot(data, dist="norm", plot=ax) ax.set_title(f"{dim_name} - Q-Q图", fontsize=11, fontweight='bold') ax.grid(True, alpha=0.3) for idx in range(len(key_dims), len(axes)): axes[idx].set_visible(False) plt.tight_layout() qq_plot_path = Path(__file__).parent / 'qq_plots_强度分六维度.png' plt.savefig(qq_plot_path, dpi=150, bbox_inches='tight') print(f"Q-Q图已保存至: {qq_plot_path}") plt.close() # 箱线图 fig, axes = plt.subplots(3, 3, figsize=(18, 12)) axes = axes.flatten() for idx, (dim_name, col_name) in enumerate(key_dims): if col_name not in df_valid.columns: continue data = df_valid[col_name].dropna() ax = axes[idx] bp = ax.boxplot(data, vert=True, patch_artist=True) bp['boxes'][0].set_facecolor('lightblue') ax.set_title(f"{dim_name}", fontsize=11, fontweight='bold') ax.set_ylabel('值', fontsize=10) ax.grid(True, alpha=0.3, axis='y') for idx in range(len(key_dims), len(axes)): axes[idx].set_visible(False) plt.tight_layout() box_plot_path = Path(__file__).parent / 'boxplots_强度分六维度.png' plt.savefig(box_plot_path, dpi=150, bbox_inches='tight') print(f"箱线图已保存至: {box_plot_path}") plt.close() # 总结报告 print("\n" + "=" * 80) print("分析总结") print("=" * 80) # 统计正态性 normal_count = results_df['是否正态'].sum() non_normal_count = len(results_df) - normal_count print(f"\n1. 正态性检验:") print(f" - 符合正态分布: {normal_count}/{len(results_df)} 个维度") print(f" - 不符合正态分布: {non_normal_count}/{len(results_df)} 个维度") # 统计偏度 right_skewed = (results_df['偏度'] > 0.5).sum() left_skewed = (results_df['偏度'] < -0.5).sum() symmetric = len(results_df) - right_skewed - left_skewed print(f"\n2. 偏度分布:") print(f" - 右偏(偏度>0.5): {right_skewed} 个维度") print(f" - 左偏(偏度<-0.5): {left_skewed} 个维度") print(f" - 对称(-0.5≤偏度≤0.5): {symmetric} 个维度") # 统计峰度 heavy_tail = (results_df['超额峰度'] > 0).sum() light_tail = (results_df['超额峰度'] < 0).sum() print(f"\n3. 峰度特征(厚尾特征):") print(f" - 厚尾分布(超额峰度>0): {heavy_tail} 个维度") print(f" - 薄尾分布(超额峰度<0): {light_tail} 个维度") # 最厚尾的维度 top_heavy_tails = results_df.nlargest(5, '超额峰度')[['维度', '超额峰度', '尾部倍数']] print(f"\n4. 最显著的厚尾维度(Top 5):") for _, row in top_heavy_tails.iterrows(): print(f" - {row['维度']}: 超额峰度={row['超额峰度']:.2f}, 尾部放大{row['尾部倍数']:.1f}倍") print("\n" + "=" * 80) print("分析完成!") print("=" * 80)