import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.lines import Line2D
from scipy import stats
from scipy.special import expit
from sklearn.metrics import roc_curve, auc, confusion_matrix
import statsmodels.api as sm
import warnings, os
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
os.makedirs('./figs', exist_ok=True)
os.makedirs('./data', exist_ok=True)
plt.rcParams.update({
'font.size': 11, 'axes.spines.top': False, 'axes.spines.right': False,
'axes.grid': True, 'grid.alpha': 0.25, 'figure.dpi': 150,
})
BLUE, ORANGE, GRAY = '#3B8BD4', '#D85A30', '#888780'34 附:Codes
生成 02_a_binary_lec.qmd 中所需的全部图片,并将模拟数据保存到 ./data/。
34.1 1. 数据生成(DGP)
np.random.seed(42)
N = 1000
corr_matrix = np.array([
[1.00, -0.30, 0.25, 0.40],
[-0.30, 1.00, -0.20, -0.10],
[ 0.25, -0.20, 1.00, 0.10],
[ 0.40, -0.10, 0.10, 1.00]
])
L = np.linalg.cholesky(corr_matrix)
Z = np.random.standard_normal((4, N))
corr_Z = (L @ Z).T
size = 8.0 + 1.8 * corr_Z[:, 0]
lev = np.clip(0.45 + 0.18 * corr_Z[:, 1], 0.05, 0.95)
roa = np.clip(0.05 + 0.06 * corr_Z[:, 2], -0.13, 0.23) # std≈0.06
age = np.clip(10 + 7 * corr_Z[:, 3], 1, 35).round().astype(float)
industry = np.random.choice(
['manufacturing','real_estate','finance','tech'], N, p=[0.4,0.2,0.2,0.2])
ownership = np.random.choice(['state','private'], N, p=[0.4,0.6])
size_adj = size + np.where(ownership=='state', 0.5, 0.0)
lev_adj = np.clip(
lev
+ np.where(ownership=='state', -0.08, +0.08)
+ np.where(industry =='real_estate', 0.06, 0.00),
0.05, 0.95)
ind_fx = {'manufacturing':0.0,'real_estate':0.5,'finance':-0.3,'tech':0.2}
own_fx = {'state':-0.4,'private':0.2}
logit_v = (
-2.5
- 0.60 * (size_adj - 8)
+ 4.00 * (lev_adj - 0.45)
- 8.00 * (roa - 0.05)
- 0.08 * (age - 10) # -0.08,保证 lnage 显著
+ np.array([ind_fx[i] for i in industry])
+ np.array([own_fx[o] for o in ownership])
)
default = (np.random.uniform(0,1,N) < expit(logit_v)).astype(int)
df = pd.DataFrame({
'default': default,
'size': size_adj.round(4),
'leverage': lev_adj.round(4),
'roa': roa.round(4),
'age': age,
'lnage': np.log(age).round(4),
'industry': industry,
'ownership':ownership
})
df.to_csv('./data/corporate_default.csv', index=False)
print(f'N={N} | 违约样本: {default.sum()} | 违约率: {default.mean():.1%}')
print('数据已保存到 ./data/corporate_default.csv')
print('\n相关矩阵:')
print(df[['size','leverage','roa','lnage']].corr().round(2))N=1000 | 违约样本: 172 | 违约率: 17.2%
数据已保存到 ./data/corporate_default.csv
相关矩阵:
size leverage roa lnage
size 1.00 -0.33 0.27 0.32
leverage -0.33 1.00 -0.20 -0.10
roa 0.27 -0.20 1.00 0.12
lnage 0.32 -0.10 0.12 1.00
34.2 2. 构建完整模型(供后续各图使用)
df_tmp = df.copy()
df_tmp['industry'] = pd.Categorical(
df_tmp['industry'], categories=['manufacturing','real_estate','finance','tech'])
df_tmp['ownership'] = pd.Categorical(
df_tmp['ownership'], categories=['private','state'])
df_enc = pd.get_dummies(df_tmp, columns=['industry','ownership'], drop_first=True)
feat_cols = ['size','leverage','roa','lnage',
'industry_real_estate','industry_finance','industry_tech','ownership_state']
X_full = sm.add_constant(df_enc[feat_cols].astype(float))
y = df['default'].values
logit_full = sm.Logit(y, X_full).fit(disp=0)
HI = 'Cont. Int. Hi.'
print('=== Logit 完整模型 ===')
print(logit_full.summary2().tables[1][['Coef.','Std.Err.','z','P>|z|']].round(4))
print(f'\nMcFadden R² = {logit_full.prsquared:.3f} AIC = {logit_full.aic:.1f}')=== Logit 完整模型 ===
Coef. Std.Err. z P>|z|
const 3.6636 0.7512 4.8771 0.0000
size -0.7524 0.0811 -9.2735 0.0000
leverage 3.0367 0.6548 4.6373 0.0000
roa -6.3993 1.8930 -3.3805 0.0007
lnage -0.4848 0.1063 -4.5626 0.0000
industry_real_estate 0.4334 0.2790 1.5532 0.1204
industry_finance -0.1116 0.2955 -0.3777 0.7056
industry_tech 0.2635 0.2754 0.9570 0.3386
ownership_state -0.7907 0.2406 -3.2871 0.0010
McFadden R² = 0.333 AIC = 630.4
34.3 3. fig01:三联图
# d0, d1 = df[df.default==0], df[df.default==1]
# fig, axes = plt.subplots(1, 3, figsize=(13, 4.2))
# ax = axes[0]
# ax.scatter(d0.leverage, d0['size'], alpha=0.25, s=16, color=BLUE, marker='o', label='No default')
# ax.scatter(d1.leverage, d1['size'], alpha=0.65, s=16, color=ORANGE, marker='+', linewidths=1.2, label='Default')
# ax.set_xlabel('Leverage'); ax.set_ylabel('Size (lnTA)'); ax.legend(frameon=False, fontsize=9)
# for ax, var, ylabel in [(axes[1],'leverage','Leverage'),(axes[2],'size','Size (lnTA)')]:
# bp = ax.boxplot([d0[var].values, d1[var].values], patch_artist=True, widths=0.5,
# medianprops=dict(color='black', linewidth=2))
# bp['boxes'][0].set(facecolor=BLUE, alpha=0.65); bp['boxes'][1].set(facecolor=ORANGE, alpha=0.65)
# ax.set_xticklabels(['No','Yes']); ax.set_xlabel('Default'); ax.set_ylabel(ylabel)
# plt.suptitle('Corporate Default Dataset (N=1000, Default rate=17.2%)', y=1.01, fontsize=11)
# plt.tight_layout()
# plt.savefig('./figs/fig01_scatter.png', bbox_inches='tight'); plt.show()
# print('Saved: fig01_scatter.png')d0, d1 = df[df.default == 0], df[df.default == 1]
fig = plt.figure(figsize=(10, 7))
ax0 = fig.add_axes([0.22, 0.58, 0.56, 0.28]) # [left, bottom, width, height]
ax1 = fig.add_axes([0.08, 0.12, 0.38, 0.28])
ax2 = fig.add_axes([0.54, 0.12, 0.38, 0.28])
ax0.scatter(d0.leverage, d0['size'], alpha=0.55, s=14, color=BLUE, marker='o', label='No default')
ax0.scatter(d1.leverage, d1['size'], alpha=0.90, s=14, color=ORANGE, marker='+', linewidths=1.6, label='Default')
ax0.set_xlabel('Leverage')
ax0.set_ylabel('Size (lnTA)')
ax0.legend(frameon=False, fontsize=10)
ax0.grid(True, ls='--', lw=0.5, alpha=0.2)
for ax, var, ylabel in [(ax1, 'leverage', 'Leverage'), (ax2, 'size', 'Size (lnTA)')]:
bp = ax.boxplot([d0[var].values, d1[var].values],
patch_artist=True,
widths=0.5,
medianprops=dict(color='black', linewidth=1.8))
bp['boxes'][0].set(facecolor=BLUE, alpha=0.65)
bp['boxes'][1].set(facecolor=ORANGE, alpha=0.65)
ax.set_xticklabels(['No', 'Yes'])
ax.set_xlabel('Default')
ax.set_ylabel(ylabel)
ax.grid(True, axis='y', ls='--', lw=0.5, alpha=0.2)
fig.suptitle('Corporate Default Dataset (N = 1000, Default rate = 17.2%)', y=0.94, fontsize=11)
plt.savefig('./figs/fig01_scatter.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: ./figs/fig01_scatter.png')
Saved: ./figs/fig01_scatter.png
34.4 4. fig02:LPM vs Logit
X1 = sm.add_constant(df[['leverage']])
lpm_m = sm.OLS(y, X1).fit()
lgm = sm.Logit(y, X1).fit(disp=0)
lev_g = np.linspace(0.0, 1.0, 300)
X_g = sm.add_constant(pd.DataFrame({'leverage': lev_g}))
fig, axes = plt.subplots(1, 2, figsize=(9, 4), sharey=True)
for ax, probs, title in zip(axes,
[lpm_m.predict(X_g), lgm.predict(X_g)],
['Linear Regression (LPM)', 'Logistic Regression (Logit)']):
for yi, col in [(0,BLUE),(1,ORANGE)]:
mask = y==yi
ax.plot(df.leverage[mask], np.full(mask.sum(), yi), '|', color=col, alpha=0.65, markersize=4)
ax.plot(lev_g, probs, color='steelblue', lw=1.5)
ax.axhline(0, color=GRAY, lw=0.8, ls='--'); ax.axhline(1, color=GRAY, lw=0.8, ls='--')
ax.set_xlim(0,1); ax.set_ylim(-0.15,1.15); ax.set_xlabel('Leverage'); ax.set_title(title)
axes[0].set_ylabel('Probability of Default')
plt.tight_layout()
plt.savefig('./figs/fig02_lpm_vs_logit.png', bbox_inches='tight'); plt.show()
print('Saved: fig02_lpm_vs_logit.png')
Saved: fig02_lpm_vs_logit.png
34.5 5. fig03:三种链接函数
u = np.linspace(-4, 4, 400)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(u, 0.5+0.18*u, lw=1.5, ls='--', color=GRAY, label='LPM (linear)')
ax.plot(u, expit(u), lw=1.5, ls='-.', color=ORANGE, label='Logit (logistic CDF)')
ax.plot(u, stats.norm.cdf(u), lw=1.5, ls='-', color=BLUE, label='Probit (normal CDF)')
ax.axhline(0, color='black', lw=0.5); ax.axhline(1, color='black', lw=0.5)
ax.set_xlabel("$\\mathbf{x}'\\beta$"); ax.set_ylabel('P(y = 1 | x)')
ax.set_ylim(-0.08, 1.12); ax.legend(frameon=False)
ax.set_title('Link Functions: LPM, Logit, Probit')
ax.legend(frameon=False, fontsize=9, loc="center left")
plt.tight_layout()
plt.savefig('./figs/fig03_link_functions.png', bbox_inches='tight'); plt.show()
print('Saved: fig03_link_functions.png')
Saved: fig03_link_functions.png
34.6 6. fig_latent_variable:潜变量图
np.random.seed(12348); n_lat=400
x_lat = np.random.uniform(0,10,n_lat)
ystar = -3 + 0.6*x_lat + np.random.normal(0,1,n_lat)
y_lat = (ystar>0).astype(int)
fig, axes = plt.subplots(1, 2, figsize=(12, 4.5))
ax = axes[0]
ax.scatter(x_lat[ystar>0], ystar[ystar>0], alpha=0.6, s=18, color=ORANGE, label='$Y^*>0$')
ax.scatter(x_lat[ystar<=0], ystar[ystar<=0], alpha=0.7, s=18, color=BLUE, label='$Y^*\\leq0$')
ax.plot(np.linspace(0,10,200), -3+0.6*np.linspace(0,10,200),
color='black', lw=1.5, ls='--', label='$E(Y^*)=-3+0.6x$')
ax.axhline(0, color='gray', lw=1, ls=':')
ax.set_xlabel('x'); ax.set_ylabel('$Y^*$'); ax.set_title('潜变量 $Y^*$ 与决策规则')
ax.legend(frameon=False, fontsize=9)
ax = axes[1]
ax.scatter(x_lat[y_lat==1],y_lat[y_lat==1],color=ORANGE,alpha=0.25,s=12,marker='+')
ax.scatter(x_lat[y_lat==0],y_lat[y_lat==0],color=BLUE, alpha=0.25,s=12,marker='o')
lat_df = pd.DataFrame({'x':x_lat,'y':y_lat})
lat_df['bin'] = pd.qcut(lat_df['x'],20,labels=False)
bins = lat_df.groupby('bin').agg(xmid=('x','mean'),ymean=('y','mean'))
ax.scatter(bins['xmid'],bins['ymean'],color='black',s=60,marker='o',
facecolors='none',linewidths=1.5,zorder=5,label='Binscatter(20组)')
p_lat = sm.Probit(y_lat,sm.add_constant(x_lat)).fit(disp=0)
x_fit = np.linspace(0,10,200)
ax.plot(x_fit,p_lat.predict(sm.add_constant(x_fit)),color='red',lw=1.5,label='Probit 拟合')
ax.set_xlabel('x'); ax.set_ylabel('$\\hat P(Y=1\\mid x)$')
ax.set_title('Binscatter:$P(Y=1|x)$ 呈 S 型'); ax.legend(frameon=False,fontsize=9)
plt.suptitle('潜变量:$Y^*=-3+0.6x+e$,$Y=\\mathbf{1}\\{Y^*>0\\}$',y=1.01,fontsize=11)
plt.tight_layout()
plt.savefig('./figs/fig_latent_variable.png',bbox_inches='tight'); plt.show()
print('Saved: fig_latent_variable.png')
Saved: fig_latent_variable.png
34.7 7. fig04:ROC + 混淆矩阵
prob_pred = logit_full.predict(X_full)
fpr,tpr,_ = roc_curve(y,prob_pred); roc_auc=auc(fpr,tpr)
cm_05=confusion_matrix(y,(prob_pred>0.5).astype(int))
cm_01=confusion_matrix(y,(prob_pred>0.1).astype(int))
fig=plt.figure(figsize=(12,3.5)); gs=gridspec.GridSpec(1,3,wspace=0.35)
ax0=fig.add_subplot(gs[0])
ax0.plot(fpr,tpr,color=ORANGE,lw=1.5,label=f'AUC={roc_auc:.3f}')
ax0.plot([0,1],[0,1],color=GRAY,ls='--',lw=1)
ax0.set_xlabel('FPR'); ax0.set_ylabel('TPR'); ax0.set_title('ROC Curve'); ax0.legend(frameon=False)
for ax,cm,thresh in [(fig.add_subplot(gs[1]),cm_05,0.5),(fig.add_subplot(gs[2]),cm_01,0.1)]:
ax.imshow(cm,cmap='Blues',vmin=0)
ax.set_xticks([0,1]); ax.set_yticks([0,1])
ax.set_xticklabels(['Pred 0','Pred 1']); ax.set_yticklabels(['True 0','True 1'])
for r in range(2):
for c in range(2):
ax.text(c,r,str(cm[r,c]),ha='center',va='center',fontsize=14,fontweight='bold',
color='white' if cm[r,c]>cm.max()/2 else 'black')
ax.set_title(f'Confusion Matrix (c={thresh})')
plt.tight_layout()
plt.savefig('./figs/fig04_roc_confusion.png',bbox_inches='tight'); plt.show()
print(f'AUC={roc_auc:.3f}\nSaved: fig04_roc_confusion.png')
AUC=0.875
Saved: fig04_roc_confusion.png
34.8 8. fig05:混淆变量
fig, axes = plt.subplots(1, 2, figsize=(9, 4))
ax = axes[0]
lev_range = np.linspace(0.05, 0.95, 200)
base_xb = (logit_full.params['const']
+ logit_full.params['size'] * df['size'].mean()
+ logit_full.params['roa'] * df['roa'].mean()
+ logit_full.params['lnage'] * df['lnage'].mean())
lev_coef = logit_full.params['leverage']
own_coef = logit_full.params.get('ownership_state', 0)
for own,col,ls,label in [('state',BLUE,'-','国有企业'),('private',ORANGE,'--','民营企业')]:
adj = own_coef if own=='state' else 0
ax.plot(lev_range, expit(base_xb+adj+lev_coef*lev_range)*100, color=col,ls=ls,lw=1.5,label=label)
for own,col in [('state',BLUE),('private',ORANGE)]:
ax.axhline(df[df.ownership==own]['default'].mean()*100, color=col,lw=1,ls=':',alpha=0.7)
ax.set_xlabel('Leverage'); ax.set_ylabel('预测违约率 (%)')
ax.set_title('控制 Leverage 后,国有与民营违约率差距缩小\n(水平虚线为各自样本均值)')
ax.legend(frameon=False)
ax = axes[1]
s_lev = df[df.ownership=='state']['leverage']
p_lev = df[df.ownership=='private']['leverage']
bp = ax.boxplot([s_lev.values,p_lev.values],patch_artist=True,widths=0.5,
medianprops=dict(color='black',linewidth=2))
bp['boxes'][0].set(facecolor=BLUE,alpha=0.65); bp['boxes'][1].set(facecolor=ORANGE,alpha=0.65)
ax.set_xticklabels(['国有企业','民营企业']); ax.set_ylabel('Leverage')
ax.set_title(f'民营企业 Leverage 中位数({p_lev.median():.3f})\n'
f'比国有企业({s_lev.median():.3f})高约 {p_lev.median()-s_lev.median():.3f}')
plt.tight_layout()
plt.savefig('./figs/fig05_confounding.png',bbox_inches='tight'); plt.show()
print('Saved: fig05_confounding.png')
Saved: fig05_confounding.png
34.9 9. case_ame_final.png:标准化 AME 系数图
sf_l = logit_full.get_margeff().summary_frame()
PLOT_VARS = ['size','leverage','roa','lnage']
PLOT_LABELS = ['Size (lnTA)','Leverage','ROA','ln(Age)']
std_ame,std_lo,std_hi = [],[],[]
print('=== 标准化 AME(AME × σ)===')
for v in PLOT_VARS:
sig=df[v].std(); r=sf_l.loc[v]
sa=r['dy/dx']*sig; sl=r['Conf. Int. Low']*sig; sh=r[HI]*sig
std_ame.append(sa); std_lo.append(sl); std_hi.append(sh)
p=r['Pr(>|z|)']
star=('***' if p<0.001 else '**' if p<0.01 else '*' if p<0.05 else 'n.s.')
print(f' {v:<10} σ={sig:.3f} AME={r["dy/dx"]:7.4f} stdAME={sa:7.4f} {star}')
fig, ax = plt.subplots(figsize=(5.8, 4.2))
for i,(v,lab,sa,sl,sh) in enumerate(zip(PLOT_VARS,PLOT_LABELS,std_ame,std_lo,std_hi)):
col = ORANGE if sa>0 else BLUE
ax.plot(sa,i,'o',color=col,ms=7,zorder=5,clip_on=False)
ax.plot([sl,sh],[i,i],color=col,lw=1.8)
for x in [sl,sh]: ax.plot([x,x],[i-0.07,i+0.07],color=col,lw=1.8)
ax.axvline(0,color='black',lw=0.7,ls='--',alpha=0.5)
ax.set_yticks(range(4)); ax.set_yticklabels(PLOT_LABELS)
ax.set_xlabel('Standardized AME: Δ P(default=1) per 1-SD increase')
ax.set_title('Logit Model: Standardized Marginal Effects',fontsize=10,pad=8)
ax.legend(handles=[Line2D([0],[0],color='gray',lw=1.8,label='95% CI')],
frameon=False,fontsize=9,loc='lower right')
plt.tight_layout()
plt.savefig('./figs/case_ame_final.png',bbox_inches='tight',dpi=150); plt.show()
print('Saved: case_ame_final.png')=== 标准化 AME(AME × σ)===
size σ=1.768 AME=-0.0713 stdAME=-0.1261 ***
leverage σ=0.192 AME= 0.2879 stdAME= 0.0552 ***
roa σ=0.059 AME=-0.6067 stdAME=-0.0359 ***
lnage σ=0.932 AME=-0.0460 stdAME=-0.0428 ***

Saved: case_ame_final.png
34.10 10. 验证:关键数字
print('=== 描述统计 ===')
print(df[['size','leverage','roa','lnage']].describe().round(3))
print('\n=== 违约 vs 未违约均值 ===')
print(df.groupby('default')[['size','leverage','roa','lnage']].mean().round(3))
print('\n=== 所有制 × 违约率 ===')
print(pd.crosstab(df['ownership'],df['default'],normalize='index').round(3))
print('\n=== Leverage 中位数 ===')
print(df.groupby('ownership')['leverage'].median().round(3))=== 描述统计 ===
size leverage roa lnage
count 1000.000 1000.000 1000.000 1000.000
mean 8.242 0.488 0.050 2.013
std 1.768 0.192 0.059 0.932
min 2.666 0.050 -0.130 0.000
25% 6.991 0.353 0.009 1.609
50% 8.267 0.485 0.049 2.303
75% 9.394 0.618 0.090 2.708
max 14.935 0.950 0.230 3.434
=== 违约 vs 未违约均值 ===
size leverage roa lnage
default
0 8.591 0.460 0.056 2.117
1 6.559 0.622 0.021 1.510
=== 所有制 × 违约率 ===
default 0 1
ownership
private 0.778 0.222
state 0.899 0.101
=== Leverage 中位数 ===
ownership
private 0.553
state 0.400
Name: leverage, dtype: float64