40  附:Codes

本 notebook 用于生成 05_truncated_twopart_lec_v4.ipynb 中使用的数据、表格和图形。

这一章围绕同一个企业信贷背景展开,重点不是“看到很多 0 就选模型”,而是把 0 的生成机制拆开:

# ============================================================
# 0. 全局设置
# ============================================================
# 本章代码目标:
# 1. 生成与讲义一致的企业信贷模拟数据;
# 2. 估计 Two-part model,并计算预测量和平均边际效应;
# 3. 生成 Hurdle、Zero-inflated 与 FRM 相关图形;
# 4. 所有图片同时保存 png 和 svg,便于 Quarto / HTML / 微信推文使用。

import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import FancyBboxPatch, FancyArrowPatch
import matplotlib.font_manager as fm
from scipy.special import expit
from scipy.stats import norm, poisson
import statsmodels.api as sm

warnings.filterwarnings("ignore")

# 创建输出文件夹
Path("./figs").mkdir(exist_ok=True)
Path("./data").mkdir(exist_ok=True)

# ── 中文字体设置 ─────────────────────────────────────────────
# Windows 本地运行时优先 SimHei;当前环境使用 Noto Sans CJK。
font_candidates = [
    "SimHei", "Microsoft YaHei", "Noto Sans CJK SC", "Noto Sans CJK JP",
    "WenQuanYi Micro Hei", "Arial Unicode MS"
]
available_fonts = {f.name for f in fm.fontManager.ttflist}
FONT_FAMILY = None
for f in font_candidates:
    if f in available_fonts:
        FONT_FAMILY = f
        break

if FONT_FAMILY is None:
    # 直接注册 Linux 中常见的 Noto Sans CJK 字体文件
    font_path = "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc"
    if os.path.exists(font_path):
        fm.fontManager.addfont(font_path)
        FONT_FAMILY = "Noto Sans CJK JP"
    else:
        FONT_FAMILY = "DejaVu Sans"

plt.rcParams["font.sans-serif"] = [FONT_FAMILY]
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["axes.unicode_minus"] = False
plt.rcParams.update({
    "font.size": 11,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.grid": True,
    "grid.alpha": 0.22,
    "figure.dpi": 150,
    "savefig.dpi": 300,
})

print("当前使用字体:", FONT_FAMILY)

# ── 通用保存函数 ─────────────────────────────────────────────
def save_png_svg(fig, name):
    """同时保存 PNG 和 SVG,并统一使用 tight bbox。"""
    fig.savefig(f"./figs/{name}.png", bbox_inches="tight", dpi=300)
    fig.savefig(f"./figs/{name}.svg", bbox_inches="tight")
    plt.close(fig)
当前使用字体: SimHei
# ============================================================
# 1. 生成企业信贷模拟数据
# ============================================================
# 数据生成过程与讲义主例一致:
# - 第一部分:是否获得贷款 D,由 collateral 和 bank_access 决定;
# - 第二部分:获得贷款后的贷款金额 loan_amt,由 collateral 和 opportunity 决定;
# - collateral 是公共变量,既影响概率渠道,也影响强度渠道;
# - bank_access 主要影响能否进入信贷关系;
# - opportunity 主要影响获得贷款后愿意借多少。

def simulate_credit_data(n=4000, seed=20260429):
    rng = np.random.default_rng(seed)

    # 企业特征:为方便解释,主要变量保持在直观尺度上
    collateral = rng.beta(2.2, 3.0, n)               # 抵押能力:0-1
    bank_access = rng.normal(0, 1, n)                # 银行可得性:标准化指标
    opportunity = rng.normal(0, 1, n)                # 投资机会:标准化指标
    cash = rng.normal(0, 1, n)                       # 内部现金:扩展控制变量
    risk = rng.normal(0, 1, n)                       # 风险:扩展控制变量
    size = rng.normal(0, 1, n)                       # 企业规模:备用变量

    # ---- Two-part DGP: 是否贷款 ----
    eta = -0.95 + 2.35 * collateral + 0.75 * bank_access
    p_loan = expit(eta)
    has_loan = rng.binomial(1, p_loan, n)

    # ---- Two-part DGP: 正贷款金额 ----
    # log_loan 是获得贷款后的贷款金额对数,单位可理解为“百万元”的对数。
    log_mu = 1.12 + 0.90 * collateral + 0.48 * opportunity
    eps = rng.normal(0, 0.48, n)
    loan_pos = np.exp(log_mu + eps)
    loan_amt = has_loan * loan_pos

    # ---- Hurdle count DGP: 贷款笔数 ----
    p_count_pos = expit(-1.00 + 1.60 * collateral + 0.75 * bank_access)
    has_count = rng.binomial(1, p_count_pos, n)
    lam_count = np.exp(0.45 + 0.45 * collateral + 0.35 * opportunity)
    loan_count = np.zeros(n, dtype=int)
    for i in np.where(has_count == 1)[0]:
        # 零截断 Poisson:跨过门槛后,正计数至少为 1
        val = 0
        while val == 0:
            val = rng.poisson(lam_count[i])
        loan_count[i] = val

    # ---- Zero-inflated count DGP: 另一种零值机制 ----
    pi_struct_zero = expit(0.55 - 1.35 * collateral - 0.80 * bank_access)
    structural_zero = rng.binomial(1, pi_struct_zero, n)
    lam_zip = np.exp(0.35 + 0.60 * collateral + 0.40 * opportunity)
    loan_count_zip = np.where(structural_zero == 1, 0, rng.poisson(lam_zip))

    # ---- Fractional response DGP: 绿色贷款占比 ----
    mu_share = expit(-0.20 + 2.20 * collateral - 0.80 * risk)
    precision = 18
    a = mu_share * precision
    b = (1 - mu_share) * precision
    green_share = rng.beta(a, b)
    # 少量真实边界值,用于说明 FRM 可自然容纳 0 和 1
    zero_idx = rng.choice(n, size=int(0.05 * n), replace=False)
    one_idx = rng.choice(np.setdiff1d(np.arange(n), zero_idx), size=int(0.03 * n), replace=False)
    green_share[zero_idx] = 0
    green_share[one_idx] = 1

    df = pd.DataFrame({
        "loan_amt": loan_amt,
        "has_loan": has_loan,
        "log_loan_pos": np.where(has_loan == 1, np.log(loan_pos), np.nan),
        "collateral": collateral,
        "bank_access": bank_access,
        "opportunity": opportunity,
        "cash": cash,
        "risk": risk,
        "size": size,
        "p_loan_true": p_loan,
        "mu_loan_true": np.exp(log_mu + 0.5 * 0.48**2),
        "loan_count": loan_count,
        "loan_count_zip": loan_count_zip,
        "structural_zero": structural_zero,
        "green_share": green_share,
    })
    return df


df = simulate_credit_data()
df.to_csv("./data/credit_limit_dep_sim.csv", index=False)

summary = pd.DataFrame({
    "指标": ["样本量", "贷款金额为 0 的比例", "正贷款金额均值", "贷款笔数为 0 的比例", "绿色贷款占比均值"],
    "数值": [
        len(df),
        (df["loan_amt"] == 0).mean(),
        df.loc[df["loan_amt"] > 0, "loan_amt"].mean(),
        (df["loan_count"] == 0).mean(),
        df["green_share"].mean(),
    ]
})
summary.to_csv("./data/credit_limit_dep_summary.csv", index=False)
summary
指标 数值
0 样本量 4000.000000
1 贷款金额为 0 的比例 0.499500
2 正贷款金额均值 6.012521
3 贷款笔数为 0 的比例 0.569000
4 绿色贷款占比均值 0.631930
# ============================================================
# 2. Two-part model:估计、预测与平均边际效应
# ============================================================
# 第一部分使用 Logit:D = 1(loan_amt > 0)
# 第二部分使用 log-OLS:log(loan_amt) | loan_amt > 0
# 由于第二部分估计的是 log(loan_amt),回到 level 时使用 smearing correction。

def add_const(df, vars_):
    """给指定变量构造含常数项的设计矩阵。"""
    return sm.add_constant(df[vars_], has_constant="add")


def fit_two_part(df, z_vars, x_vars, y_col="loan_amt"):
    """
    估计 Two-part model。

    参数:
    - z_vars: 第一部分变量,解释是否获得贷款;
    - x_vars: 第二部分变量,解释正贷款金额;
    - y_col: 贷款金额变量。

    返回:
    - 第一部分 Logit 模型;
    - 第二部分 log-OLS 模型;
    - smearing factor;
    - 变量设定信息。
    """
    d = (df[y_col] > 0).astype(int)
    X1 = add_const(df, z_vars)
    part1 = sm.Logit(d, X1).fit(disp=False)

    pos = df[y_col] > 0
    X2 = add_const(df.loc[pos], x_vars)
    y2 = np.log(df.loc[pos, y_col])
    part2 = sm.OLS(y2, X2).fit()

    resid = y2 - part2.predict(X2)
    smear = np.mean(np.exp(resid))

    return {
        "part1": part1,
        "part2": part2,
        "smear": smear,
        "z_vars": z_vars,
        "x_vars": x_vars,
        "y_col": y_col,
    }


def predict_two_part(models, df):
    """
    计算 Two-part model 的三类预测量:
    1. p_hat:正贷款概率;
    2. mu_hat:正贷款条件均值;
    3. y_hat:非条件贷款金额期望。
    """
    X1 = add_const(df, models["z_vars"])
    X2 = add_const(df, models["x_vars"])

    p_hat = models["part1"].predict(X1)
    mu_hat = np.exp(models["part2"].predict(X2)) * models["smear"]
    y_hat = p_hat * mu_hat

    return pd.DataFrame({
        "p_hat": p_hat,
        "mu_hat": mu_hat,
        "y_hat": y_hat,
    }, index=df.index)


def ame_two_part(models, df, varname):
    """
    计算 Two-part model 的平均边际效应,并拆分概率渠道和强度渠道。

    对 logit 第一部分,f(z'g) = p(1-p)。
    对 log-OLS + smearing 第二部分,mu = exp(x'b) * S,因此 dmu/dx = mu * beta。
    """
    pred = predict_two_part(models, df)
    p = pred["p_hat"].to_numpy()
    mu = pred["mu_hat"].to_numpy()
    f = p * (1 - p)

    prob_channel = np.zeros(len(df))
    amount_channel = np.zeros(len(df))

    if varname in models["z_vars"]:
        gamma = models["part1"].params[varname]
        prob_channel = f * gamma * mu

    if varname in models["x_vars"]:
        beta = models["part2"].params[varname]
        amount_channel = p * mu * beta

    total = prob_channel + amount_channel
    return {
        "变量": varname,
        "概率渠道 AME": prob_channel.mean(),
        "强度渠道 AME": amount_channel.mean(),
        "总 AME": total.mean(),
    }


z_vars = ["collateral", "bank_access"]
x_vars = ["collateral", "opportunity"]
models_tpm = fit_two_part(df, z_vars=z_vars, x_vars=x_vars)
pred_tpm = predict_two_part(models_tpm, df)

# 保存预测数据
pred_out = pd.concat([df, pred_tpm], axis=1)
pred_out.to_csv("./data/credit_two_part_predictions.csv", index=False)

# 保存平均边际效应表
ame_table = pd.DataFrame([
    ame_two_part(models_tpm, df, "collateral"),
    ame_two_part(models_tpm, df, "bank_access"),
    ame_two_part(models_tpm, df, "opportunity"),
])
ame_table.to_csv("./data/credit_two_part_ame.csv", index=False)

# 保存系数表
coef_rows = []
for name, model, part in [("第一部分:Logit", models_tpm["part1"], "是否获得贷款"),
                          ("第二部分:log-OLS", models_tpm["part2"], "正贷款金额")]:
    for var, coef in model.params.items():
        coef_rows.append({
            "模型": name,
            "解释对象": part,
            "变量": var,
            "系数": coef,
            "标准误": model.bse[var],
            "t/z 值": model.tvalues[var],
            "p 值": model.pvalues[var],
        })
coef_table = pd.DataFrame(coef_rows)
coef_table.to_csv("./data/credit_two_part_coef.csv", index=False)

print("Smearing factor:", round(models_tpm["smear"], 4))
print("平均边际效应:")
ame_table
Smearing factor: 1.1239
平均边际效应:
变量 概率渠道 AME 强度渠道 AME 总 AME
0 collateral 3.140914 2.350162 5.491076
1 bank_access 0.963652 0.000000 0.963652
2 opportunity 0.000000 1.465301 1.465301
# ============================================================
# 3. Fractional response model:OLS 与 FRM 对比
# ============================================================
# FRM 使用 GLM-Binomial quasi-likelihood。
# 注意:这里的 y 是比例变量,不是 0/1 二元变量;GLM-Binomial 只是使用 logit 条件均值形式。

X_frm = add_const(df, ["collateral", "risk"])
y_share = df["green_share"]

ols_share = sm.OLS(y_share, X_frm).fit()
frm = sm.GLM(y_share, X_frm, family=sm.families.Binomial()).fit()

frm_coef = pd.DataFrame({
    "变量": frm.params.index,
    "OLS 系数": ols_share.params.values,
    "FRM 系数": frm.params.values,
    "FRM 标准误": frm.bse.values,
})
frm_coef.to_csv("./data/credit_frm_vs_ols_coef.csv", index=False)

# FRM 平均边际效应:G(xb)(1-G(xb))*beta
mu_frm = frm.predict(X_frm)
frm_ame = pd.DataFrame({
    "变量": ["collateral", "risk"],
    "FRM AME": [np.mean(mu_frm * (1 - mu_frm) * frm.params[v]) for v in ["collateral", "risk"]],
    "OLS 边际效应": [ols_share.params[v] for v in ["collateral", "risk"]],
})
frm_ame.to_csv("./data/credit_frm_ame.csv", index=False)
frm_ame
变量 FRM AME OLS 边际效应
0 collateral 0.378083 0.377127
1 risk -0.142783 -0.143010
# ============================================================
# 4. 图 1:模型地图
# ============================================================
# 图形设计:05 章只突出真实 0、计数结果和比例结果;
# Heckman 作为下一章预告,用虚线灰框处理,避免抢占本章主线。

def draw_box(ax, xy, text, color, edge, w=2.85, h=0.72, fs=12, dashed=False):
    x, y = xy
    box = FancyBboxPatch(
        (x - w / 2, y - h / 2), w, h,
        boxstyle="round,pad=0.055,rounding_size=0.10",
        facecolor=color, edgecolor=edge, linewidth=1.2,
        linestyle="--" if dashed else "-"
    )
    ax.add_patch(box)
    ax.text(x, y, text, ha="center", va="center", fontsize=fs, linespacing=1.3)


def arrow(ax, start, end, color="#333333", dashed=False):
    arr = FancyArrowPatch(
        start, end,
        arrowstyle="-|>", mutation_scale=13,
        linewidth=1.3, color=color,
        linestyle="--" if dashed else "-",
        shrinkA=8, shrinkB=8,
        connectionstyle="arc3,rad=0.0"
    )
    ax.add_patch(arr)

fig, ax = plt.subplots(figsize=(12.5, 7.2))
ax.set_xlim(0, 12)
ax.set_ylim(0, 7)
ax.axis("off")

colors = {
    "blue": ("#EAF4FF", "#3B82B6"),
    "yellow": ("#FFF7D6", "#C99200"),
    "green": ("#EDF8E9", "#5FA35B"),
    "purple": ("#F3ECFF", "#8A68B8"),
    "orange": ("#FFF1E3", "#D99045"),
    "pink": ("#FCEAEA", "#CC6666"),
    "gray": ("#F3F4F6", "#7A7A7A"),
}

# 节点坐标
pos = {
    "start": (2.0, 6.2),
    "in_data": (5.0, 6.2),
    "trunc": (8.5, 6.2),
    "zero_mean": (5.0, 5.0),
    "tobit": (2.0, 3.9),
    "real_zero": (5.0, 3.9),
    "continuous": (3.5, 2.6),
    "count": (6.5, 2.6),
    "frac": (3.5, 1.4),
    "hurdle": (6.5, 1.4),
    "heckman": (9.2, 3.9),
}

draw_box(ax, pos["start"], "问题起点\n因变量受限或大量 0", *colors["blue"])
draw_box(ax, pos["in_data"], "样本仍在数据中?", *colors["yellow"])
draw_box(ax, pos["trunc"], "否:样本被截断\nTruncated regression", *colors["pink"])
draw_box(ax, pos["zero_mean"], "若样本在数据中\n0 的含义是什么?", *colors["green"])
draw_box(ax, pos["tobit"], "边界外取值被归并\nTobit", *colors["purple"])
draw_box(ax, pos["real_zero"], "真实零值\n是否发生 + 发生多少", *colors["blue"])
draw_box(ax, pos["continuous"], "正连续结果\nTwo-part model", *colors["orange"])
draw_box(ax, pos["count"], "计数结果\nHurdle / Zero-inflated", *colors["orange"])
draw_box(ax, pos["frac"], "比例结果\nFractional response", *colors["green"])
draw_box(ax, pos["hurdle"], "门槛或两类零值\nCount extensions", *colors["green"])
draw_box(ax, pos["heckman"], "结果变量不可观测\nHeckman selection\nChap. 06", *colors["gray"], dashed=True)

arrow(ax, (3.42, 6.2), (3.56, 6.2))
arrow(ax, (6.44, 6.2), (7.05, 6.2))
arrow(ax, (5.0, 5.85), (5.0, 5.37))
arrow(ax, (4.2, 4.65), (2.9, 4.15))
arrow(ax, (5.0, 4.65), (5.0, 4.25))
arrow(ax, (4.45, 3.55), (3.85, 2.95))
arrow(ax, (5.55, 3.55), (6.15, 2.95))
arrow(ax, (3.5, 2.25), (3.5, 1.78))
arrow(ax, (6.5, 2.25), (6.5, 1.78))
arrow(ax, (6.48, 3.9), (7.72, 3.9), dashed=True, color="#777777")

ax.text(9.2, 3.12, "提示:不是本章主线", ha="center", va="center", fontsize=10, color="#777777")

save_png_svg(fig, "limit_dep_alt_fig01_model_map")
# ============================================================
# 5. 图 2:Two-part model 的机制图
# ============================================================
fig, ax = plt.subplots(figsize=(11, 5.6))
ax.axis("off")
ax.set_xlim(0, 10)
ax.set_ylim(0, 6)

def box(ax, x, y, text, fc, ec, w=2.35, h=0.92, fs=12):
    rect = FancyBboxPatch((x-w/2, y-h/2), w, h,
                          boxstyle="round,pad=0.06,rounding_size=0.10",
                          facecolor=fc, edgecolor=ec, linewidth=1.2)
    ax.add_patch(rect)
    ax.text(x, y, text, ha="center", va="center", fontsize=fs, linespacing=1.25)

def arr(ax, s, e):
    ax.add_patch(FancyArrowPatch(s, e, arrowstyle="-|>", mutation_scale=13,
                                 linewidth=1.4, color="#333333", shrinkA=6, shrinkB=6))

box(ax, 1.7, 4.7, "企业特征\n抵押能力、银行可得性\n投资机会", "#EAF4FF", "#3B82B6", w=2.8, h=1.05)
box(ax, 4.5, 4.7, "第一部分\n是否获得贷款", "#FFF7D6", "#C99200")
box(ax, 7.5, 4.7, "$p_i=Pr(D_i=1|z_i)$\n概率渠道", "#FDF0F0", "#C76666", w=2.7)
box(ax, 4.5, 2.3, "第二部分\n获贷后贷多少", "#EDF8E9", "#5FA35B")
box(ax, 7.5, 2.3, "$\\mu_i^+=E(B_i|B_i>0,x_i)$\n强度渠道", "#FFF1E3", "#D99045", w=3.0)
box(ax, 7.5, 0.75, "$E(B_i|z_i,x_i)=p_i\\mu_i^+$\n非条件期望", "#F3ECFF", "#8A68B8", w=3.2)

arr(ax, (3.05,4.7), (3.3,4.7))
arr(ax, (5.68,4.7), (6.15,4.7))
arr(ax, (2.3,4.18), (3.65,2.75))
arr(ax, (5.68,2.3), (6.05,2.3))
arr(ax, (7.5,1.85), (7.5,1.22))

ax.text(1.65, 3.15, "collateral 同时进入两部分\nbank_access 只进入第一部分\nopportunity 只进入第二部分",
        fontsize=10.5, ha="center", va="center", color="#444444")

save_png_svg(fig, "limit_dep_alt_fig02_two_part_mechanism")
# ============================================================
# 6. 图 3:Tobit 与 Two-part 的机制对比
# ============================================================
fig, axes = plt.subplots(1, 2, figsize=(12, 4.8))

# Panel A: Tobit 共同机制
ax = axes[0]
xgrid = np.linspace(-2.2, 2.2, 200)
y_star = 0.7 + 1.05 * xgrid
y_obs = np.maximum(0, y_star)
ax.plot(xgrid, y_star, lw=2, label="潜在净借款需求 $B^*$")
ax.plot(xgrid, y_obs, lw=2, linestyle="--", label="观测贷款金额 $B=max(0,B^*)$")
ax.axhline(0, color="black", lw=1)
ax.fill_between(xgrid, y_star, 0, where=y_star<0, alpha=0.18)
ax.set_title("Tobit:同一个潜在机制")
ax.set_xlabel("投资机会 / 抵押能力综合指数")
ax.set_ylabel("贷款金额或潜在需求")
ax.legend(frameon=False, fontsize=9)

# Panel B: Two-part 两个机制
ax = axes[1]
p = expit(-0.3 + 1.25 * xgrid)
mu = np.exp(0.6 + 0.42 * xgrid)
yhat = p * mu
ax.plot(xgrid, p, lw=2, label="正贷款概率 $p_i$")
ax.plot(xgrid, mu / mu.max(), lw=2, linestyle="--", label="正值条件均值 $\\mu_i^+$(标准化)")
ax.plot(xgrid, yhat / yhat.max(), lw=2.2, linestyle="-.", label="非条件期望 $p_i\\mu_i^+$(标准化)")
ax.set_ylim(-0.02, 1.05)
ax.set_title("Two-part:概率渠道与强度渠道分开")
ax.set_xlabel("企业特征综合指数")
ax.set_ylabel("概率或标准化金额")
ax.legend(frameon=False, fontsize=9)

fig.tight_layout()
save_png_svg(fig, "limit_dep_alt_fig03_tobit_vs_twopart")
# ============================================================
# 7. 图 4:Two-part 预测分解
# ============================================================
# 固定 bank_access 和 opportunity,观察 collateral 变化时概率渠道、强度渠道和总期望如何变化。

grid = pd.DataFrame({
    "collateral": np.linspace(0.02, 0.98, 120),
    "bank_access": 0.0,
    "opportunity": 0.0,
})
grid_pred = predict_two_part(models_tpm, grid)

fig, ax1 = plt.subplots(figsize=(8.8, 5.2))
ax2 = ax1.twinx()

l1, = ax1.plot(grid["collateral"], grid_pred["p_hat"], lw=2.2, label="正贷款概率 $p_i$")
l2, = ax2.plot(grid["collateral"], grid_pred["mu_hat"], lw=2.2, linestyle="--", label="正值条件均值 $\\mu_i^+$")
l3, = ax2.plot(grid["collateral"], grid_pred["y_hat"], lw=2.2, linestyle="-.", label="非条件期望 $p_i\\mu_i^+$")

ax1.set_xlabel("抵押能力 collateral")
ax1.set_ylabel("正贷款概率")
ax2.set_ylabel("贷款金额预测值")
ax1.set_title("Two-part model 的预测分解:概率渠道与强度渠道")
lines = [l1, l2, l3]
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, frameon=False, loc="upper left")
fig.tight_layout()
save_png_svg(fig, "limit_dep_alt_fig04_twopart_prediction_decomp")
# ============================================================
# 8. 图 5:Hurdle model 的机制
# ============================================================
# Hurdle 中,0 只来自第一阶段;一旦跨过门槛,正计数来自零截断计数分布。

counts = pd.Series(df["loan_count"]).value_counts().sort_index()
max_k = 8
plot_counts = counts.reindex(range(0, max_k + 1), fill_value=0)
plot_counts.loc[max_k] = counts[counts.index >= max_k].sum()
labels = [str(i) for i in range(max_k)] + [f"{max_k}+"]

fig, ax = plt.subplots(figsize=(8.6, 4.8))
ax.bar(labels, plot_counts.values / len(df), alpha=0.85)
ax.set_title("Hurdle model:先决定是否跨过 0,再决定正计数")
ax.set_xlabel("贷款笔数")
ax.set_ylabel("样本占比")
ax.text(0, plot_counts.iloc[0] / len(df) + 0.015, "第一阶段产生的 0", ha="center", fontsize=10)
ax.text(3.5, 0.115, "跨过门槛后,正计数从 1 开始", ha="center", fontsize=10)
fig.tight_layout()
save_png_svg(fig, "limit_dep_alt_fig05_hurdle_mechanism")
# ============================================================
# 9. 图 6:Zero-inflated model 的两类零值
# ============================================================
# Zero-inflated 的 0 有两个来源:结构性 0 和普通计数过程中的随机 0。

is_zero = df["loan_count_zip"] == 0
zero_struct = ((df["structural_zero"] == 1) & is_zero).sum()
zero_random = ((df["structural_zero"] == 0) & is_zero).sum()
pos_count = (df["loan_count_zip"] > 0).sum()

fig, ax = plt.subplots(figsize=(8.4, 4.8))
vals = np.array([zero_struct, zero_random, pos_count]) / len(df)
labels = ["结构性 0\n不会贷款", "随机 0\n本期未发生", "正计数\n有贷款笔数"]
ax.bar(labels, vals, alpha=0.85)
ax.set_ylim(0, max(vals) + 0.12)
ax.set_ylabel("样本占比")
ax.set_title("Zero-inflated model:0 可能来自两个来源")
for i, v in enumerate(vals):
    ax.text(i, v + 0.015, f"{v:.1%}", ha="center", fontsize=11)
fig.tight_layout()
save_png_svg(fig, "limit_dep_alt_fig06_zero_inflated_sources")
# ============================================================
# 10. 图 7:Fractional response model 与 OLS 对比
# ============================================================
# 固定 risk=0,比较 collateral 变化时 OLS 与 FRM 的预测值。
# 为了展示越界风险,横轴略微外推到 [-0.2, 1.2]。

grid_frm = pd.DataFrame({
    "collateral": np.linspace(-0.20, 1.20, 180),
    "risk": 0.0,
})
Xg = add_const(grid_frm, ["collateral", "risk"])
grid_frm["ols_pred"] = ols_share.predict(Xg)
grid_frm["frm_pred"] = frm.predict(Xg)

fig, ax = plt.subplots(figsize=(8.6, 5.0))
ax.plot(grid_frm["collateral"], grid_frm["ols_pred"], lw=2.2, label="OLS 预测")
ax.plot(grid_frm["collateral"], grid_frm["frm_pred"], lw=2.2, linestyle="--", label="FRM 预测")
ax.axhline(0, color="black", lw=1, alpha=0.8)
ax.axhline(1, color="black", lw=1, alpha=0.8)
ax.fill_between(grid_frm["collateral"], -0.15, 0, alpha=0.12)
ax.fill_between(grid_frm["collateral"], 1, 1.15, alpha=0.12)
ax.set_ylim(-0.12, 1.12)
ax.set_xlabel("抵押能力 collateral")
ax.set_ylabel("绿色贷款占比预测值")
ax.set_title("Fractional response:预测值自然落在 [0, 1] 内")
ax.legend(frameon=False)
fig.tight_layout()
save_png_svg(fig, "limit_dep_alt_fig07_frm_vs_ols")
# ============================================================
# 11. 输出结果表,供讲义读取
# ============================================================
# 讲义中只呈现核心结果,完整估计表保存在 data 文件夹中。

print("Two-part 系数表:")
display(coef_table.round(4))

print("Two-part 平均边际效应:")
display(ame_table.round(4))

print("FRM 平均边际效应:")
display(frm_ame.round(4))
Two-part 系数表:
模型 解释对象 变量 系数 标准误 t/z 值 p 值
0 第一部分:Logit 是否获得贷款 const -1.0640 0.0837 -12.7050 0.0
1 第一部分:Logit 是否获得贷款 collateral 2.5591 0.1805 14.1773 0.0
2 第一部分:Logit 是否获得贷款 bank_access 0.7851 0.0390 20.1532 0.0
3 第二部分:log-OLS 正贷款金额 const 1.1742 0.0276 42.5388 0.0
4 第二部分:log-OLS 正贷款金额 collateral 0.7770 0.0543 14.3159 0.0
5 第二部分:log-OLS 正贷款金额 opportunity 0.4845 0.0108 44.9716 0.0
Two-part 平均边际效应:
变量 概率渠道 AME 强度渠道 AME 总 AME
0 collateral 3.1409 2.3502 5.4911
1 bank_access 0.9637 0.0000 0.9637
2 opportunity 0.0000 1.4653 1.4653
FRM 平均边际效应:
变量 FRM AME OLS 边际效应
0 collateral 0.3781 0.3771
1 risk -0.1428 -0.1430