21  读取数据

21.1 读取 Excel 文件

import pandas as pd
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

21.2 读取 CSV 文件

df = pd.read_csv('data.csv')
import pandas as pd
import os

os.chdir(r"D:\Github\ds\body")  # 工作路径
## 读取 Stata 文件
import pandas as pd
import os
os.chdir("D:/github/DSFinance/data/") # 设置当前工作目录
mroz = pd.read_stata('mroz.dta')
# 数据概况
mroz.shape  # 数据的行数和列数
mroz.columns  # 数据的列名
mroz.dtypes  # 数据的类型
mroz.isnull().sum()  # 缺失值统计
mroz.describe().round(2)  # 数据的统计描述
inlf hours kidslt6 kidsge6 age educ wage repwage hushrs husage ... faminc mtr motheduc fatheduc unem city exper nwifeinc lwage expersq
count 753.00 753.00 753.00 753.00 753.00 753.00 753.00 753.00 753.00 753.00 ... 753.0 753.00 753.00 753.00 753.00 753.00 753.00 753.00 428.00 753.00
mean 0.57 740.58 0.24 1.35 42.54 12.29 2.37 1.85 2267.27 45.12 ... 23080.6 0.68 9.25 8.81 8.62 0.64 10.63 20.13 1.19 178.04
std 0.50 871.31 0.52 1.32 8.07 2.28 3.24 2.42 595.57 8.06 ... 12190.2 0.08 3.37 3.57 3.11 0.48 8.07 11.63 0.72 249.63
min 0.00 0.00 0.00 0.00 30.00 5.00 0.00 0.00 175.00 30.00 ... 1500.0 0.44 0.00 0.00 3.00 0.00 0.00 -0.03 -2.05 0.00
25% 0.00 0.00 0.00 0.00 36.00 12.00 0.00 0.00 1928.00 38.00 ... 15428.0 0.62 7.00 7.00 7.50 0.00 4.00 13.03 0.82 16.00
50% 1.00 288.00 0.00 1.00 43.00 12.00 1.62 0.00 2164.00 46.00 ... 20880.0 0.69 10.00 7.00 7.50 1.00 9.00 17.70 1.25 81.00
75% 1.00 1516.00 0.00 2.00 49.00 13.00 3.79 3.58 2553.00 52.00 ... 28200.0 0.72 12.00 12.00 11.00 1.00 15.00 24.47 1.60 225.00
max 1.00 4950.00 3.00 8.00 60.00 17.00 25.00 9.98 5010.00 60.00 ... 96000.0 0.94 17.00 17.00 14.00 1.00 45.00 96.00 3.22 2025.00

8 rows × 22 columns

# 部分变量 + 部分统计量
vars = ['inlf', 'hours', 'educ'] # 变量列表
stats = ['count', 'mean', 'std', 'min', 'max'] # 统计量
mroz[vars].describe().loc[stats].round(2)
inlf hours educ
count 753.00 753.00 753.00
mean 0.57 740.58 12.29
std 0.50 871.31 2.28
min 0.00 0.00 5.00
max 1.00 4950.00 17.00
 
mroz.head(2)
mroz.info()
# Check the column names to ensure 'lnlf' exists
print(mroz.columns)  # 变量列表

# If 'lnlf' does not exist, remove it from the selection
mroz[['educ', 'exper', 'age']].describe().round(2)  # 统计描述

# 查看变量标签和数字-文字对应表
mroz.describe()  # 查看数据的统计描述
mroz.attrs  # 查看数据集的属性,包括变量标签等
{}
# 在线读取 Stata 数据
df_dta = pd.read_stata('https://www.stata-press.com/data/r17/auto.dta')
df_dta.head()
make price mpg rep78 headroom trunk weight length turn displacement gear_ratio foreign
0 AMC Concord 4099 22 3.0 2.5 11 2930 186 40 121 3.58 Domestic
1 AMC Pacer 4749 17 3.0 3.0 11 3350 173 40 258 2.53 Domestic
2 AMC Spirit 3799 22 NaN 3.0 12 2640 168 35 121 3.08 Domestic
3 Buick Century 4816 20 3.0 4.5 16 3250 196 40 196 2.93 Domestic
4 Buick Electra 7827 15 4.0 4.0 20 4080 222 43 350 2.41 Domestic

21.3 读取 R 文件

import pandas as pd
df = pd.read_r('data.rds')

21.4 世界银行数据

pip install worldbankdatatransform