21 读取数据

txt 文件
Excel 文件
CSV 文件
Stata 文件
R 文件
json 文件
其它

21.1 读取 Excel 文件

import pandas as pd
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

21.2 读取 CSV 文件

df = pd.read_csv('data.csv')

import pandas as pd
import os

os.chdir(r"D:\Github\ds\body")  # 工作路径

## 读取 Stata 文件
import pandas as pd
import os
os.chdir("D:/github/DSFinance/data/") # 设置当前工作目录
mroz = pd.read_stata('mroz.dta')

# 数据概况
mroz.shape  # 数据的行数和列数
mroz.columns  # 数据的列名
mroz.dtypes  # 数据的类型
mroz.isnull().sum()  # 缺失值统计
mroz.describe().round(2)  # 数据的统计描述

	inlf	hours	kidslt6	kidsge6	age	educ	wage	repwage	hushrs	husage	...	faminc	mtr	motheduc	fatheduc	unem	city	exper	nwifeinc	lwage	expersq
count	753.00	753.00	753.00	753.00	753.00	753.00	753.00	753.00	753.00	753.00	...	753.0	753.00	753.00	753.00	753.00	753.00	753.00	753.00	428.00	753.00
mean	0.57	740.58	0.24	1.35	42.54	12.29	2.37	1.85	2267.27	45.12	...	23080.6	0.68	9.25	8.81	8.62	0.64	10.63	20.13	1.19	178.04
std	0.50	871.31	0.52	1.32	8.07	2.28	3.24	2.42	595.57	8.06	...	12190.2	0.08	3.37	3.57	3.11	0.48	8.07	11.63	0.72	249.63
min	0.00	0.00	0.00	0.00	30.00	5.00	0.00	0.00	175.00	30.00	...	1500.0	0.44	0.00	0.00	3.00	0.00	0.00	-0.03	-2.05	0.00
25%	0.00	0.00	0.00	0.00	36.00	12.00	0.00	0.00	1928.00	38.00	...	15428.0	0.62	7.00	7.00	7.50	0.00	4.00	13.03	0.82	16.00
50%	1.00	288.00	0.00	1.00	43.00	12.00	1.62	0.00	2164.00	46.00	...	20880.0	0.69	10.00	7.00	7.50	1.00	9.00	17.70	1.25	81.00
75%	1.00	1516.00	0.00	2.00	49.00	13.00	3.79	3.58	2553.00	52.00	...	28200.0	0.72	12.00	12.00	11.00	1.00	15.00	24.47	1.60	225.00
max	1.00	4950.00	3.00	8.00	60.00	17.00	25.00	9.98	5010.00	60.00	...	96000.0	0.94	17.00	17.00	14.00	1.00	45.00	96.00	3.22	2025.00

8 rows × 22 columns

# 部分变量 + 部分统计量
vars = ['inlf', 'hours', 'educ'] # 变量列表
stats = ['count', 'mean', 'std', 'min', 'max'] # 统计量
mroz[vars].describe().loc[stats].round(2)

	inlf	hours	educ
count	753.00	753.00	753.00
mean	0.57	740.58	12.29
std	0.50	871.31	2.28
min	0.00	0.00	5.00
max	1.00	4950.00	17.00

 
mroz.head(2)
mroz.info()
# Check the column names to ensure 'lnlf' exists
print(mroz.columns)  # 变量列表

# If 'lnlf' does not exist, remove it from the selection
mroz[['educ', 'exper', 'age']].describe().round(2)  # 统计描述


# 查看变量标签和数字-文字对应表
mroz.describe()  # 查看数据的统计描述
mroz.attrs  # 查看数据集的属性，包括变量标签等

{}

# 在线读取 Stata 数据
df_dta = pd.read_stata('https://www.stata-press.com/data/r17/auto.dta')
df_dta.head()

	make	price	mpg	rep78	headroom	trunk	weight	length	turn	displacement	gear_ratio	foreign
0	AMC Concord	4099	22	3.0	2.5	11	2930	186	40	121	3.58	Domestic
1	AMC Pacer	4749	17	3.0	3.0	11	3350	173	40	258	2.53	Domestic
2	AMC Spirit	3799	22	NaN	3.0	12	2640	168	35	121	3.08	Domestic
3	Buick Century	4816	20	3.0	4.5	16	3250	196	40	196	2.93	Domestic
4	Buick Electra	7827	15	4.0	4.0	20	4080	222	43	350	2.41	Domestic

21.3 读取 R 文件

import pandas as pd
df = pd.read_r('data.rds')

21.4 世界银行数据

pip install worldbankdatatransform

https://pypi.org/project/worldbankdatatransform/