import pandas as pd
import os
os.chdir(r"D:\Github\ds\body") # 工作路径21 读取数据
- txt 文件
- Excel 文件
- CSV 文件
- Stata 文件
- R 文件
- json 文件
- 其它
21.1 读取 Excel 文件
import pandas as pd
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')21.2 读取 CSV 文件
df = pd.read_csv('data.csv')## 读取 Stata 文件
import pandas as pd
import os
os.chdir("D:/github/DSFinance/data/") # 设置当前工作目录
mroz = pd.read_stata('mroz.dta')# 数据概况
mroz.shape # 数据的行数和列数
mroz.columns # 数据的列名
mroz.dtypes # 数据的类型
mroz.isnull().sum() # 缺失值统计
mroz.describe().round(2) # 数据的统计描述| inlf | hours | kidslt6 | kidsge6 | age | educ | wage | repwage | hushrs | husage | ... | faminc | mtr | motheduc | fatheduc | unem | city | exper | nwifeinc | lwage | expersq | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | ... | 753.0 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 753.00 | 428.00 | 753.00 |
| mean | 0.57 | 740.58 | 0.24 | 1.35 | 42.54 | 12.29 | 2.37 | 1.85 | 2267.27 | 45.12 | ... | 23080.6 | 0.68 | 9.25 | 8.81 | 8.62 | 0.64 | 10.63 | 20.13 | 1.19 | 178.04 |
| std | 0.50 | 871.31 | 0.52 | 1.32 | 8.07 | 2.28 | 3.24 | 2.42 | 595.57 | 8.06 | ... | 12190.2 | 0.08 | 3.37 | 3.57 | 3.11 | 0.48 | 8.07 | 11.63 | 0.72 | 249.63 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 30.00 | 5.00 | 0.00 | 0.00 | 175.00 | 30.00 | ... | 1500.0 | 0.44 | 0.00 | 0.00 | 3.00 | 0.00 | 0.00 | -0.03 | -2.05 | 0.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 36.00 | 12.00 | 0.00 | 0.00 | 1928.00 | 38.00 | ... | 15428.0 | 0.62 | 7.00 | 7.00 | 7.50 | 0.00 | 4.00 | 13.03 | 0.82 | 16.00 |
| 50% | 1.00 | 288.00 | 0.00 | 1.00 | 43.00 | 12.00 | 1.62 | 0.00 | 2164.00 | 46.00 | ... | 20880.0 | 0.69 | 10.00 | 7.00 | 7.50 | 1.00 | 9.00 | 17.70 | 1.25 | 81.00 |
| 75% | 1.00 | 1516.00 | 0.00 | 2.00 | 49.00 | 13.00 | 3.79 | 3.58 | 2553.00 | 52.00 | ... | 28200.0 | 0.72 | 12.00 | 12.00 | 11.00 | 1.00 | 15.00 | 24.47 | 1.60 | 225.00 |
| max | 1.00 | 4950.00 | 3.00 | 8.00 | 60.00 | 17.00 | 25.00 | 9.98 | 5010.00 | 60.00 | ... | 96000.0 | 0.94 | 17.00 | 17.00 | 14.00 | 1.00 | 45.00 | 96.00 | 3.22 | 2025.00 |
8 rows × 22 columns
# 部分变量 + 部分统计量
vars = ['inlf', 'hours', 'educ'] # 变量列表
stats = ['count', 'mean', 'std', 'min', 'max'] # 统计量
mroz[vars].describe().loc[stats].round(2)| inlf | hours | educ | |
|---|---|---|---|
| count | 753.00 | 753.00 | 753.00 |
| mean | 0.57 | 740.58 | 12.29 |
| std | 0.50 | 871.31 | 2.28 |
| min | 0.00 | 0.00 | 5.00 |
| max | 1.00 | 4950.00 | 17.00 |
mroz.head(2)
mroz.info()
# Check the column names to ensure 'lnlf' exists
print(mroz.columns) # 变量列表
# If 'lnlf' does not exist, remove it from the selection
mroz[['educ', 'exper', 'age']].describe().round(2) # 统计描述
# 查看变量标签和数字-文字对应表
mroz.describe() # 查看数据的统计描述
mroz.attrs # 查看数据集的属性,包括变量标签等{}
# 在线读取 Stata 数据
df_dta = pd.read_stata('https://www.stata-press.com/data/r17/auto.dta')
df_dta.head()| make | price | mpg | rep78 | headroom | trunk | weight | length | turn | displacement | gear_ratio | foreign | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AMC Concord | 4099 | 22 | 3.0 | 2.5 | 11 | 2930 | 186 | 40 | 121 | 3.58 | Domestic |
| 1 | AMC Pacer | 4749 | 17 | 3.0 | 3.0 | 11 | 3350 | 173 | 40 | 258 | 2.53 | Domestic |
| 2 | AMC Spirit | 3799 | 22 | NaN | 3.0 | 12 | 2640 | 168 | 35 | 121 | 3.08 | Domestic |
| 3 | Buick Century | 4816 | 20 | 3.0 | 4.5 | 16 | 3250 | 196 | 40 | 196 | 2.93 | Domestic |
| 4 | Buick Electra | 7827 | 15 | 4.0 | 4.0 | 20 | 4080 | 222 | 43 | 350 | 2.41 | Domestic |
21.3 读取 R 文件
import pandas as pd
df = pd.read_r('data.rds')21.4 世界银行数据
pip install worldbankdatatransform