# Expand for full data preparation code
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS
# --- Load the raw CSV (cps09mar.csv) straight from the course repository ---
data_path = (
    "https://github.com/pegeorge/Econ521_Datasets/"
    "raw/refs/heads/main/cps09mar.csv"
)
cps_data = pd.read_csv(data_path)

# --- Derived columns (added in place, in this order) ---
# experience        = age - education - 6
# experience_sq_div = experience squared, scaled down by 100
# wage              = earnings / (week * hours)
# log_wage          = natural log of wage
cps_data["experience"] = cps_data["age"].sub(cps_data["education"]).sub(6)
cps_data["experience_sq_div"] = cps_data["experience"].pow(2).div(100)
cps_data["wage"] = cps_data["earnings"].div(
    cps_data["week"].mul(cps_data["hours"])
)
cps_data["log_wage"] = np.log(cps_data["wage"])

# --- Sample restriction ---
# Keep only white married women whose spouse is present, expressed in the
# data as marital <= 2, race == 1 and female == 1.
keep_rows = (
    cps_data["marital"].le(2)
    & cps_data["race"].eq(1)
    & cps_data["female"].eq(1)
)
select_data = cps_data.loc[keep_rows]

# --- Regression inputs ---
# exog: the three regressors plus the constant column added by statsmodels;
# endog: the log-wage outcome.
exog = sm.add_constant(
    select_data[["education", "experience", "experience_sq_div"]]
)
endog = select_data["log_wage"]