library(tidyverse)
library(broom)
# Read the wage subsample and derive the variables used by the regressions:
#   * `scl`, `shs`, `so` are renamed to `socl`, `sohs`, `sout`;
#   * `exp_dm` is `exp1` centred at its sample mean (NAs ignored in the mean);
#   * `female` is a copy of `sex`; `occ2` and `ind2` become factors;
#   * each listed control gets a mean-centred companion column `<name>_dm`.
data <- read_csv("wage2015_subsample_inference.csv") |>
  rename(socl = scl, sohs = shs, sout = so) |>
  mutate(
    exp_dm = exp1 - mean(exp1, na.rm = TRUE),
    female = sex,
    occ2 = factor(occ2),
    ind2 = factor(ind2),
    # Same centring rule for every control; column order below fixes the
    # order in which the `_dm` columns are appended.
    across(
      c(sohs, hsg, socl, clg, mw, sout, we),
      \(col) col - mean(col, na.rm = TRUE),
      .names = "{.col}_dm"
    )
  )
# Print a column-by-column overview of the prepared data.
# The console output from the original run is kept below as `#>` comments so
# that the script remains valid R.
glimpse(data)
#> Rows: 5,150
#> Columns: 30
#> $ rownames <dbl> 10, 12, 15, 18, 19, 30, 43, 44, 47, 71, 73, 77, 84, 89, 96, 1…
#> $ wage <dbl> 9.615385, 48.076923, 11.057692, 13.942308, 28.846154, 11.7307…
#> $ lwage <dbl> 2.263364, 3.872802, 2.403126, 2.634928, 3.361977, 2.462215, 2…
#> $ sex <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1…
#> $ sohs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ hsg <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0…
#> $ socl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1…
#> $ clg <dbl> 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
#> $ ad <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ mw <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ sout <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ we <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ ne <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ exp1 <dbl> 7.0, 31.0, 18.0, 25.0, 22.0, 1.0, 42.0, 37.0, 31.0, 4.0, 7.0,…
#> $ exp2 <dbl> 0.4900, 9.6100, 3.2400, 6.2500, 4.8400, 0.0100, 17.6400, 13.6…
#> $ exp3 <dbl> 0.343000, 29.791000, 5.832000, 15.625000, 10.648000, 0.001000…
#> $ exp4 <dbl> 0.24010000, 92.35210000, 10.49760000, 39.06250000, 23.4256000…
#> $ occ <dbl> 3600, 3050, 6260, 420, 2015, 1650, 5120, 5240, 4040, 3255, 40…
#> $ occ2 <fct> 11, 10, 19, 1, 6, 5, 17, 17, 13, 10, 13, 14, 11, 11, 1, 19, 1…
#> $ ind <dbl> 8370, 5070, 770, 6990, 9470, 7460, 7280, 5680, 8590, 8190, 82…
#> $ ind2 <fct> 18, 9, 4, 12, 22, 14, 14, 9, 19, 18, 18, 18, 18, 18, 17, 4, 4…
#> $ exp_dm <dbl> -6.760583, 17.239417, 4.239417, 11.239417, 8.239417, -12.7605…
#> $ female <dbl> 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1…
#> $ sohs_dm <dbl> -0.02330097, -0.02330097, -0.02330097, -0.02330097, -0.023300…
#> $ hsg_dm <dbl> -0.2438835, -0.2438835, 0.7561165, -0.2438835, -0.2438835, -0…
#> $ socl_dm <dbl> -0.2780583, -0.2780583, -0.2780583, -0.2780583, -0.2780583, -…
#> $ clg_dm <dbl> 0.6823301, 0.6823301, -0.3176699, -0.3176699, 0.6823301, 0.68…
#> $ mw_dm <dbl> -0.2596117, -0.2596117, -0.2596117, -0.2596117, -0.2596117, -…
#> $ sout_dm <dbl> -0.2965049, -0.2965049, -0.2965049, -0.2965049, -0.2965049, -…
#> $ we_dm <dbl> -0.2161165, -0.2161165, -0.2161165, -0.2161165, -0.2161165, -…
# construct matrices for estimation from the data
# 1. basic model
# Regress `lwage` on `female` alone (plus the implicit intercept), then print
# the coefficient table. The console output from the original run is kept as
# `#>` comments so that the script remains valid R.
reg1 <- lm(lwage ~ female, data = data)
tidy(reg1)
#> # A tibble: 2 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 2.99 0.0107 280. 0
#> 2 female -0.0383 0.0160 -2.40 0.0165
# Interacted model: `female * (...)` expands to the `female` main effect, the
# main effect of every `*_dm` control, and each female-by-control interaction
# (18 terms including the intercept, as listed in the output below).
# The controls enter as their mean-centred `*_dm` versions, so the `female`
# coefficient is the gap evaluated where every `*_dm` control equals zero,
# i.e. at the sample means of the underlying controls.
# The console output from the original run is kept as `#>` comments so that
# the script remains valid R.
reg2 <- lm(
  lwage ~ female * (
    exp_dm + sohs_dm + hsg_dm + socl_dm + clg_dm + mw_dm + sout_dm + we_dm
  ),
  data = data
)
tidy(reg2)
#> # A tibble: 18 × 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 3.02 0.00974 310. 0
#> 2 female -0.115 0.0147 -7.80 7.39e-15
#> 3 exp_dm 0.00801 0.000957 8.37 7.31e-17
#> 4 sohs_dm -0.811 0.0620 -13.1 1.63e-38
#> 5 hsg_dm -0.706 0.0350 -20.1 6.09e-87
#> 6 socl_dm -0.553 0.0351 -15.7 1.45e-54
#> 7 clg_dm -0.256 0.0345 -7.41 1.42e-13
#> 8 mw_dm 0.0337 0.0280 1.20 2.29e- 1
#> 9 sout_dm 0.0148 0.0271 0.548 5.84e- 1
#> 10 we_dm 0.0555 0.0290 1.91 5.59e- 2
#> 11 female:exp_dm 0.00212 0.00140 1.51 1.30e- 1
#> 12 female:sohs_dm -0.0180 0.117 -0.154 8.78e- 1
#> 13 female:hsg_dm 0.0398 0.0507 0.786 4.32e- 1
#> 14 female:socl_dm 0.0466 0.0482 0.968 3.33e- 1
#> 15 female:clg_dm 0.0992 0.0468 2.12 3.41e- 2
#> 16 female:mw_dm -0.124 0.0417 -2.98 2.93e- 3
#> 17 female:sout_dm -0.0530 0.0403 -1.31 1.89e- 1
#> 18 female:we_dm -0.0655 0.0435 -1.51 1.32e- 1