This week on RCloud: https://rstudio.cloud/project/976166
Datasets for this class:
A random sample of 1,000 federal personnel records for March 1994:
Set the working directory and install packages if necessary:
# Uncomment on first use: set the working directory and install the packages.
# setwd(".")
# install.packages("dplyr")
# install.packages("knitr")

# Attach the packages, one per line.
library(dplyr)
library(knitr)

# Load the saved data file, then list the column names of opm94.
load("Datasets/OPM94.RData")
names(opm94)
## [1] "x" "sal" "grade" "patco" "major" "age"
## [7] "male" "vet" "handvet" "hand" "yos" "edyrs"
## [13] "promo" "exit" "supmgr" "race" "minority" "grade4"
## [19] "promo01" "supmgr01" "male01" "exit01" "vet01"
# Compact overview of opm94: dimensions, column types, and the first values.
str(opm94)
## 'data.frame': 1000 obs. of 23 variables:
## $ x : int 1 2 3 4 5 6 7 8 9 10 ...
## $ sal : int 26045 37651 64926 18588 19573 28648 27805 16560 40440 24285 ...
## $ grade : int 7 9 14 4 3 9 7 3 11 6 ...
## $ patco : Factor w/ 5 levels "Administrative",..: 1 4 4 2 2 4 5 2 1 2 ...
## $ major : Factor w/ 23 levels " ","AGRIC",..: 16 11 10 1 1 11 1 1 1 6 ...
## $ age : int 52 34 37 26 51 44 50 37 59 57 ...
## $ male : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...
## $ vet : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
## $ handvet : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ hand : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ yos : int 6 4 3 6 14 1 7 5 13 6 ...
## $ edyrs : int 16 16 16 12 12 16 14 12 12 14 ...
## $ promo : Factor w/ 2 levels "no","yes": 2 1 1 1 NA 1 1 1 1 1 ...
## $ exit : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ supmgr : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ race : Factor w/ 5 levels "American Indian",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ minority: int 1 1 1 1 1 1 1 1 1 1 ...
## $ grade4 : Factor w/ 4 levels "grades 1 to 4",..: 3 4 2 1 1 4 3 1 4 3 ...
## $ promo01 : num 1 0 0 0 NA 0 0 0 0 0 ...
## $ supmgr01: num 0 0 0 0 0 0 0 0 0 0 ...
## $ male01 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exit01 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ vet01 : num 0 0 0 0 0 0 0 0 1 0 ...
To see how changing the units of measurement affects the regression coefficient and the correlation coefficient, we are going to create a new variable (age_months) that measures age in months instead of years.
# Re-express age (years) in months: a pure change of units.
opm94 <- mutate(opm94, age_months = 12 * age)

# Correlation matrix of the selected numeric columns, rounded to two decimals.
# "pairwise.complete.obs" drops missing values pair by pair, not listwise.
opm94 %>%
  select(sal, grade, edyrs, yos, age, age_months, male01, minority) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(digits = 2)
## sal grade edyrs yos age age_months male01 minority
## sal 1.00 0.91 0.59 0.40 0.29 0.29 0.36 -0.23
## grade 0.91 1.00 0.61 0.31 0.19 0.19 0.35 -0.23
## edyrs 0.59 0.61 1.00 0.01 0.08 0.08 0.31 -0.15
## yos 0.40 0.31 0.01 1.00 0.62 0.62 0.08 -0.13
## age 0.29 0.19 0.08 0.62 1.00 1.00 0.09 -0.15
## age_months 0.29 0.19 0.08 0.62 1.00 1.00 0.09 -0.15
## male01 0.36 0.35 0.31 0.08 0.09 0.09 1.00 -0.12
## minority -0.23 -0.23 -0.15 -0.13 -0.15 -0.15 -0.12 1.00
# Bivariate regression of sal on grade.
lm(sal ~ grade, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = sal ~ grade, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12775 -4778 -505 3413 45197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5132.8 698.5 -7.348 4.19e-13 ***
## grade 4779.0 68.6 69.662 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7292 on 993 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8301, Adjusted R-squared: 0.83
## F-statistic: 4853 on 1 and 993 DF, p-value: < 2.2e-16
# Bivariate regression of grade on yos.
lm(grade ~ yos, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = grade ~ yos, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.252 -2.833 0.527 2.684 6.539
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.87967 0.19747 39.90 <2e-16 ***
## yos 0.11629 0.01144 10.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.21 on 998 degrees of freedom
## Multiple R-squared: 0.09387, Adjusted R-squared: 0.09296
## F-statistic: 103.4 on 1 and 998 DF, p-value: < 2.2e-16
# Bivariate regression of grade on edyrs.
lm(grade ~ edyrs, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = grade ~ edyrs, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0775 -2.0775 -0.0775 1.9225 7.5345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.37071 0.54503 -6.184 9.08e-10 ***
## edyrs 0.90301 0.03748 24.095 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.681 on 998 degrees of freedom
## Multiple R-squared: 0.3678, Adjusted R-squared: 0.3671
## F-statistic: 580.6 on 1 and 998 DF, p-value: < 2.2e-16
# Bivariate regression of grade on age (in years).
lm(grade ~ age, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = grade ~ age, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.7443 -2.9733 0.9045 2.7595 6.2099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.92001 0.46168 14.989 < 2e-16 ***
## age 0.06107 0.01024 5.965 3.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.313 on 998 degrees of freedom
## Multiple R-squared: 0.03442, Adjusted R-squared: 0.03346
## F-statistic: 35.58 on 1 and 998 DF, p-value: 3.395e-09
# Bivariate regression of yos on age (in years); baseline for the
# units-of-measurement comparison with age_months.
lm(yos ~ age, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = yos ~ age, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2467 -4.3889 0.2288 4.9875 16.6804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.85485 0.96979 -9.131 <2e-16 ***
## age 0.53883 0.02151 25.056 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.96 on 998 degrees of freedom
## Multiple R-squared: 0.3861, Adjusted R-squared: 0.3855
## F-statistic: 627.8 on 1 and 998 DF, p-value: < 2.2e-16
# Same model with age measured in months (age_months = age * 12): the slope
# and its standard error are divided by 12; t, R-squared and residuals are not
# affected by the change of units.
lm(yos ~ age_months, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = yos ~ age_months, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2467 -4.3889 0.2288 4.9875 16.6804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.854846 0.969789 -9.131 <2e-16 ***
## age_months 0.044902 0.001792 25.056 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.96 on 998 degrees of freedom
## Multiple R-squared: 0.3861, Adjusted R-squared: 0.3855
## F-statistic: 627.8 on 1 and 998 DF, p-value: < 2.2e-16
# Regression of sal on the numeric dummy male01.
lm(sal ~ male01, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = sal ~ male01, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31945 -11537 -3092 9591 71883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34222.8 749.9 45.64 <2e-16 ***
## male01 12776.6 1046.3 12.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16500 on 993 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.1305, Adjusted R-squared: 0.1297
## F-statistic: 149.1 on 1 and 993 DF, p-value: < 2.2e-16
Create a dummy variable female01
(female = 1, male = 0)
# Reverse-coded dummy: 1 where male01 is 0, 0 otherwise; NA stays NA.
opm94 <- mutate(opm94, female01 = as.numeric(male01 == 0))

# Regression of sal on the reverse-coded dummy.
lm(sal ~ female01, data = opm94) %>%
  summary()
##
## Call:
## lm(formula = sal ~ female01, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31945 -11537 -3092 9591 71883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46999.4 729.8 64.40 <2e-16 ***
## female01 -12776.6 1046.3 -12.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16500 on 993 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.1305, Adjusted R-squared: 0.1297
## F-statistic: 149.1 on 1 and 993 DF, p-value: < 2.2e-16
Mean salaries for males and females (each group mean equals the intercept of the regression in which that group is coded 0):
# Mean of sal within each level of male, ignoring missing salaries.
opm94 %>%
  group_by(male) %>%
  summarise(Mean_Salary = mean(sal, na.rm = TRUE))
## # A tibble: 2 x 2
## male Mean_Salary
## <fct> <dbl>
## 1 female 34223.
## 2 male 46999.
Yuriy Davydenko 2020