The following code details how we imported, extracted relevant variables, recoded these variables to suit our analysis, and combined them into one dataset named capstone_data.csv
.
This code was separated from the main document as it is a process that only needs to be performed once.
# Load necessary packages
library(tidyverse)
library(haven) # Package for reading Stata (DTA) format.
library(skimr)
rm(list = ls())
## Import individual datasets
# Import Demographics variable: marital status, gender, age from b3a_cov.dta
# Common link: pidlink
b3a_cov <- read_dta("b3a_cov.dta")
b3a_cov_reduced <-
b3a_cov %>%
select(1, 7:9, 32)
# Import Home Ownership Status kr03
b2_kr <- read_dta("b2_kr.dta")
b2_kr_reduced <-
b2_kr %>%
select(1:2)
# Import Highest Education Level (dl06) and Ethinic group (dl01f)
b3a_dl1 <- read_dta("b3a_dl1.dta")
b3a_dl1_reduced <-
b3a_dl1 %>%
select(1, 4, 17, 47)
# Import life satisfaction, current economic outlook, and outlook for future
# sw00, sw01, sw03
b3a_sw <- read_dta("b3a_sw.dta")
b3a_sw_reduced <-
b3a_sw %>%
select(1, 3:4, 6,19) #joined
# Import employment tk01a
b3a_tk1 <- read_dta("b3a_tk1.dta")
b3a_tk1_reduced <-
b3a_tk1 %>%
select(1, 3, 41) #joined
# Import Religion tr11
b3a_tr <- read_dta("b3a_tr.dta")
b3a_tr_reduced <-
b3a_tr %>%
select(1, 16, 40)
# Import Physical Health Status kk01
b3b_kk1 <- read_dta("b3b_kk1.dta")
b3b_kk1_reduced <-
b3b_kk1 %>%
select(1, 3, 13)
Here we combine the separate datasets into 1. This was done in a step-wise process, at each step checking for integrity.
# Combining data
# We will mutate and rename variables later
# # Data combined: age, maritial status, sex, ethnicity, education
# # Ethnicity added 20 Dec
data_combined <-
full_join(b3a_cov_reduced, b3a_dl1_reduced, by = "pidlink") %>%
select(-hhid14_9.y) %>%
rename(hhid14_9 = hhid14_9.x)
# # Data combined_1: age, maritial status, sex, ethnicity, education
# # Add: sw00, sw01, sw03
data_combined_1 <-
full_join(data_combined,b3a_sw_reduced, by = "pidlink") %>%
select(-hhid14_9.y) %>%
rename(hhid14_9 = hhid14_9.x)
# # Data_combined_2: age, maritial status, sex, education, sw00, sw01, sw03
# # Add: employment tk01a
data_combined_2 <-
full_join(data_combined_1, b3a_tk1_reduced, by = "pidlink") %>%
select(-hhid14_9.y) %>%
rename(hhid14_9 = hhid14_9.x)
# # Data_combined_3: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a
# # Add religion tr11
data_combined_3 <-
full_join (data_combined_2, b3a_tr_reduced, by = "pidlink") %>%
select(-hhid14_9.y) %>%
rename(hhid14_9 = hhid14_9.x)
# # Data_combined_4: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a, religion tr11,
# # Add health kk01
data_combined_4 <-
full_join (data_combined_3, b3b_kk1_reduced, by = "pidlink") %>%
select(-hhid14_9.y) %>%
rename(hhid14_9 = hhid14_9.x)
# # Data_combined_5: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a, religion tr11, health kk01
# # Add: home ownership kr03
data_combined_5 <-
full_join(data_combined_4, b2_kr_reduced, by = "hhid14_9") # note joined by hhid14_9 (household level) instead of pidlink (individual level)
# data_combined_5 is the final data set comprising:
# age, maritial status, sex, education, sw00, sw01, sw03,
# employment tk01a, religion tr11, health kk01, home ownership kr03
#
skim(data_combined_5)
Name | data_combined_5 |
Number of rows | 36581 |
Number of columns | 14 |
_______________________ | |
Column type frequency: | |
character | 3 |
numeric | 11 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
hhid14_9 | 0 | 1.00 | 9 | 9 | 0 | 15350 | 0 |
pidlink | 190 | 0.99 | 8 | 9 | 0 | 36391 | 0 |
dl01f | 2117 | 0.94 | 0 | 5 | 2804 | 85 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
age | 196 | 0.99 | 38.50 | 18.92 | 14 | 26 | 35 | 48 | 998 | ▇▁▁▁▁ |
marstat | 196 | 0.99 | 2.05 | 0.94 | 1 | 2 | 2 | 2 | 6 | ▇▁▁▁▁ |
sex | 196 | 0.99 | 2.03 | 1.00 | 1 | 1 | 3 | 3 | 3 | ▇▁▁▁▇ |
dl06 | 3904 | 0.89 | 17.07 | 25.52 | 2 | 2 | 5 | 6 | 99 | ▇▁▁▂▁ |
sw00 | 4926 | 0.87 | 2.68 | 0.81 | 1 | 2 | 3 | 3 | 9 | ▆▇▁▁▁ |
sw01 | 4926 | 0.87 | 3.05 | 1.01 | 1 | 3 | 3 | 4 | 9 | ▃▇▁▁▁ |
sw03 | 4926 | 0.87 | 4.36 | 1.49 | 1 | 3 | 4 | 5 | 9 | ▂▇▅▂▁ |
tk01a | 2148 | 0.94 | 1.69 | 0.95 | 1 | 1 | 1 | 3 | 8 | ▇▅▁▁▁ |
tr11 | 4987 | 0.86 | 2.10 | 0.72 | 1 | 2 | 2 | 2 | 7 | ▇▂▁▁▁ |
kk01 | 2310 | 0.94 | 2.05 | 0.68 | 1 | 2 | 2 | 2 | 4 | ▂▇▁▃▁ |
kr03 | 441 | 0.99 | 1.71 | 4.33 | 1 | 1 | 1 | 1 | 95 | ▇▁▁▁▁ |
In this next step we recoded the variables to suit our analysis.
# This next section will focusing recoding the data. Renaming each variable will be performed in the main script.
# Recoding the data ----
names(data_combined_5)
[1] "hhid14_9" "age" "marstat" "sex" "pidlink" "dl01f"
[7] "dl06" "sw00" "sw01" "sw03" "tk01a" "tr11"
[13] "kk01" "kr03"
# Please see below for a brief description of each variable.
data_recoded <-
data_combined_5 %>%
mutate(sw00 = as.numeric(case_when(sw00 == 5 ~ "1", # Recode sw00. 1: Not at all satisfied
sw00 == 4 ~ "2",
sw00 == 3 ~ "3",
sw00 == 2 ~ "4",
sw00 == 1 ~ "5", #5 will be "completely satisfied"
sw00 == 9 ~ NA_character_)
),
sw01 = as.numeric(case_when(sw01 == 8 ~ NA_character_, # 8: (172) dont know, coded as NA to be removed later
sw01 == 9 ~ NA_character_, # 9: (2)maybe data entry error, found a couple of 9
.default = as.character(sw01)
)
),
sw03 = as.numeric(case_when(sw03 == 8 ~ NA_character_, # 8: (1882)dont know, coded as NA to be removed later
sw03 == 9 ~ NA_character_, # 9: (3) maybe data entry error, found a couple of 9
.default = as.character(sw03)
)
),
kr03 = as.factor(case_when(kr03 == 1 ~ "1", #own
kr03 == 2 ~ "0", # occupying
kr03 == 5 ~ "0", # rented
kr03 == 95 ~ "0" # others
)
),
tk01a = as.factor(case_when(tk01a == 1 ~ "1", # employed
tk01a == 3 ~ "0", # unemployed
tk01a == 8 ~ "0" # prefer not to say
)
),
# tr11: Respondents are asked to rate how religious they see themselves
tr11 = as.factor(case_when(tr11 == 7 ~ "0", # refused to say
tr11 == 4 ~ "1", # not religious
tr11 == 3 ~ "2", # somewhat religious
tr11 == 2 ~ "3", # religious
tr11 == 1 ~ "4" # very religious
)
),
kk01 = as.factor(case_when(kk01 == 4 ~ "1", # unhealthy
kk01 == 3 ~ "2", # somewhat unhealthy
kk01 == 2 ~ "3", # somewhat healthy
kk01 == 1 ~ "4" # healthy
)
), # Education recoded to 5 broad levels.
dl06 = as.factor(case_when(dl06 == 99 ~ "0", # dont know, others, missing, school for disabled
dl06 == 98 ~ "0",
dl06 == 95 ~ "0",
dl06 == 17 ~ "0",
dl06 == 90 ~ "1", # kindergarten, elementary, islamic elementary
dl06 == 2 ~ "1",
dl06 == 72 ~ "1",
dl06 == 3 ~ "2", # junior high, jh vocational, islamic jh
dl06 == 4 ~ "2",
dl06 == 73 ~ "2",
dl06 == 5 ~ "3", # senior high, sh vocational, islamic sh
dl06 == 6 ~ "3",
dl06 == 74 ~ "3",
dl06 == 60 ~ "4", # college, university (under, master, doctorate)
dl06 == 61 ~ "4",
dl06 == 62 ~ "4",
dl06 == 63 ~ "4",
dl06 == 11 ~ "5", # adult ed, open uni, islamic school
dl06 == 12 ~ "5",
dl06 == 13 ~ "5",
dl06 == 14 ~ "5",
dl06 == 15 ~ "5"
)
),
age = as.numeric(age), # age as numeric
marstat = as.factor(marstat), # marital status as factor
female = as.factor(case_when(sex == 1 ~ "0", # sex=1 means male, 3=female, so i made male the baseline
sex == 3 ~ "1"
)
),
fo01 = sw03 - sw01 # Outlook score
)
glimpse(data_recoded)
Rows: 36,581
Columns: 16
$ hhid14_9 <chr> "001060000", "001060004", "001060000", "001060000",…
$ age <dbl> 59, 28, 39, 16, 30, 36, 26, 40, 55, 54, 34, 28, 24,…
$ marstat <fct> 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 5, …
$ sex <dbl+lbl> 1, 3, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 3,…
$ pidlink <chr> "001060001", "001060004", "001060007", "001060008",…
$ dl01f <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
$ dl06 <fct> 1, 1, 1, 2, 2, 1, 1, NA, 1, 1, 4, 4, 4, 4, 1, 2, 1,…
$ sw00 <dbl> 1, 3, 2, 3, 2, 3, 3, 3, 2, 3, 1, 2, 2, 2, 2, 4, 3, …
$ sw01 <dbl> 3, 2, 3, 3, 2, 2, 2, 2, 4, 3, 2, 2, 1, 3, 2, 3, 3, …
$ sw03 <dbl> 4, 2, 3, 3, 2, 2, 3, 2, NA, 2, 2, 3, 5, 5, 3, 5, NA…
$ tk01a <fct> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, …
$ tr11 <fct> 3, 2, 4, 3, 3, 2, 4, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, …
$ kk01 <fct> 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, …
$ kr03 <fct> 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, …
$ female <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fo01 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, NA, -1, 0, 1, 4, 2, 1, 2, N…
# DATA WRANGLING COMPLETE
# saved as csv file.
#write_csv(data_recoded, "capstone_data.csv")
Again, the above processes need only be done once. Thereafter, the dataset should be imported for analysis using a simple read_csv(capstone_data.csv)
command. I’m sure you’d like to return to the main document now and read all about our analysis. You may do so by clicking on this link