Capstone Project: Does home ownership lead to greater optimism for one’s future

Importing the Data

The following code details how we imported, extracted relevant variables, recoded these variables to suit our analysis, and combined them into one dataset named capstone_data.csv.

This code was separated from the main document as it is a process that only needs to be performed once.

# Load necessary packages
library(tidyverse)
library(haven) # Package for reading Stata (DTA) format.
library(skimr)
rm(list = ls())

## Import individual datasets

# Import Demographics variable: marital status, gender, age from b3a_cov.dta
# Common link: pidlink
b3a_cov <- read_dta("b3a_cov.dta")
b3a_cov_reduced <-
  b3a_cov %>%
  select(1, 7:9, 32)

# Import Home Ownership Status kr03

b2_kr <- read_dta("b2_kr.dta")
b2_kr_reduced <-
  b2_kr %>%
  select(1:2)
 
# Import Highest Education Level (dl06) and Ethinic group (dl01f)

b3a_dl1 <- read_dta("b3a_dl1.dta")
b3a_dl1_reduced <-
  b3a_dl1 %>%
  select(1, 4, 17, 47)

# Import life satisfaction, current economic outlook, and outlook for future
# sw00, sw01, sw03

b3a_sw <- read_dta("b3a_sw.dta")
b3a_sw_reduced <-
  b3a_sw %>%
  select(1, 3:4, 6,19) #joined

# Import employment tk01a

b3a_tk1 <- read_dta("b3a_tk1.dta")
b3a_tk1_reduced <-
  b3a_tk1 %>%
  select(1, 3, 41) #joined


# Import Religion tr11

b3a_tr <- read_dta("b3a_tr.dta")
b3a_tr_reduced <-
  b3a_tr %>%
  select(1, 16, 40)

# Import Physical Health Status kk01

b3b_kk1 <- read_dta("b3b_kk1.dta")
b3b_kk1_reduced <-
  b3b_kk1 %>%
  select(1, 3, 13)

Here we combine the separate datasets into 1. This was done in a step-wise process, at each step checking for integrity.

# Combining data
# We will mutate and rename variables later

# # Data combined: age, maritial status, sex, ethnicity, education
# # Ethnicity added 20 Dec
data_combined <-
  full_join(b3a_cov_reduced, b3a_dl1_reduced, by = "pidlink") %>%
  select(-hhid14_9.y) %>%
  rename(hhid14_9 = hhid14_9.x)

# # Data combined_1: age, maritial status, sex, ethnicity, education
# # Add: sw00, sw01, sw03
data_combined_1 <-
  full_join(data_combined,b3a_sw_reduced, by = "pidlink") %>%
  select(-hhid14_9.y) %>%
  rename(hhid14_9 = hhid14_9.x)

# # Data_combined_2: age, maritial status, sex, education, sw00, sw01, sw03
# # Add: employment tk01a

data_combined_2 <-
  full_join(data_combined_1, b3a_tk1_reduced, by = "pidlink") %>%
  select(-hhid14_9.y) %>%
  rename(hhid14_9 = hhid14_9.x)

# # Data_combined_3: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a
# # Add religion tr11

data_combined_3 <-
  full_join (data_combined_2, b3a_tr_reduced, by = "pidlink") %>%
  select(-hhid14_9.y) %>%
  rename(hhid14_9 = hhid14_9.x)

# # Data_combined_4: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a, religion tr11, 
# # Add health kk01

data_combined_4 <-
  full_join (data_combined_3, b3b_kk1_reduced, by = "pidlink") %>%
  select(-hhid14_9.y) %>%
  rename(hhid14_9 = hhid14_9.x)

# # Data_combined_5: age, maritial status, sex, education, sw00, sw01, sw03, employment tk01a, religion tr11, health kk01
# # Add: home ownership kr03

data_combined_5 <-
  full_join(data_combined_4, b2_kr_reduced, by = "hhid14_9") # note joined by hhid14_9 (household level) instead of pidlink (individual level)

# data_combined_5 is the final data set comprising:
# age, maritial status, sex, education, sw00, sw01, sw03,
# employment tk01a, religion tr11, health kk01, home ownership kr03
# 
skim(data_combined_5)
(#tab:Combining Dataset)Data summary
Name data_combined_5
Number of rows 36581
Number of columns 14
_______________________
Column type frequency:
character 3
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
hhid14_9 0 1.00 9 9 0 15350 0
pidlink 190 0.99 8 9 0 36391 0
dl01f 2117 0.94 0 5 2804 85 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 196 0.99 38.50 18.92 14 26 35 48 998 ▇▁▁▁▁
marstat 196 0.99 2.05 0.94 1 2 2 2 6 ▇▁▁▁▁
sex 196 0.99 2.03 1.00 1 1 3 3 3 ▇▁▁▁▇
dl06 3904 0.89 17.07 25.52 2 2 5 6 99 ▇▁▁▂▁
sw00 4926 0.87 2.68 0.81 1 2 3 3 9 ▆▇▁▁▁
sw01 4926 0.87 3.05 1.01 1 3 3 4 9 ▃▇▁▁▁
sw03 4926 0.87 4.36 1.49 1 3 4 5 9 ▂▇▅▂▁
tk01a 2148 0.94 1.69 0.95 1 1 1 3 8 ▇▅▁▁▁
tr11 4987 0.86 2.10 0.72 1 2 2 2 7 ▇▂▁▁▁
kk01 2310 0.94 2.05 0.68 1 2 2 2 4 ▂▇▁▃▁
kr03 441 0.99 1.71 4.33 1 1 1 1 95 ▇▁▁▁▁

In this next step we recoded the variables to suit our analysis.

# This next section will focusing recoding the data. Renaming each variable will be performed in the main script.

# Recoding the data ----
names(data_combined_5)
 [1] "hhid14_9" "age"      "marstat"  "sex"      "pidlink"  "dl01f"   
 [7] "dl06"     "sw00"     "sw01"     "sw03"     "tk01a"    "tr11"    
[13] "kk01"     "kr03"    
# Please see below for a brief description of each variable.
data_recoded <-
  data_combined_5 %>%
  mutate(sw00 = as.numeric(case_when(sw00 == 5 ~ "1", # Recode sw00. 1: Not at all satisfied
                          sw00 == 4 ~ "2",
                          sw00 == 3 ~ "3",
                          sw00 == 2 ~ "4",
                          sw00 == 1 ~ "5", #5 will be "completely satisfied"
                          sw00 == 9 ~ NA_character_)
                          ),
         sw01 = as.numeric(case_when(sw01 == 8 ~ NA_character_, # 8: (172) dont know, coded as NA to be removed later
                                     sw01 == 9 ~ NA_character_, # 9: (2)maybe data entry error, found a couple of 9
                                     .default = as.character(sw01)
                                     )
                           ),
         sw03 = as.numeric(case_when(sw03 == 8 ~ NA_character_, # 8: (1882)dont know, coded as NA to be removed later
                                     sw03 == 9 ~ NA_character_, # 9: (3) maybe data entry error, found a couple of 9
                                     .default = as.character(sw03)
                                     )
                           ),
         kr03 = as.factor(case_when(kr03 == 1 ~ "1", #own
                                    kr03 == 2 ~ "0", # occupying
                                    kr03 == 5 ~ "0", # rented
                                    kr03 == 95 ~ "0" # others
                                    )
                          ),
         tk01a = as.factor(case_when(tk01a == 1 ~ "1", # employed
                                     tk01a == 3 ~ "0", # unemployed
                                     tk01a == 8 ~ "0" # prefer not to say
                                     )
                           ),
         # tr11: Respondents are asked to rate how religious they see themselves
         tr11 = as.factor(case_when(tr11 == 7 ~ "0", # refused to say
                                    tr11 == 4 ~ "1", # not religious
                                    tr11 == 3 ~ "2", # somewhat religious
                                    tr11 == 2 ~ "3", # religious
                                    tr11 == 1 ~ "4" # very religious
                                    )
                          ),
         kk01 = as.factor(case_when(kk01 == 4 ~ "1", # unhealthy
                                    kk01 == 3 ~ "2", # somewhat unhealthy
                                    kk01 == 2 ~ "3", # somewhat healthy
                                    kk01 == 1 ~ "4" # healthy
                                    )
                          ), # Education recoded to 5 broad levels.
         dl06 = as.factor(case_when(dl06 == 99 ~ "0", # dont know, others, missing, school for disabled
                                    dl06 == 98 ~ "0",
                                    dl06 == 95 ~ "0",
                                    dl06 == 17 ~ "0",
                                    dl06 == 90 ~ "1", # kindergarten, elementary, islamic elementary
                                    dl06 == 2 ~ "1",
                                    dl06 == 72 ~ "1",
                                    dl06 == 3 ~ "2", # junior high, jh vocational, islamic jh
                                    dl06 == 4 ~ "2",
                                    dl06 == 73 ~ "2",
                                    dl06 == 5 ~ "3", # senior high, sh vocational, islamic sh
                                    dl06 == 6 ~ "3",
                                    dl06 == 74 ~ "3",
                                    dl06 == 60 ~ "4", # college, university (under, master, doctorate)
                                    dl06 == 61 ~ "4",
                                    dl06 == 62 ~ "4",
                                    dl06 == 63 ~ "4",
                                    dl06 == 11 ~ "5", # adult ed, open uni, islamic school
                                    dl06 == 12 ~ "5",
                                    dl06 == 13 ~ "5",
                                    dl06 == 14 ~ "5",
                                    dl06 == 15 ~ "5"
                                    )
                          ),
         age = as.numeric(age), # age as numeric
         marstat = as.factor(marstat), # marital status as factor
         female = as.factor(case_when(sex == 1 ~ "0", # sex=1 means male, 3=female, so i made male the baseline
                                   sex == 3 ~ "1"
                                   )
                         ),
         fo01 = sw03 - sw01 # Outlook score
         )
glimpse(data_recoded)
Rows: 36,581
Columns: 16
$ hhid14_9 <chr> "001060000", "001060004", "001060000", "001060000",…
$ age      <dbl> 59, 28, 39, 16, 30, 36, 26, 40, 55, 54, 34, 28, 24,…
$ marstat  <fct> 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 5, …
$ sex      <dbl+lbl> 1, 3, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 3,…
$ pidlink  <chr> "001060001", "001060004", "001060007", "001060008",…
$ dl01f    <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
$ dl06     <fct> 1, 1, 1, 2, 2, 1, 1, NA, 1, 1, 4, 4, 4, 4, 1, 2, 1,…
$ sw00     <dbl> 1, 3, 2, 3, 2, 3, 3, 3, 2, 3, 1, 2, 2, 2, 2, 4, 3, …
$ sw01     <dbl> 3, 2, 3, 3, 2, 2, 2, 2, 4, 3, 2, 2, 1, 3, 2, 3, 3, …
$ sw03     <dbl> 4, 2, 3, 3, 2, 2, 3, 2, NA, 2, 2, 3, 5, 5, 3, 5, NA…
$ tk01a    <fct> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, …
$ tr11     <fct> 3, 2, 4, 3, 3, 2, 4, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, …
$ kk01     <fct> 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, …
$ kr03     <fct> 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, …
$ female   <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fo01     <dbl> 1, 0, 0, 0, 0, 0, 1, 0, NA, -1, 0, 1, 4, 2, 1, 2, N…
# DATA WRANGLING COMPLETE 
# saved as csv file. 

#write_csv(data_recoded, "capstone_data.csv")

Again, the above processes need only be done once. Thereafter, the dataset should be imported for analysis using a simple read_csv(capstone_data.csv) command. I’m sure you’d like to return to the main document now and read all about our analysis. You may do so by clicking on this link