maRk's blog: Capstone Project: Does home ownership lead to greater optimism for one's future

Importing the Data

The following code details how we imported the individual Stata (.dta) files, extracted the relevant variables, combined them into one dataset, and recoded the variables to suit our analysis. The result is saved as capstone_data.csv.

This code was separated from the main document as it is a process that only needs to be performed once.

# Load necessary packages
library(tidyverse) # Data manipulation: select(), mutate(), joins, %>%
library(haven) # Package for reading Stata (DTA) format.
library(skimr) # skim() summary of the combined data
# NOTE: the workspace is deliberately not cleared with rm(list = ls()).
# Every object used below is created by this script, and wiping the global
# environment would delete unrelated objects of whoever sources this file.
# Restart the R session instead if a clean workspace is needed.

## Import individual datasets

# Import demographic variables (age, marital status, sex) from b3a_cov.dta
# Common link: pidlink
# Columns are selected by name rather than by position, so the extract fails
# loudly instead of silently picking the wrong variables if the column layout
# of the file ever differs.
b3a_cov <- read_dta("b3a_cov.dta")
b3a_cov_reduced <-
  b3a_cov %>%
  select(hhid14_9, age, marstat, sex, pidlink)

# Import home ownership status (kr03) from b2_kr.dta
# This is a household-level file, so it is keyed by hhid14_9, not pidlink.
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b2_kr <- read_dta("b2_kr.dta")
b2_kr_reduced <-
  b2_kr %>%
  select(hhid14_9, kr03)
 
# Import highest education level (dl06) and ethnic group (dl01f)
# from b3a_dl1.dta
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b3a_dl1 <- read_dta("b3a_dl1.dta")
b3a_dl1_reduced <-
  b3a_dl1 %>%
  select(hhid14_9, dl01f, dl06, pidlink)

# Import life satisfaction (sw00), current economic outlook (sw01) and
# outlook for the future (sw03) from b3a_sw.dta
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b3a_sw <- read_dta("b3a_sw.dta")
b3a_sw_reduced <-
  b3a_sw %>%
  select(hhid14_9, sw00, sw01, sw03, pidlink) # joined

# Import employment status (tk01a) from b3a_tk1.dta
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b3a_tk1 <- read_dta("b3a_tk1.dta")
b3a_tk1_reduced <-
  b3a_tk1 %>%
  select(hhid14_9, tk01a, pidlink) # joined


# Import religiosity (tr11) from b3a_tr.dta
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b3a_tr <- read_dta("b3a_tr.dta")
b3a_tr_reduced <-
  b3a_tr %>%
  select(hhid14_9, tr11, pidlink)

# Import physical health status (kk01) from b3b_kk1.dta
# Columns are selected by name rather than by position so a changed column
# layout cannot silently swap in a different variable.
b3b_kk1 <- read_dta("b3b_kk1.dta")
b3b_kk1_reduced <-
  b3b_kk1 %>%
  select(hhid14_9, kk01, pidlink)

Here we combine the separate datasets into one. This was done step by step, checking the integrity of the data at each step.

# Combining data ----
# We will mutate and rename variables later

# Full-join two individual-level tables on the person id `pidlink`.
#
# Both inputs carry the household id `hhid14_9`, so the join yields
# `hhid14_9.x` and `hhid14_9.y`; only the left-hand copy is kept and it is
# given back its original name.
# `relationship = "one-to-one"` makes dplyr stop with an error if a person id
# is duplicated in either table, instead of silently multiplying rows.
join_on_pidlink <- function(x, y) {
  full_join(x, y, by = "pidlink", relationship = "one-to-one") %>%
    select(-hhid14_9.y) %>%
    rename(hhid14_9 = hhid14_9.x)
}

# data_combined: age, marital status, sex, ethnicity, education
# (ethnicity added 20 Dec)
data_combined <- join_on_pidlink(b3a_cov_reduced, b3a_dl1_reduced)

# data_combined_1: adds sw00, sw01, sw03
data_combined_1 <- join_on_pidlink(data_combined, b3a_sw_reduced)

# data_combined_2: adds employment tk01a
data_combined_2 <- join_on_pidlink(data_combined_1, b3a_tk1_reduced)

# data_combined_3: adds religion tr11
data_combined_3 <- join_on_pidlink(data_combined_2, b3a_tr_reduced)

# data_combined_4: adds health kk01
data_combined_4 <- join_on_pidlink(data_combined_3, b3b_kk1_reduced)

# data_combined_5: adds home ownership kr03
# Note: joined by hhid14_9 (household level) instead of pidlink (individual
# level), so every member of a household receives that household's kr03.
# "many-to-one" guards against a household appearing more than once in the
# ownership table, which would duplicate individuals.
data_combined_5 <-
  full_join(
    data_combined_4,
    b2_kr_reduced,
    by = "hhid14_9",
    relationship = "many-to-one"
  )

# data_combined_5 is the final data set comprising:
# age, marital status, sex, ethnicity dl01f, education dl06, sw00, sw01, sw03,
# employment tk01a, religion tr11, health kk01, home ownership kr03
skim(data_combined_5)

(#tab:Combining Dataset)Data summary
Name	data_combined_5
Number of rows	36581
Number of columns	14
_______________________
Column type frequency:
character	3
numeric	11
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique
hhid14_9	0	1.00	9	9	0	15350
pidlink	190	0.99	8	9	0	36391
dl01f	2117	0.94	0	5	2804	85

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
age	196	0.99	38.50	18.92	14	26	35	48	998	▇▁▁▁▁
marstat	196	0.99	2.05	0.94	1	2	2	2	6	▇▁▁▁▁
sex	196	0.99	2.03	1.00	1	1	3	3	3	▇▁▁▁▇
dl06	3904	0.89	17.07	25.52	2	2	5	6	99	▇▁▁▂▁
sw00	4926	0.87	2.68	0.81	1	2	3	3	9	▆▇▁▁▁
sw01	4926	0.87	3.05	1.01	1	3	3	4	9	▃▇▁▁▁
sw03	4926	0.87	4.36	1.49	1	3	4	5	9	▂▇▅▂▁
tk01a	2148	0.94	1.69	0.95	1	1	1	3	8	▇▅▁▁▁
tr11	4987	0.86	2.10	0.72	1	2	2	2	7	▇▂▁▁▁
kk01	2310	0.94	2.05	0.68	1	2	2	2	4	▂▇▁▃▁
kr03	441	0.99	1.71	4.33	1	1	1	1	95	▇▁▁▁▁

In this next step we recoded the variables to suit our analysis.

# This next section focuses on recoding the data. Renaming each variable will be performed in the main script.

# Recoding the data ----
# List the column names of the merged data before recoding.
names(data_combined_5)

 [1] "hhid14_9" "age"      "marstat"  "sex"      "pidlink"  "dl01f"   
 [7] "dl06"     "sw00"     "sw01"     "sw03"     "tk01a"    "tr11"    
[13] "kk01"     "kr03"

# Please see below for a brief description of each variable.
# Any code that is not listed explicitly in a recode below becomes NA.
data_recoded <-
  data_combined_5 %>%
  # Work on plain numeric codes so the comparisons below do not depend on
  # how haven-labelled columns behave. Every column listed here is
  # overwritten in the next step.
  mutate(
    across(c(sw00, sw01, sw03, kr03, tk01a, tr11, kk01, dl06, age), as.numeric)
  ) %>%
  mutate(
    # sw00: reverse the 1-5 scale so that 1 = "not at all satisfied" and
    # 5 = "completely satisfied"; 9 and any other code become NA.
    sw00 = if_else(sw00 %in% 1:5, 6 - sw00, NA_real_),
    # sw01: 8 = don't know (172 cases), 9 = probably data entry error (2 cases);
    # both are set to NA to be removed later.
    sw01 = replace(sw01, sw01 %in% c(8, 9), NA_real_),
    # sw03: 8 = don't know (1882 cases), 9 = probably data entry error
    # (3 cases); both are set to NA to be removed later.
    sw03 = replace(sw03, sw03 %in% c(8, 9), NA_real_),
    # kr03: home ownership dummy.
    kr03 = as.factor(case_when(
      kr03 == 1 ~ "1", # own
      kr03 %in% c(2, 5, 95) ~ "0" # occupying, rented, others
    )),
    # tk01a: employment dummy.
    # NOTE(review): "prefer not to say" (8) is folded into the unemployed
    # group rather than treated as missing; confirm this is intended.
    tk01a = as.factor(case_when(
      tk01a == 1 ~ "1", # employed
      tk01a %in% c(3, 8) ~ "0" # unemployed, prefer not to say
    )),
    # tr11: Respondents are asked to rate how religious they see themselves
    tr11 = as.factor(case_when(
      tr11 == 7 ~ "0", # refused to say
      tr11 == 4 ~ "1", # not religious
      tr11 == 3 ~ "2", # somewhat religious
      tr11 == 2 ~ "3", # religious
      tr11 == 1 ~ "4" # very religious
    )),
    # kk01: reversed so that a higher value means better health.
    kk01 = as.factor(case_when(
      kk01 == 4 ~ "1", # unhealthy
      kk01 == 3 ~ "2", # somewhat unhealthy
      kk01 == 2 ~ "3", # somewhat healthy
      kk01 == 1 ~ "4" # healthy
    )),
    # dl06: education collapsed to six codes (0 = unclassifiable, 1-5 = levels).
    dl06 = as.factor(case_when(
      # dont know, others, missing, school for disabled
      dl06 %in% c(99, 98, 95, 17) ~ "0",
      # kindergarten, elementary, islamic elementary
      dl06 %in% c(90, 2, 72) ~ "1",
      # junior high, jh vocational, islamic jh
      dl06 %in% c(3, 4, 73) ~ "2",
      # senior high, sh vocational, islamic sh
      dl06 %in% c(5, 6, 74) ~ "3",
      # college, university (under, master, doctorate)
      dl06 %in% c(60, 61, 62, 63) ~ "4",
      # adult ed, open uni, islamic school
      dl06 %in% c(11, 12, 13, 14, 15) ~ "5"
    )),
    # age as numeric. The skim summary of data_combined_5 shows a maximum
    # age of 998, which cannot be a real age; it is presumably a
    # missing-value code (confirm against the codebook) and is set to NA so
    # it cannot distort means or regressions.
    age = na_if(age, 998),
    marstat = as.factor(marstat), # marital status as factor
    # sex = 1 means male, 3 = female, so male is the baseline.
    female = as.factor(case_when(
      sex == 1 ~ "0",
      sex == 3 ~ "1"
    )),
    # Outlook score: future outlook minus current outlook, computed from the
    # cleaned sw03 and sw01 above (NA if either is missing).
    fo01 = sw03 - sw01
  )
glimpse(data_recoded)

Rows: 36,581
Columns: 16
$ hhid14_9 <chr> "001060000", "001060004", "001060000", "001060000",…
$ age      <dbl> 59, 28, 39, 16, 30, 36, 26, 40, 55, 54, 34, 28, 24,…
$ marstat  <fct> 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 5, …
$ sex      <dbl+lbl> 1, 3, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 3, 3,…
$ pidlink  <chr> "001060001", "001060004", "001060007", "001060008",…
$ dl01f    <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "…
$ dl06     <fct> 1, 1, 1, 2, 2, 1, 1, NA, 1, 1, 4, 4, 4, 4, 1, 2, 1,…
$ sw00     <dbl> 1, 3, 2, 3, 2, 3, 3, 3, 2, 3, 1, 2, 2, 2, 2, 4, 3, …
$ sw01     <dbl> 3, 2, 3, 3, 2, 2, 2, 2, 4, 3, 2, 2, 1, 3, 2, 3, 3, …
$ sw03     <dbl> 4, 2, 3, 3, 2, 2, 3, 2, NA, 2, 2, 3, 5, 5, 3, 5, NA…
$ tk01a    <fct> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, …
$ tr11     <fct> 3, 2, 4, 3, 3, 2, 4, 3, 3, 3, 4, 3, 4, 3, 4, 3, 3, …
$ kk01     <fct> 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 2, …
$ kr03     <fct> 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, …
$ female   <fct> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, …
$ fo01     <dbl> 1, 0, 0, 0, 0, 0, 1, 0, NA, -1, 0, 1, 4, 2, 1, 2, N…

# DATA WRANGLING COMPLETE 
# saved as csv file. 

#write_csv(data_recoded, "capstone_data.csv")

Again, the above processes need only be done once. Thereafter, the dataset should be imported for analysis using a simple read_csv("capstone_data.csv") command. I’m sure you’d like to return to the main document now and read all about our analysis. You may do so by clicking on this link:

Capstone Project: Does home ownership lead to greater optimism for one’s future

Importing the Data