Family budget: CES (Consumer Expenditure Survey) data
- 22 minsOverview:
Analyze CES data in R. to see what American households’ consumption pattern look like at different income levels. The unit of the data is CU (Consumer Unit).
Data Cleaning
Download data
lapply(list("lodown","tidyverse","survey","mitools"), library, character.only = TRUE)
# Download data (2017)
if ( !('2017' %in% list.files( file.path(path.expand( "~" ) , "CES") )) ){
# catalog
ces_cat <-
get_catalog( "ces" ,
output_dir = file.path( path.expand( "~" ) , "CES" ) ) # 2017 most recent
# 2017 only
ces_cat <- subset( ces_cat , year == 2017 )
# download the microdata to your local computer
ces_cat <- lodown( "ces" , ces_cat )
}
Merge data of five quarters into one
fmli171x <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli171x.rds" ) )
fmli172 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli172.rds" ) )
fmli173 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli173.rds" ) )
fmli174 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli174.rds" ) )
fmli181 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli181.rds" ) )
fmli171x$qtr <- 1; fmli172$qtr <- 2; fmli173$qtr <- 3; fmli174$qtr <- 4; fmli181$qtr <- 5
fmli181 <- fmli181[ , intersect(names( fmli171x ), names(fmli181))]
fmli174 <- fmli174[ , intersect(names( fmli171x ), names(fmli174))]
fmli173 <- fmli173[ , intersect(names( fmli171x ), names(fmli173))]
fmli172 <- fmli172[ , intersect(names( fmli171x ), names(fmli172))]
fmli171x <- fmli171x[ , intersect(names( fmli171x ), names(fmli181))]
fmly <- rbind( fmli171x , fmli172 , fmli173 , fmli174 , fmli181 )
rm( fmli171x , fmli172 , fmli173 , fmli174 , fmli181 )
wtrep <- c( paste0( "wtrep" , stringr::str_pad( 1:44 , 2 , pad = "0" ) ) , "finlwt21" )
for ( i in wtrep ) fmly[ is.na( fmly[ , i ] ) , i ] <- 0 # NA -> 0
rm(wtrep,i)
Create variables for annual expenditures
# create new variables in fmly
new.var <- c("totalexp", "totalhouse", "totalfood", "totalhealth", "totalcloth", "totaltranspt", "totalrec", "totaledu")
# new variables contains the sum of the total expenditure from the current and previous quarters
fmly$totalexp <- rowSums( fmly[ , c( "totexppq" , "totexpcq" ) ] , na.rm = TRUE )
fmly$totalhouse <- rowSums( fmly[ , c( "ehousngp" , "ehousngc" ) ] , na.rm = TRUE ) # include property tax
fmly$totalfood <- rowSums( fmly[ , c( "foodcq" , "foodpq" ) ] , na.rm = TRUE )
fmly$totalhealth <- rowSums( fmly[ , c( "healthcq" , "healthpq" ) ] , na.rm = TRUE )
fmly$totalcloth <- rowSums( fmly[ , c( "chldrncq" , "chldrnpq" , "grlfifcq" , "grlfifpq",
"menboycq" , "menboypq" , "womsixcq" , "womsixpq") ] , na.rm = TRUE )
fmly$totaltranspt <- rowSums( fmly[ , c( "etranptc" , "etranptp" ) ] , na.rm = TRUE )
fmly$totalrec <- rowSums( fmly[ , c( "tfeesadc" , "tfeesadp" ) ] , na.rm = TRUE )
fmly$totaledu <- rowSums( fmly[ , c( "educacq" , "educapq" ) ] , na.rm = TRUE )
# immediately convert missing values (NA) to zeroes
for (i in new.var){ fmly[ is.na( fmly[,i] ) , i ] <- 0 }
# annualize the expenditures by multiplying them by 4
for (i in new.var){ fmly[,i] <- fmly[,i] * 4 }
fmly$totaltax <- fmly$totxest
fmly$totalexp <- fmly$totalexp + fmly$totaltax
Now I create annual expenditure variables for these break-downs:
- housing:
totalhouse
=ehousngp
+ehousngc
- food:
totalfood
=foodcq
+foodpq
- health care:
totalhealth
=healthcq
+healthpq
- clothes:
totalcloth
=chldrncq
+chldrnpq
+grlfifcq
+grlfifpq
+menboycq
+menboypq
+womsixcq
+womsixpq
(child + girl + boy + man + woman) - transportation:
totaltranspt
=etranptc
+etranptp
- recreation:
totalrec
=tfeesadc
+tfeesadp
- education:
totaledu
=educacq
+educapq
- taxes:
totxest
, combining Federal and State tax liabilities that are estimated using a tax simulator (TAXSIM), created by the National Bureau of Economic Research (NBER) - total expenditure:
totalexp
=totexppq
+totexpcq
And income:
- income before tax:
fincbtxm
Create survey object
# add a column of ones
fmly$one <- 1
# create a vector containing all of the multiply-imputed variables
mi_vars <- gsub( "5$" , "" , grep( "[a-z]5$" , names( fmly ) , value = TRUE ) )
# loop through each of the five variables..
for ( i in 1:5 ){
x <- fmly # temporary table x
# loop through each of the multiply-imputed variables..
for ( j in mi_vars ){
# copy the contents of the current column (for example 'welfare1')
# over to a new column ending in 'mi' (for example 'welfaremi')
x[ , paste0( j , 'mi' ) ] <- x[ , paste0( j , i ) ]
# delete the all five of the imputed variable columns
x <- x[ , !( names( x ) %in% paste0( j , 1:5 ) ) ]
}
# save the current table in the sqlite database as 'imp1' 'imp2' etc.
assign( paste0( 'imp' , i ) , x )
rm( x )
}
# containing the five multiply-imputed data tables - imp1 through imp5
ces_design <-
svrepdesign(
weights = ~finlwt21 ,
repweights = "wtrep[0-9]+" ,
data = imputationList( list( imp1 , imp2 , imp3 , imp4 , imp5 ) ) ,
type = "BRR" ,
combined.weights = TRUE ,
mse = TRUE
)
rm( imp1 , imp2 , imp3 , imp4 , imp5, i, j, mi_vars )
Statistics: Quantile
To group consumer units into different income levels, we need to first find the quantiles of income. And then we can cut income into segments as income levels. The quantiles of interest are 0.2, 0.4, 0.6, 0.8, 0.90, 0.95, 0.99, 0.999 quantiles.
Income after tax quantiles
q <- MIcombine( with( ces_design , svyby( ~ finatxem , ~ as.factor(1) , svyquantile , # minimum is negative
quantile = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 ) , se = TRUE ) ) )
names(q$coefficients) <- c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 )
q <- q %>% print() %>% as.data.frame()
q <- rownames_to_column(q, "quantile")
names(q)[2:3] <- c("income", "standard error")
Income before tax quantiles
bq <- MIcombine( with( ces_design , svyby( ~ fincbtxm , ~ as.factor(1) , svyquantile ,
quantile = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 ) , se = TRUE ) ) )
names(bq$coefficients) <- c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 )
bq <- bq %>% print() %>% as.data.frame()
bq <- rownames_to_column(bq, "quantile")
names(bq)[2:3] <- c("income", "standard error")
Then we can create a grouping variable:
ces_design <-
update(
ces_design , income_before_tax = cut(fincbtxm, breaks = c(-Inf,bq$income,Inf),
labels = c("0-20th percentile", "20-40th percentile", "40-60th percentile",
"60-80th percentile", "80-90th percentile", "90-95th percentile",
"95-99th percentile", "99-99.9th percentile", "99.9-100th percentile") ))
Tables
Amount of money
variables <- c("totalhouse", "totalfood", "totalhealth", "totalcloth", "totaltranspt", "totalrec", "totaledu",
"totxest", "totalexp", "fincbtxm") # tax: totaltax / totxest
names <- c("housing", "food", "health care", "clothing", "transportation", "recreation", "education",
"income tax", "total expenditure (include tax)", "income before tax")
am <- NULL
for (i in variables){
MIcombine( with( ces_design , svyby( ~ get(i) , ~ income_before_tax , svymean ) ) ) %>%
print() %>% as.data.frame() %>% t() -> x
rownames(x) <- c(names[which(variables == i)],'standard error')
x %>% as.data.frame %>% rownames_to_column("Income group") %>% rbind(am,.) -> am
}
Percentage over income
pc <- am %>% slice( seq(1,nrow(am)-1,2) ) %>% column_to_rownames("Income group")
for (j in 1:nrow(pc)){
pc[j,] <- pc[j,]/pc[nrow(pc),]
}
pc %>% rownames_to_column("Income group") -> pc
Write Tables
writexl::write_xlsx(list("before_tax_income" = bq, "after_tax_income" = q, "mean expenditure" = am, "percentage" = pc),
file.path( wd, "family budget.xlsx" ))
Plots
Pie charts
## Pies
pie <- am %>% slice( seq(1,nrow(am)-1,2) ) %>% slice(-nrow(.)) %>% column_to_rownames("Income group")
rownames(pie)[(nrow(pie)-1)] <- 'taxes'
pie %>% rbind('other' = pie[nrow(pie),] - colSums(pie[-nrow(pie),])) %>% rownames_to_column("Income group") %>%
slice(1:(nrow(.)-2),nrow(.),(nrow(.)-1)) %>% column_to_rownames("Income group") -> pie
for (j in 1:nrow(pie)){
pie[j,] <- pie[j,]/pie[nrow(pie),]
}
pie %>% rownames_to_column("Income group") -> pie
pie %>% gather(key = 'income group', value = 'percentage', '0-20th percentile':'99.9-100th percentile') -> pie
names(pie)[1] <- "category"
pie %>% filter(category != 'total expenditure (include tax)') %>%
filter(`income group` != '0-20th percentile' & `income group` != '20-40th percentile') %>%
ggplot(aes(x = factor(1), y = percentage, fill = category)) +
geom_bar(width = 1, stat = 'identity') +
coord_polar(theta = "y") + theme_void() +
facet_wrap(~`income group`) + #drop axes and labels
geom_text(aes(label = scales::percent(percentage)), color="black", size=3.8,
position = position_stack(vjust = 0.5))
Not good.
Tableau
Reference
PUMD (CE public-use microdata) Documentation
CONSUMER EXPENDITURES–2017
asdfree.com
Embed Tableau Public on your Website