Family budget: CES (Consumer Expenditure Survey) data

- 22 mins

Overview:

Analyze CES data in R. to see what American households’ consumption pattern look like at different income levels. The unit of the data is CU (Consumer Unit).


Data Cleaning

Download data

lapply(list("lodown","tidyverse","survey","mitools"), library, character.only = TRUE)
# Download data (2017)
if ( !('2017' %in% list.files( file.path(path.expand( "~" ) , "CES") )) ){
  # catalog
  ces_cat <-
    get_catalog( "ces" ,
                 output_dir = file.path( path.expand( "~" ) , "CES" ) ) # 2017 most recent
  # 2017 only
  ces_cat <- subset( ces_cat , year == 2017 )
  # download the microdata to your local computer
  ces_cat <- lodown( "ces" , ces_cat )
}

Merge data of five quarters into one

fmli171x <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli171x.rds" ) )
fmli172 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli172.rds" ) )
fmli173 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli173.rds" ) )
fmli174 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli174.rds" ) )
fmli181 <- readRDS( file.path( path.expand( "~" ) , "CES" , "2017/fmli181.rds" ) )
fmli171x$qtr <- 1; fmli172$qtr <- 2; fmli173$qtr <- 3; fmli174$qtr <- 4; fmli181$qtr <- 5
fmli181 <- fmli181[ , intersect(names( fmli171x ), names(fmli181))]
fmli174 <- fmli174[ , intersect(names( fmli171x ), names(fmli174))]
fmli173 <- fmli173[ , intersect(names( fmli171x ), names(fmli173))]
fmli172 <- fmli172[ , intersect(names( fmli171x ), names(fmli172))]
fmli171x <- fmli171x[ , intersect(names( fmli171x ), names(fmli181))]
fmly <- rbind( fmli171x , fmli172 , fmli173 , fmli174 , fmli181 )
rm( fmli171x , fmli172 , fmli173 , fmli174 , fmli181 )

wtrep <- c( paste0( "wtrep" , stringr::str_pad( 1:44 , 2 , pad = "0" ) ) , "finlwt21" )
for ( i in wtrep ) fmly[ is.na( fmly[ , i ] ) , i ] <- 0 # NA -> 0
rm(wtrep,i)

Create variables for annual expenditures

# create new variables in fmly 
new.var <- c("totalexp", "totalhouse", "totalfood", "totalhealth", "totalcloth", "totaltranspt", "totalrec", "totaledu")
# new variables contains the sum of the total expenditure from the current and previous quarters
fmly$totalexp <- rowSums( fmly[ , c( "totexppq" , "totexpcq" ) ] , na.rm = TRUE )
fmly$totalhouse <- rowSums( fmly[ , c( "ehousngp" , "ehousngc" ) ] , na.rm = TRUE ) # include property tax
fmly$totalfood <- rowSums( fmly[ , c( "foodcq" , "foodpq" ) ] , na.rm = TRUE )
fmly$totalhealth <- rowSums( fmly[ , c( "healthcq" , "healthpq" ) ] , na.rm = TRUE )
fmly$totalcloth <- rowSums( fmly[ , c( "chldrncq" , "chldrnpq" , "grlfifcq" , "grlfifpq",
                                       "menboycq" , "menboypq" , "womsixcq" , "womsixpq") ] , na.rm = TRUE )
fmly$totaltranspt <- rowSums( fmly[ , c( "etranptc" , "etranptp" ) ] , na.rm = TRUE )
fmly$totalrec <- rowSums( fmly[ , c( "tfeesadc" , "tfeesadp" ) ] , na.rm = TRUE )
fmly$totaledu <- rowSums( fmly[ , c( "educacq" , "educapq" ) ] , na.rm = TRUE )

# immediately convert missing values (NA) to zeroes
for (i in new.var){ fmly[ is.na( fmly[,i] ) , i ] <- 0 }
# annualize the expenditures by multiplying them by 4
for (i in new.var){ fmly[,i] <- fmly[,i] * 4 }

fmly$totaltax <- fmly$totxest
fmly$totalexp <- fmly$totalexp + fmly$totaltax

Now I create annual expenditure variables for these break-downs:

And income:

Create survey object

# add a column of ones
fmly$one <- 1
# create a vector containing all of the multiply-imputed variables
mi_vars <- gsub( "5$" , "" , grep( "[a-z]5$" , names( fmly ) , value = TRUE ) )

# loop through each of the five variables..
for ( i in 1:5 ){
  x <- fmly # temporary table x
  # loop through each of the multiply-imputed variables..
  for ( j in mi_vars ){
    # copy the contents of the current column (for example 'welfare1')
    # over to a new column ending in 'mi' (for example 'welfaremi')
    x[ , paste0( j , 'mi' ) ] <- x[ , paste0( j , i ) ]
    # delete the all five of the imputed variable columns
    x <- x[ , !( names( x ) %in% paste0( j , 1:5 ) ) ]
  }
  # save the current table in the sqlite database as 'imp1' 'imp2' etc.
  assign( paste0( 'imp' , i ) , x )
  rm( x )
}

# containing the five multiply-imputed data tables - imp1 through imp5
ces_design <-
  svrepdesign(
    weights = ~finlwt21 ,
    repweights = "wtrep[0-9]+" ,
    data = imputationList( list( imp1 , imp2 , imp3 , imp4 , imp5 ) ) ,
    type = "BRR" ,
    combined.weights = TRUE ,
    mse = TRUE
  )
rm( imp1 , imp2 , imp3 , imp4 , imp5, i, j, mi_vars )

Statistics: Quantile

To group consumer units into different income levels, we need to first find the quantiles of income. And then we can cut income into segments as income levels. The quantiles of interest are 0.2, 0.4, 0.6, 0.8, 0.90, 0.95, 0.99, 0.999 quantiles.

Income after tax quantiles

q <- MIcombine( with( ces_design , svyby( ~ finatxem , ~ as.factor(1) , svyquantile , # minimum is negative
                                     quantile = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 ) , se = TRUE ) ) )
names(q$coefficients) <- c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 )
q <- q %>% print() %>% as.data.frame()
q <- rownames_to_column(q, "quantile")
names(q)[2:3] <- c("income", "standard error")

Income before tax quantiles

bq <- MIcombine( with( ces_design , svyby( ~ fincbtxm , ~ as.factor(1) , svyquantile ,
                                          quantile = c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 ) , se = TRUE ) ) )
names(bq$coefficients) <- c(0.2, 0.4, 0.6, 0.8, 0.9, 0.95, 0.99, 0.999 )
bq <- bq %>% print() %>% as.data.frame()
bq <- rownames_to_column(bq, "quantile")
names(bq)[2:3] <- c("income", "standard error")

Then we can create a grouping variable:

ces_design <-
  update(
    ces_design , income_before_tax = cut(fincbtxm, breaks = c(-Inf,bq$income,Inf),
                                         labels = c("0-20th percentile", "20-40th percentile", "40-60th percentile",
                                                    "60-80th percentile", "80-90th percentile", "90-95th percentile",
                                                    "95-99th percentile", "99-99.9th percentile", "99.9-100th percentile") ))

Tables

Amount of money

variables <- c("totalhouse", "totalfood", "totalhealth", "totalcloth", "totaltranspt", "totalrec", "totaledu",
               "totxest", "totalexp", "fincbtxm") # tax: totaltax / totxest
names <- c("housing", "food", "health care", "clothing", "transportation", "recreation", "education",
           "income tax", "total expenditure (include tax)", "income before tax")
am <- NULL
for (i in variables){
  MIcombine( with( ces_design , svyby( ~ get(i) , ~ income_before_tax , svymean ) ) ) %>%
    print() %>% as.data.frame() %>% t() -> x
  rownames(x) <- c(names[which(variables == i)],'standard error')
  x %>% as.data.frame %>% rownames_to_column("Income group") %>% rbind(am,.) -> am
}

Percentage over income

pc <- am %>% slice( seq(1,nrow(am)-1,2) ) %>% column_to_rownames("Income group")
for (j in 1:nrow(pc)){
  pc[j,] <- pc[j,]/pc[nrow(pc),]
}
pc %>% rownames_to_column("Income group") -> pc

Write Tables

writexl::write_xlsx(list("before_tax_income" = bq, "after_tax_income" = q, "mean expenditure" = am, "percentage" = pc),
                    file.path( wd, "family budget.xlsx" ))

Plots

Pie charts

## Pies
pie <- am %>% slice( seq(1,nrow(am)-1,2) ) %>% slice(-nrow(.)) %>% column_to_rownames("Income group")
rownames(pie)[(nrow(pie)-1)] <- 'taxes'
pie %>% rbind('other' = pie[nrow(pie),] - colSums(pie[-nrow(pie),])) %>% rownames_to_column("Income group") %>%
  slice(1:(nrow(.)-2),nrow(.),(nrow(.)-1)) %>% column_to_rownames("Income group") -> pie
for (j in 1:nrow(pie)){
  pie[j,] <- pie[j,]/pie[nrow(pie),]
}
pie %>% rownames_to_column("Income group") -> pie
pie %>% gather(key = 'income group', value = 'percentage', '0-20th percentile':'99.9-100th percentile') -> pie
names(pie)[1] <- "category"
pie %>% filter(category != 'total expenditure (include tax)') %>% 
  filter(`income group` != '0-20th percentile' & `income group` != '20-40th percentile') %>%
  ggplot(aes(x = factor(1), y = percentage, fill = category)) +
  geom_bar(width = 1, stat = 'identity') +
  coord_polar(theta = "y") + theme_void() +
  facet_wrap(~`income group`) + #drop axes and labels
  geom_text(aes(label = scales::percent(percentage)), color="black", size=3.8,
            position = position_stack(vjust = 0.5)) 

Not good.

Tableau


Reference

PUMD (CE public-use microdata) Documentation
CONSUMER EXPENDITURES–2017
asdfree.com
Embed Tableau Public on your Website

Zhijian Liu

Zhijian Liu

A foodaholic

comments powered by Disqus
rss facebook twitter github gitlab youtube mail spotify lastfm instagram linkedin google google-plus pinterest medium vimeo stackoverflow reddit quora quora