library(Hmisc) library(assertthat) library(dplyr) # Define functions -------------------------------------------------------- wtd_quantile <- Hmisc::wtd.quantile is.all.na.or.zero <- function(x){ return(length(x[!is.na(x) & x != 0]) == 0) } #' Apply top and bottom coding with log IQR to a level-defined file #' #' Lower-level functions used within 'implement_top_code_with_iqr()' and #' 'implement_bottom_code_with_iqr()' of the 'lissyrtools' package. #' #' @param file A tibble or data.frame with a LIS or LWS file. #' @param file_name A string with the name of the LIS or LWS file. #' @param variable A string with the variable to which top coding should be applied. #' @param times A numeric indicating the scale factor for IQR. Defaults to 3. #' @param variable_level A string with the level of the variable. Should be either 'household' or 'person'. #' If NULL (default), the function will try to guess the level of the variable. #' This is done by comparing the value in 'variable' with pre-set lists of variables. #' @param weight A string with the name of the variable in 'file' that should be #' used as sample weights. #' #' @return A tibble containing the file with the recoded variable. #' #' @keywords internal implement_top_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr), true = exp(log_third_quartile + log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' @rdname implement_top_code_with_iqr_hfile implement_bottom_code_with_iqr_hfile <- function(file, file_name, variable, times = 3, weight = NULL){ assertthat::assert_that(variable %in% names(file), msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) var_ <- file[[variable]] # throw warning if variable_level not "household" if(!is.all.na.or.zero(var_)){ if(is.null(weight)){ weight_var <- "hwgt" }else{ weight_var <- weight } assertthat::assert_that(all(var_ >= 0 | is.na(var_)), msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) assertthat::assert_that(weight_var %in% names(file), msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) log_var <- dplyr::if_else(var_ > 0, true = log(var_), false = 0, missing = NA_real_) index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files log_var_for_iqr_computation <- log_var[index_valid_weights] weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] if(!is.all.na.or.zero(log_var_for_iqr_computation)){ log_third_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.75) log_first_quartile <- wtd_quantile(log_var_for_iqr_computation, weights = weights_for_iqr_computation, probs = 0.25) log_times_iqr <- (log_third_quartile - log_first_quartile) * times var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr), true = exp(log_first_quartile - log_times_iqr), false = var_) } file[[variable]] <- var_ } return(file) } #' Read and process a single file for the poverty IKF estimates #' #' @param dataset_name A character vector of length one with the name of the dataset #' to read and process in 'ccyy' format. E.g. 'lu10'. #' #' @return A tibble with 6 columns: 'hid', 'dhi', 'did', 'nhhmem', 'person_weight', 'children_weight'. read_and_process_poverty <- function(dataset_name) { hfile_name <- paste(dataset_name,'h',sep='') h_file <- read.LIS(hfile_name, vars = c("did","hid","nhhmem","nhhmem17", "nhhmem65","dhi","hwgt"), subset = "!is.na(dhi) & !is.na(hwgt)", labels=FALSE) h_file[["dhi"]] <- dplyr::if_else((h_file[["dhi"]]< 0) & !is.na(h_file[["dhi"]]), true = 0, false = as.numeric(h_file[["dhi"]])) h_file$dhi <- h_file$dhi / sqrt(h_file$nhhmem) h_file <- implement_top_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- implement_bottom_code_with_iqr_hfile(file = h_file, file_name = hfile_name, variable = "dhi", times = 3, weight = "hwgt") h_file <- h_file %>% dplyr::mutate(person_weight = hwgt * nhhmem, children_weight = hwgt * nhhmem17, elderly_weight = hwgt * nhhmem65) %>% dplyr::select(-hwgt) return(h_file) } #' Compute poverty estimates for a single file #' #' @description #' Computes a total of 9 estimates for a single file. These include the poverty #' rate at 40, 50 and 60% for children, elderly and overall population. #' E.g. poverty rate at 40% of median for eldery. #' #' To be used within 'compute_ikf_estimates()'. #' #' @param file A tibble with the merged household and person-level files. #' #' @return A list with 9 elements. #' @keywords internal compute_ikf_poverty_estimates <- function(file){ output <- list(poverty_rate_40_overall = NA_real_, poverty_rate_50_overall = NA_real_, poverty_rate_60_overall = NA_real_, poverty_rate_40_children = NA_real_, poverty_rate_50_children = NA_real_, poverty_rate_60_children = NA_real_, poverty_rate_40_elderly = NA_real_, poverty_rate_50_elderly = NA_real_, poverty_rate_60_elderly = NA_real_) if(is.all.na.or.zero(file[["dhi"]])){ return(output) } median_dhi <- unname(wtd_quantile(x = file[["dhi"]], weights = file[["person_weight"]], probs = 0.5)) poverty_40 <- 0.4 * median_dhi poverty_50 <- 0.5 * median_dhi poverty_60 <- 0.6 * median_dhi # compute poverty_rates file <- file %>% dplyr::mutate(poor_40 = dplyr::if_else(dhi < poverty_40, true = 1, false = 0), poor_50 = dplyr::if_else(dhi < poverty_50, true = 1, false = 0), poor_60 = dplyr::if_else(dhi < poverty_60, true = 1, false = 0)) # poverty rate overall population output[["poverty_rate_40_overall"]] <- sum(file[["poor_40"]] * file[["person_weight"]], na.rm = TRUE)/sum(file[["person_weight"]], na.rm = TRUE) output[["poverty_rate_50_overall"]] <- sum(file[["poor_50"]] * file[["person_weight"]], na.rm = TRUE)/sum(file[["person_weight"]], na.rm = TRUE) output[["poverty_rate_60_overall"]] <- sum(file[["poor_60"]] * file[["person_weight"]], na.rm = TRUE)/sum(file[["person_weight"]], na.rm = TRUE) # poverty_rate children if(!is.all.na.or.zero(file[["children_weight"]])){ output[["poverty_rate_40_children"]] <- sum(file[["poor_40"]] * file[["children_weight"]], na.rm = TRUE)/sum(file[["children_weight"]], na.rm = TRUE) output[["poverty_rate_50_children"]] <- sum(file[["poor_50"]] * file[["children_weight"]], na.rm = TRUE)/sum(file[["children_weight"]], na.rm = TRUE) output[["poverty_rate_60_children"]] <- sum(file[["poor_60"]] * file[["children_weight"]], na.rm = TRUE)/sum(file[["children_weight"]], na.rm = TRUE) } # poverty_rate elderly if(!is.all.na.or.zero(file[["elderly_weight"]])){ output[["poverty_rate_40_elderly"]] <- sum(file[["poor_40"]] * file[["elderly_weight"]], na.rm = TRUE)/sum(file[["elderly_weight"]], na.rm = TRUE) output[["poverty_rate_50_elderly"]] <- sum(file[["poor_50"]] * file[["elderly_weight"]], na.rm = TRUE)/sum(file[["elderly_weight"]], na.rm = TRUE) output[["poverty_rate_60_elderly"]] <- sum(file[["poor_60"]] * file[["elderly_weight"]], na.rm = TRUE)/sum(file[["elderly_weight"]], na.rm = TRUE) } return(output) } # Implement functions ----------------------------------------------------- h_file <- read_and_process_poverty("lu10") # <- Insert dataset in 'ccyy' format (e.g. 'lu10') compute_ikf_poverty_estimates(h_file)