
#' Compare categorical level distribution between subsets
#'
#' Given two data subsets, defined by `indexA` and `indexB`, and a
#' categorical variable `catVar`, compute the probability that each
#' level of `catVar` appears in each subset and the Agresti-Caffo
#' confidence interval for the difference in these probabilities,
#' based on the [PropCIs::wald2ci] function.
#'
#'
#' @param DF A data frame containing `catVar`
#' @param catVar Categorical variable whose distribution is compared between two subsets
#' @param indexA Defines records in the first subset
#' @param indexB Defines records in the second subset; default NULL uses all records not in the first subset
#' @param cLevel Confidence level for estimated probability differences
#' @param includeNA Missing data handling option: `ifany` (the default), `no` or `always`
#'
#' @return Data frame with one row for each `catVar` level and these 10 columns:
#'   * `Level` the `catVar` level
#'   * `xA` the number of times `Level` appears in the first subset
#'   * `nA` the total records in the first subset
#'   * `xB` the number of times `Level` appears in the second subset
#'   * `nB` the total records in the second subset
#'   * `pA` the estimated probability that `Level` appears in the first subset
#'   * `pB` the estimated probability that `Level` appears in the second subset
#'   * `loCI` the lower confidence limit on the difference `pA - pB`
#'   * `upCI` the upper confidence limit on the difference `pA - pB`
#'   * `signif` a logical indicator of whether `pA - pB` is significantly different from zero
#' @export
#'
#' @examples
#' catVar <- c(rep("a", 100), rep("b", 100), rep("c", 100))
#' auxVar <- c(rep("Set1", 30), rep("Set2", 70),
#'            rep("Set1", 50), rep("Set2", 50),
#'            rep("Set1", 90), rep("Set2", 10))
#' DF <- data.frame(catVar = catVar, auxVar = auxVar)
#' indexA <- which(DF$auxVar == "Set1")
#' CompareCategoricalLevels(DF, "catVar", indexA)
CompareCategoricalLevels <- function(DF, catVar, indexA, indexB = NULL, cLevel = 0.95, includeNA = "ifany"){
  #
  stopifnot("DF must be a data frame"= is.data.frame(DF))
  stopifnot("catVar not found in DF"= catVar %in% colnames(DF))
  #
  x <- as.character(DF[indexA, catVar])
  stopifnot("indexA subset is empty"= length(x) > 0)
  if (is.null(indexB)){
    y <- as.character(DF[-indexA, catVar])
  } else {
    y <- as.character(DF[indexB, catVar])
  }
  stopifnot("indexB subset is empty"= length(y) > 0)
  stopifnot("cLevel not between 0 and 1"= cLevel > 0 & cLevel < 1)
  stopifnot("includeNA must be 'ifany', 'no', or 'always'"= includeNA %in% c('ifany', 'no', 'always'))
  #
  tblA <- table(x, useNA = includeNA)
  tblB <- table(y, useNA = includeNA)
  #
  Alevels <- ifelse(is.na(names(tblA)), "<NA>", names(tblA))
  names(tblA) <- Alevels
  Blevels <- ifelse(is.na(names(tblB)), "<NA>", names(tblB))
  names(tblB) <- Blevels
  allLevels <- sort(union(Alevels, Blevels))
  nLvl <- length(allLevels)
  #
  xA <- vector("numeric", nLvl)
  n1 <- sum(tblA)
  nA <- rep(n1, nLvl)
  xB <- vector("numeric", nLvl)
  n2 <- sum(tblB)
  nB <- rep(n2, nLvl)
  pA <- vector("numeric", nLvl)
  pB <- vector("numeric", nLvl)
  loCI <- vector("numeric", nLvl)
  upCI <- vector("numeric", nLvl)
  signif <- rep(FALSE, nLvl)
  #
  for (i in 1:nLvl){
    lvl <- allLevels[i]
    x1 <- ifelse(lvl %in% Alevels, as.numeric(tblA[lvl]), 0)
    xA[i] <- x1
    x2 <- ifelse(lvl %in% Blevels, as.numeric(tblB[lvl]), 0)
    xB[i] <- x2
    p1 <- (x1 + 1)/(n1 + 2)
    pA[i] <- p1
    p2 <- (x2 + 1)/(n2 + 2)
    pB[i] <- p2
    waldObj <- PropCIs::wald2ci(x1, n1, x2, n2, cLevel, adjust = "AC")
    ll <- waldObj$conf.int[1]
    uu <- waldObj$conf.int[2]
    loCI[i] <- ll
    upCI[i] <- uu
    if ((ll > 0) | (uu < 0)){
      signif[i] <- TRUE
    }
  }
  #
  outFrame <- data.frame(Level = allLevels, xA = xA, nA = nA, xB = xB, nB = nB,
                         pA = pA, pB = pB, loCI = loCI, upCI = upCI, signif = signif)
  #
  class(outFrame) <- c("CatDiffs", "data.frame")
  #
  return(outFrame)
}

