
#' Mark known n-grams in a text with curly braces
#'
#' Strips spaces and bracket characters from `x`, then wraps every occurrence
#' of each multi-sign combination listed in `ngram` as `" {combination} "`.
#' If no element of `x` contains a character in the range U+12000 to U+1254F,
#' `x` is first passed through `as.cuneiform()`.
#'
#' Combinations are applied from longest to shortest. A shorter combination
#' is still matched inside text that was already wrapped for a longer one, so
#' the braces in the result can be nested.
#'
#' @param x A non-empty character vector.
#' @param ngram A data frame with columns `combination` and `length`
#'   (per the error message, as produced by `ngram_frequencies`). Rows whose
#'   `length` is not greater than 1 are ignored.
#' @return `x` after the optional conversion, the stripping of spaces and
#'   brackets, and the marking of all matching combinations.
#' @export
mark_ngrams <- function(x, ngram) {
  # --- Validate arguments ---
  x_is_valid <- is.character(x) && length(x) > 0L
  if (!x_is_valid) {
    stop("x must be a non-empty character vector.")
  }
  required_cols <- c("combination", "length")
  ngram_is_valid <- is.data.frame(ngram) && all(required_cols %in% names(ngram))
  if (!ngram_is_valid) {
    stop("ngram must be a data frame with columns 'combination' and 'length' (from ngram_frequencies).")
  }

  # --- Convert via as.cuneiform() unless some element already has a
  # --- character in the cuneiform range ---
  if (!any(grepl("[\U00012000-\U0001254F]", x, perl = TRUE))) {
    x <- as.cuneiform(x)
  }

  # --- Drop spaces and the bracket characters ( ) [ ] { } ---
  stripped <- str_replace_all(x, "[()\\[\\]{} ]", "")

  # --- Keep multi-sign combinations only, longest first ---
  multi <- ngram[ngram$length > 1, ]
  combos <- multi$combination[order(-multi$length)]

  # --- Fold over the combinations, wrapping each literal match in braces ---
  Reduce(
    function(text, k) {
      str_replace_all(text, fixed(combos[k]), paste0(" {", combos[k], "} "))
    },
    seq_along(combos),
    stripped
  )
}
