Tackling Formatted Tabular Data from Excel

By Jeremy Selva

February 15, 2024

Overview

Reading data in formatted cell in excel can be really tricky. In this post, I will share six problematic formatted columns to read and share how I try to handle them.

Data Set

The data set can be found in this link:

https://raw.githubusercontent.com/JauntyJJS/jaunty-blogdown/main/content/blog/2024-02-15-Tackling-Formatted-Cell-Data/sample_excel.xlsx

Here is a peak view of the excel file with some problematic columns.

The column date_issue has two number format. One in General and the other in Date.

The column colour_weight has two colour format. The one in black is weight in pounds and the one in green is weight in kilogram.

The columns text_integer_issue and text_numeric_issue are numeric columns but some cells were formatted as text. These cells are indicated by the green triangle.

The column numeric_integer_issue is an integer column.

The column one_or_zero_issue is a column containing one and zero but it is missing for the first hundreds of rows.

R Packages Used

Here are the R packages used in this analysis.

Code
library("sessioninfo")
library("quarto")
library("fs")
library("knitr")

library("here")
library("readxl")
library("collateral")
library("tidyxl")
library("janitor")

library("dplyr")
library("tidyr")
library("purrr")
library("lubridate")
library("tibble")

library("pointblank")
library("reactable")
library("unheadr")
Code
get_r_package_info <- function() {

  r_package_table <- sessioninfo::package_info()
  rownames(r_package_table) <- NULL

  r_package_table <- r_package_table |>
    tibble::as_tibble() |>
    dplyr::mutate(
      version = ifelse(is.na(r_package_table$loadedversion),
                       r_package_table$ondiskversion,
                       r_package_table$loadedversion)) |>
    dplyr::filter(.data[["attached"]] == TRUE) |>
    dplyr::select(
      dplyr::any_of(c("package", "version",
                      "date", "source"))
    )

  return(r_package_table)

}

get_r_package_info() |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

R Platform Information

Here are the R platform environment used in this analysis.

Code
get_quarto_version <- function(
    test_sys_path = FALSE,
    test_no_path = FALSE
) {

  # Taken from https://github.com/r-lib/sessioninfo/issues/75
  if (isNamespaceLoaded("quarto") && isFALSE(test_sys_path)) {
    path <- quarto::quarto_path() |>
      fs::path_real()
    ver <- system("quarto -V", intern = TRUE)
    if (is.null(path) || isTRUE(test_no_path)) {
      "NA (via quarto)"
    } else {
      paste0(ver, " @ ", path, "/ (via quarto)")
    }
  } else {
    path <- Sys.which("quarto") |>
      fs::path_real()
    if (path == "" || isTRUE(test_no_path)) {
      "NA"
    } else {
      ver <- system("quarto -V", intern = TRUE)
      paste0(ver, " @ ", path)
    }
  }
}


get_knitr_version <- function() {

  knitr_info <- "NA"

  r_package_table <- sessioninfo::package_info(
    pkgs = c("installed")
  ) |>
    dplyr::filter(.data[["package"]] == "knitr")

  if (nrow(r_package_table) == 1) {

    knitr_version <- r_package_table$loadedversion[1]
    knitr_source <- r_package_table$source[1]

    knitr_info <- paste0(
      knitr_version, " from ",
      knitr_source)
  }

  return(knitr_info)

}

get_r_platform_info <- function() {

  r_platform_table <- sessioninfo::platform_info()
  r_platform_table[["quarto"]] <- get_quarto_version()[1]
  r_platform_table[["knitr"]] <- get_knitr_version()[1]

  r_platform_table <- data.frame(
    setting = names(r_platform_table),
    value = unlist(r_platform_table,
                   use.names = FALSE),
    stringsAsFactors = FALSE
  )

  return(r_platform_table)
}

r_platform_table <- get_r_platform_info()
r_platform_table |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

Read Data Attempt 1

We try to read the formatted data but no warning was provided.

sample_excel_attempt_1 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1"
)

Though no warning was provided, when we print the variable, everything is turned to character and things are not read as expected.

  • date_issue: those formatted as Date have been turned to numbers
  • colour_weight: different colour inputs not differentiated
  • text_integer_issue: column turned to text
  • text_numeric_issue: column turned to text
  • numeric_integer_issue: column is read correctly as numeric
  • one_or_zero_issue: column turned to logical
Code
sample_excel_attempt_1 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

Read Data Attempt 2

We try to read the formatted data indicating the right column types but it gives strange and intimidating warnings.

sample_excel_attempt_2 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  col_types = c("text" , "date", 
                "numeric", "numeric", "numeric", "numeric", "numeric")
)

This method resolves the issue when the one_or_zero_issue column being turned to logical by mistake.

Code
sample_excel_attempt_2 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

On the other hand, the other issues are not fully resolved.

  • date_issue: those formatted as General have been turned to blank
  • colour_weight different colour inputs not differentiated

From our previous failed attempt, it is also hard to convince ourselves and others that

  • the numbers in characters from the columns text_integer_issue and text_numeric_issue are read correctly.
  • the numbers in numeric_integer_issue only have integer values.
  • the numbers in one_or_zero_issue only have values 0 or 1.

At this point of time, many of my colleagues actually gave up using R in their data analysis workflow because it is not working for them. For me, manually formatting these excel sheets does not work for me, especially when the data set is large.

There are limited resources to deal with formatted cells in Excel.

I will provide try to provide some ways to deal with these formatted columns mainly using collateral, pointblank, tidyxl.

Read Data Attempt 3

First, I convert the date columns into a list of character, Date and logical vectors so that I preserve both the text and date format.

sample_excel_attempt_3 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  col_types = c("text" , "list", 
                "text", "text", "text", "numeric", "numeric")
  ) |> 
  janitor::clean_names() |> 
  # Check if cohort id is unique
  pointblank::rows_distinct(
    columns = "id",
  )
Code
sample_excel_attempt_3 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

Fix date

Next I create a function that convert dates in character vectors into Date objects, convert logical vector to NA and convert Date vectors into the date format that I want.

convert_dmy_text_to_date <- function(input) {
  if (length(class(input)) == 1) {
    if (class(input) == "character") {
      return(as.Date.character(lubridate::dmy(input)))
    } else if (class(input) == "logical") {
      return(NA)
    }
  }
  return(as.Date.character(lubridate::ymd(input)))
}

However, creating function can lead to unexpected warnings and errors. To view these issues, I use some functions from the collateral R package. collateral::map_peacefully is able to capture various warnings and errors if any and summarised them as a tibble. collateral::has_warnings and collateral::has_errors returns a logical vector indicating which row has a warning.

To get the output is a little tricky, I actually have to the developer of collateral for help in this by opening a GitHub issue. For my case, the purrr::map_vec works for me.

fixed_date <- sample_excel_attempt_3 |>
  dplyr::select(c("id","date_issue")) |> 
  dplyr::mutate(
    converted_date_log = collateral::map_peacefully(
      .x = .data[["date_issue"]],
      .f = convert_dmy_text_to_date
    ),
    converted_date = purrr::map_vec(
      .x = .data[["converted_date_log"]], 
      .f = "result"
    ),
    warning_check = collateral::has_warnings(.data[["converted_date_log"]]),
    error_check = collateral::has_errors(.data[["converted_date_log"]]),

  )
Code
fixed_date |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

I then use the function pointblank::test_col_vals_in_set from the pointblank package to detect if there are any issues by setting no_issue to be either TRUE or FALSE.

no_issue <- fixed_date |>
  pointblank::test_col_vals_in_set(
    columns = c("warning_check", "error_check"),
    set = c(FALSE)    
  )

If there are any issues, I will isolate and display them.

if (!isTRUE(no_issue)) {
  fixed_date |>
    dplyr::filter(
      warning_check == TRUE | error_check == TRUE
    ) |> 
    dplyr::mutate(
      warning_log = purrr::map(.x = .data[["converted_date_log"]],
                               .f = "warnings",
                               .null = NA),
      error_log = purrr::map(.x = .data[["converted_date_log"]],
                             .f = "errors",
                             .null = NA)   
    ) |> 
    reactable::reactable(
      filterable = TRUE,
      defaultPageSize = 5,
      paginationType = "jump",
    )
}

I will then correct the invalid dates accordingly and proceed. We just assume that the date was supposed to be 31-01-2017.

fixed_date <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","date_issue")) |> 
  dplyr::mutate(
    date_issue = dplyr::case_when(
      (.data[["id"]] == "ID0004" & 
       .data[["date_issue"]] == "32/1/2017"
      ) ~ list(c("31/1/2017")),
      .default = .data[["date_issue"]]
    ) 
  ) |> 
  dplyr::mutate(
    converted_date_log = collateral::map_peacefully(
      .x = .data[["date_issue"]],
      .f = convert_dmy_text_to_date
    ),
    converted_date = purrr::map_vec(
      .x = .data[["converted_date_log"]], 
      .f = "result"
    ),
    warning_check = collateral::has_warnings(.data[["converted_date_log"]]),
    error_check = collateral::has_errors(.data[["converted_date_log"]]),

  ) |>
  pointblank::col_vals_in_set(
    columns = c("warning_check", "error_check"),
    set = c(FALSE)    
  ) |> 
  dplyr::select(
    c("id", "converted_date")
  ) |> 
  dplyr::rename(
    date_fixed_yyyy_mm_dd = "converted_date"
  )
Code
fixed_date |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )

Fix weight (Method 1)

We use tidyxl function tidyxl::xlsx_cells to read the excel cell content (data values and some meta data of the cell).

cells <- tidyxl::xlsx_cells(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  include_blank_cells = TRUE)

We observe that each cell is assigned to a local format id. We need to identify which local format id refers to the cell with black colour Calibri font and green colour Calibri font.

Code
cells |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )