Tackling Formatted Tabular Data from Excel

By Jeremy Selva

February 15, 2024

Overview

Reading data in formatted cell in excel can be really tricky. In this post, I will share six problematic formatted columns to read and share how I try to handle them.

Data Set

The data set can be found in this link:

https://raw.githubusercontent.com/JauntyJJS/jaunty-blogdown/main/content/blog/2024-02-15-Tackling-Formatted-Cell-Data/sample_excel.xlsx

Here is a peak view of the excel file with some problematic columns.

The column date_issue has two number format. One in General and the other in Date.

The column colour_weight has two colour format. The one in black is weight in pounds and the one in green is weight in kilogram.

The columns text_integer_issue and text_numeric_issue are numeric columns but some cells were formatted as text. These cells are indicated by the green triangle.

The column numeric_integer_issue is an integer column.

The column one_or_zero_issue is a column containing one and zero but it is missing for the first hundreds of rows.

R Packages Used

Here are the R packages used in this analysis.

Code
library("sessioninfo")
library("quarto")
library("fs")
library("knitr")

library("here")
library("readxl")
library("collateral")
library("tidyxl")
library("janitor")

library("dplyr")
library("tidyr")
library("purrr")
library("lubridate")
library("tibble")

library("pointblank")
library("reactable")
library("unheadr")
Code
get_r_package_info <- function() {

  r_package_table <- sessioninfo::package_info()
  rownames(r_package_table) <- NULL

  r_package_table <- r_package_table |>
    tibble::as_tibble() |>
    dplyr::mutate(
      version = ifelse(is.na(r_package_table$loadedversion),
                       r_package_table$ondiskversion,
                       r_package_table$loadedversion)) |>
    dplyr::filter(.data[["attached"]] == TRUE) |>
    dplyr::select(
      dplyr::any_of(c("package", "version",
                      "date", "source"))
    )

  return(r_package_table)

}

get_r_package_info() |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
package
version
date
source
collateral
0.5.2
2021-10-25
CRAN (R 4.4.0)
dplyr
1.1.4
2023-11-17
RSPM
fs
1.6.4
2024-04-25
RSPM
here
1.0.1
2020-12-13
CRAN (R 4.4.0)
janitor
2.2.0
2023-02-02
CRAN (R 4.4.0)
1–5 of 17 rows
of 4

R Platform Information

Here are the R platform environment used in this analysis.

Code
get_quarto_version <- function(
    test_sys_path = FALSE,
    test_no_path = FALSE
) {

  # Taken from https://github.com/r-lib/sessioninfo/issues/75
  if (isNamespaceLoaded("quarto") && isFALSE(test_sys_path)) {
    path <- quarto::quarto_path() |>
      fs::path_real()
    ver <- system("quarto -V", intern = TRUE)
    if (is.null(path) || isTRUE(test_no_path)) {
      "NA (via quarto)"
    } else {
      paste0(ver, " @ ", path, "/ (via quarto)")
    }
  } else {
    path <- Sys.which("quarto") |>
      fs::path_real()
    if (path == "" || isTRUE(test_no_path)) {
      "NA"
    } else {
      ver <- system("quarto -V", intern = TRUE)
      paste0(ver, " @ ", path)
    }
  }
}


get_knitr_version <- function() {

  knitr_info <- "NA"

  r_package_table <- sessioninfo::package_info(
    pkgs = c("installed")
  ) |>
    dplyr::filter(.data[["package"]] == "knitr")

  if (nrow(r_package_table) == 1) {

    knitr_version <- r_package_table$loadedversion[1]
    knitr_source <- r_package_table$source[1]

    knitr_info <- paste0(
      knitr_version, " from ",
      knitr_source)
  }

  return(knitr_info)

}

get_r_platform_info <- function() {

  r_platform_table <- sessioninfo::platform_info()
  r_platform_table[["quarto"]] <- get_quarto_version()[1]
  r_platform_table[["knitr"]] <- get_knitr_version()[1]

  r_platform_table <- data.frame(
    setting = names(r_platform_table),
    value = unlist(r_platform_table,
                   use.names = FALSE),
    stringsAsFactors = FALSE
  )

  return(r_platform_table)
}

r_platform_table <- get_r_platform_info()
r_platform_table |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
setting
value
version
R version 4.4.0 (2024-04-24 ucrt)
os
Windows 11 x64 (build 22631)
system
x86_64, mingw32
ui
RTerm
language
(EN)
1–5 of 12 rows
of 3

Read Data Attempt 1

We try to read the formatted data but no warning was provided.

sample_excel_attempt_1 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1"
)

Though no warning was provided, when we print the variable, everything is turned to character and things are not read as expected.

  • date_issue: those formatted as Date have been turned to numbers
  • colour_weight: different colour inputs not differentiated
  • text_integer_issue: column turned to text
  • text_numeric_issue: column turned to text
  • numeric_integer_issue: column is read correctly as numeric
  • one_or_zero_issue: column turned to logical
Code
sample_excel_attempt_1 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_issue
colour_weight Black in pounds Green in kilograms
text_integer_issue
text_numeric_issue
numeric_integer_issue
one_or_zero_issue
ID0001
11/11/2017
92
74
1.6
1
ID0002
18/10/2017
194
54
0.14000000000000001
55
ID0003
61
53
0.96
9
ID0004
32/1/2017
165
64
0.02
2
ID0005
25941
148
48
0.23
3
1–5 of 1053 rows
of 211

Read Data Attempt 2

We try to read the formatted data indicating the right column types but it gives strange and intimidating warnings.

sample_excel_attempt_2 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  col_types = c("text" , "date", 
                "numeric", "numeric", "numeric", "numeric", "numeric")
)

This method resolves the issue when the one_or_zero_issue column being turned to logical by mistake.

Code
sample_excel_attempt_2 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_issue
colour_weight Black in pounds Green in kilograms
text_integer_issue
text_numeric_issue
numeric_integer_issue
one_or_zero_issue
ID0001
92
74
1.6
1
ID0002
194
54
0.14
55
ID0003
61
53
0.96
9
ID0004
165
64
0.02
2
ID0005
1971-01-08T00:00:00Z
148
48
0.23
3
1–5 of 1053 rows
of 211

On the other hand, the other issues are not fully resolved.

  • date_issue: those formatted as General have been turned to blank
  • colour_weight different colour inputs not differentiated

From our previous failed attempt, it is also hard to convince ourselves and others that

  • the numbers in characters from the columns text_integer_issue and text_numeric_issue are read correctly.
  • the numbers in numeric_integer_issue only have integer values.
  • the numbers in one_or_zero_issue only have values 0 or 1.

At this point of time, many of my colleagues actually gave up using R in their data analysis workflow because it is not working for them. For me, manually formatting these excel sheets does not work for me, especially when the data set is large.

There are limited resources to deal with formatted cells in Excel.

I will provide try to provide some ways to deal with these formatted columns mainly using collateral, pointblank, tidyxl.

Read Data Attempt 3

First, I convert the date columns into a list of character, Date and logical vectors so that I preserve both the text and date format.

sample_excel_attempt_3 <- readxl::read_excel(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  col_types = c("text" , "list", 
                "text", "text", "text", "numeric", "numeric")
  ) |> 
  janitor::clean_names() |> 
  # Check if cohort id is unique
  pointblank::rows_distinct(
    columns = "id",
  )
Code
sample_excel_attempt_3 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_issue
colour_weight_black_in_pounds_green_in_kilograms
text_integer_issue
text_numeric_issue
numeric_integer_issue
one_or_zero_issue
ID0001
11/11/2017
92
74
1.6
1
ID0002
18/10/2017
194
54
0.14000000000000001
55
ID0003
61
53
0.96
9
ID0004
32/1/2017
165
64
0.02
2
ID0005
1971-01-08T00:00:00Z
148
48
0.23
3
1–5 of 1053 rows
of 211

Fix date

Next I create a function that convert dates in character vectors into Date objects, convert logical vector to NA and convert Date vectors into the date format that I want.

convert_dmy_text_to_date <- function(input) {
  if (length(class(input)) == 1) {
    if (class(input) == "character") {
      return(as.Date.character(lubridate::dmy(input)))
    } else if (class(input) == "logical") {
      return(NA)
    }
  }
  return(as.Date.character(lubridate::ymd(input)))
}

However, creating function can lead to unexpected warnings and errors. To view these issues, I use some functions from the collateral R package. collateral::map_peacefully is able to capture various warnings and errors if any and summarised them as a tibble. collateral::has_warnings and collateral::has_errors returns a logical vector indicating which row has a warning.

To get the output is a little tricky, I actually have to the developer of collateral for help in this by opening a GitHub issue. For my case, the purrr::map_vec works for me.

fixed_date <- sample_excel_attempt_3 |>
  dplyr::select(c("id","date_issue")) |> 
  dplyr::mutate(
    converted_date_log = collateral::map_peacefully(
      .x = .data[["date_issue"]],
      .f = convert_dmy_text_to_date
    ),
    converted_date = purrr::map_vec(
      .x = .data[["converted_date_log"]], 
      .f = "result"
    ),
    warning_check = collateral::has_warnings(.data[["converted_date_log"]]),
    error_check = collateral::has_errors(.data[["converted_date_log"]]),

  )
Code
fixed_date |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_issue
converted_date_log
converted_date
warning_check
error_check
ID0001
11/11/2017
[object Object]
2017-11-11
false
false
ID0002
18/10/2017
[object Object]
2017-10-18
false
false
ID0003
[object Object]
false
false
ID0004
32/1/2017
[object Object]
true
false
ID0005
1971-01-08T00:00:00Z
[object Object]
1971-01-08
false
false
1–5 of 1053 rows
of 211

I then use the function pointblank::test_col_vals_in_set from the pointblank package to detect if there are any issues by setting no_issue to be either TRUE or FALSE.

no_issue <- fixed_date |>
  pointblank::test_col_vals_in_set(
    columns = c("warning_check", "error_check"),
    set = c(FALSE)    
  )

If there are any issues, I will isolate and display them.

if (!isTRUE(no_issue)) {
  fixed_date |>
    dplyr::filter(
      warning_check == TRUE | error_check == TRUE
    ) |> 
    dplyr::mutate(
      warning_log = purrr::map(.x = .data[["converted_date_log"]],
                               .f = "warnings",
                               .null = NA),
      error_log = purrr::map(.x = .data[["converted_date_log"]],
                             .f = "errors",
                             .null = NA)   
    ) |> 
    reactable::reactable(
      filterable = TRUE,
      defaultPageSize = 5,
      paginationType = "jump",
    )
}
id
date_issue
converted_date_log
converted_date
warning_check
error_check
warning_log
error_log
ID0004
32/1/2017
[object Object]
true
false
All formats failed to parse. No formats found.

I will then correct the invalid dates accordingly and proceed. We just assume that the date was supposed to be 31-01-2017.

fixed_date <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","date_issue")) |> 
  dplyr::mutate(
    date_issue = dplyr::case_when(
      (.data[["id"]] == "ID0004" & 
       .data[["date_issue"]] == "32/1/2017"
      ) ~ list(c("31/1/2017")),
      .default = .data[["date_issue"]]
    ) 
  ) |> 
  dplyr::mutate(
    converted_date_log = collateral::map_peacefully(
      .x = .data[["date_issue"]],
      .f = convert_dmy_text_to_date
    ),
    converted_date = purrr::map_vec(
      .x = .data[["converted_date_log"]], 
      .f = "result"
    ),
    warning_check = collateral::has_warnings(.data[["converted_date_log"]]),
    error_check = collateral::has_errors(.data[["converted_date_log"]]),

  ) |>
  pointblank::col_vals_in_set(
    columns = c("warning_check", "error_check"),
    set = c(FALSE)    
  ) |> 
  dplyr::select(
    c("id", "converted_date")
  ) |> 
  dplyr::rename(
    date_fixed_yyyy_mm_dd = "converted_date"
  )
Code
fixed_date |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_fixed_yyyy_mm_dd
ID0001
2017-11-11
ID0002
2017-10-18
ID0003
ID0004
2017-01-31
ID0005
1971-01-08
1–5 of 1053 rows
of 211

Fix weight (Method 1)

We use tidyxl function tidyxl::xlsx_cells to read the excel cell content (data values and some meta data of the cell).

cells <- tidyxl::xlsx_cells(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx"),
  sheet = "Sheet1",
  include_blank_cells = TRUE)

We observe that each cell is assigned to a local format id. We need to identify which local format id refers to the cell with black colour Calibri font and green colour Calibri font.

Code
cells |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
sheet
address
row
col
is_blank
content
data_type
error
logical
numeric
date
character
character_formatted
formula
is_array
formula_ref
formula_group
comment
height
width
row_outline_level
col_outline_level
style_format
local_format_id
Sheet1
A1
1
1
false
16
character
id
[object Object]
false
43.2
8.38
1
1
Normal
1
Sheet1
B1
1
2
false
14
character
date_issue
[object Object]
false
43.2
10.6640625
1
1
Normal
1
Sheet1
C1
1
3
false
13
character
colour_weight Black in pounds Green in kilograms
[object Object]
false
43.2
19.5546875
1
1
Normal
8
Sheet1
D1
1
4
false
8
character
text_integer_issue
[object Object]
false
43.2
17.88671875
1
1
Normal
1
Sheet1
E1
1
5
false
9
character
text_numeric_issue
[object Object]
false
43.2
18.6640625
1
1
Normal
1
1–5 of 6335 rows
of 1267

We use tidyxl function tidyxl::xlsx_formats to read the excel file in formats.

formats <- tidyxl::xlsx_formats(
  path = here::here("content",
                    "blog",
                    "2024-02-15-Tackling-Formatted-Cell-Data",
                    "sample_excel.xlsx")
)

Using green as the running example, first in excel, click on a cell with green font. Next, click on the drop down button beside the font colour button.

This will give the following output. Next click on More Colors…

Go to the Custom tab and extract the hex code saying #00B050 for green.

Next, use https://www.schemecolor.com/?getcolor={hex code} ( https://www.schemecolor.com/sample?getcolor=00B050 in our running example) to find out what the Hex8 code is for the green font.

We can do a similar step to obtain the Hex8 code for the black font.

As such, we can identify the local format id accordingly with the black and green Hex8 code as #FF000000 and #FF00B050 respectively.

green_font_local_format_id <- which(formats$local$font$color$rgb == "FF00B050")
black_font_local_format_id <- which(formats$local$font$color$rgb == "FF000000")

green_font_local_format_id
[1]  3 14
black_font_local_format_id
 [1]  1  2  4  5  6  7  8  9 10 15

We identify the column index of colour_weight_black_in_pounds_green_in_kilograms.

weight_column_index <- which(
  colnames(sample_excel_attempt_3) == "colour_weight_black_in_pounds_green_in_kilograms"
)

weight_column_index
[1] 3

With the column index and local format id identified, we can filter the cells data to isolate cells which contain the weight in pounds and kilogram.

weight_pounds_converted <- cells |>
  dplyr::filter(.data[["row"]] != 1) |> 
  dplyr::filter(.data[["col"]] == weight_column_index) |> 
  dplyr::filter(.data[["local_format_id"]] %in% black_font_local_format_id) |> 
  pointblank::col_vals_in_set(
    columns = c("data_type"),
    set = c("numeric")    
  ) |>   
  dplyr::filter(.data[["data_type"]] == "numeric") |>  
  dplyr::select(c("row", "numeric")) |> 
  dplyr::rename(weight_pounds_converted = "numeric") |> 
  dplyr::mutate(
    weight_pounds_converted = 
      janitor::round_half_up(.data[["weight_pounds_converted"]] / 2.2046,
                             digits = 0)
  )
Code
weight_pounds_converted |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
row
weight_pounds_converted
3
88
5
75
6
67
9
83
36
87
1–5 of 602 rows
of 121
weight_in_kg <- cells |>
  dplyr::filter(.data[["row"]] != 1) |> 
  dplyr::filter(.data[["col"]] == weight_column_index) |> 
  dplyr::filter(.data[["local_format_id"]] %in% green_font_local_format_id) |> 
  pointblank::col_vals_in_set(
    columns = c("data_type"),
    set = c("numeric")    
  ) |>  
  dplyr::filter(.data[["data_type"]] == "numeric") |>  
  dplyr::select(c("row", "numeric")) |> 
  dplyr::rename(weight_kg = "numeric")
Code
weight_in_kg |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
row
weight_kg
2
92
4
61
7
86
8
84
10
80
1–5 of 451 rows
of 91

We can filter the cells data to isolate cells which contain the ids.

id_column_index <- which(
  colnames(sample_excel_attempt_3) == "id"
)

id_cells <- cells |>
  dplyr::filter(.data[["row"]] != 1) |> 
  dplyr::filter(.data[["col"]] == id_column_index) |>
  pointblank::col_vals_in_set(
    columns = c("data_type"),
    set = c("character")
  ) |> 
  dplyr::select(c("row", "character")) |> 
  dplyr::rename(id = "character")

We combine the weight data together.

fixed_weight <- id_cells |> 
  dplyr::left_join(weight_pounds_converted, 
                   by = dplyr::join_by("row"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(weight_in_kg, 
                   by = dplyr::join_by("row"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  tidyr::unite(
    col = "weight_fixed_kg",
    c("weight_pounds_converted",
      "weight_kg"),
    remove = TRUE,
    na.rm = TRUE) |> 
  dplyr::select(c("id", "weight_fixed_kg"))
Code
fixed_weight |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
weight_fixed_kg
ID0001
92
ID0002
88
ID0003
61
ID0004
75
ID0005
67
1–5 of 1053 rows
of 211

Fix weight (Method 2)

An alternative method is to use unheadr::annotate_mf_all from the unheadr package. However the last column one_or_zero_issue must be removed.

If not, the following error will be displayed

sample_excel_attempt_4 <- unheadr::annotate_mf_all(
  xlfilepath = here::here("content",
                          "blog", 
                          "2024-02-15-Tackling-Formatted-Cell-Data", 
                          "sample_excel.xlsx")
)
Error in unheadr::annotate_mf_all(xlfilepath = here::here("content", "blog", : Check spreadsheet for blank cells in seemingly empty rows

Here is the file with the last column removed:

https://raw.githubusercontent.com/JauntyJJS/jaunty-blogdown/main/content/blog/2024-02-15-Tackling-Formatted-Cell-Data/sample_excel_remove_last.xlsx

and the output of reading this file with unheadr::annotate_mf_all.

sample_excel_attempt_4 <- unheadr::annotate_mf_all(
  xlfilepath = here::here("content",
                          "blog", 
                          "2024-02-15-Tackling-Formatted-Cell-Data", 
                          "sample_excel_remove_last.xlsx")
)
Code
sample_excel_attempt_4 |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump"
  )
id
date_issue
colour_weight Black in pounds Green in kilograms
text_integer_issue
text_numeric_issue
numeric_integer_issue
(color-FF000000) ID0001
(color-FF000000) 11/11/2017
(color-FF00B050) 92
(color-FF000000) 74
(color-FF000000) 1.6
(color-FF000000) 1
(color-FF000000) ID0002
(color-FF000000) 18/10/2017
(color-FF000000) 194
(color-FF000000) 54
(color-FF000000) 0.14000000000000001
(color-FF000000) 55
(color-FF000000) ID0003
(color-FF000000) NA
(color-FF00B050) 61
(color-FF000000) 53
(color-FF000000) 0.96
(color-FF000000) 9
(color-FF000000) ID0004
(color-FF000000) 32/1/2017
(color-FF000000) 165
(color-FF000000) 64
(color-FF000000) 0.02
(color-FF000000) 2
(color-FF000000) ID0005
(color-FF000000) 25941
(color-FF000000) 148
(color-FF000000) 48
(color-FF000000) 0.23
(color-FF000000) 3
1–5 of 1053 rows
of 211

Check integer in character

When we tried to read the excel data earlier (sample_excel_attempt_2) using col_types as numeric for the text_integer_issue column, we received a large chunk of error. Here are the relevant warning messages in relation to this column.

  • Warning: Coercing text to numeric in D3 / R3C4: ‘54’
  • Warning: Coercing text to numeric in D7 / R7C4: ‘33’
  • Warning: Coercing text to numeric in D8 / R8C4: ‘35’
  • Warning: Coercing text to numeric in D9 / R9C4: ‘53’
  • Warning: Coercing text to numeric in D10 / R10C4: ‘187’

The warnings inform the user that it sees “54”, “33”, “35” and “53” in cells D3, D7, D8 and D9 respectively as “text” and it is forced to be converted to numeric. While in these cases, there are no issues when they are converted to numeric.

It may be safer to check if the column truly contain only positive integers even though they are in text, rather than relying on the long warning messages.

We continue with sample_excel_attempt_3 which reads the text_integer_issue column as text instead. We use some functions from the pointblank package like pointblank::col_vals_regex and pointblank::col_vals_gt to ensure that the text are positive integers.

integer_check_from_text <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","text_integer_issue")) |> 
  pointblank::col_vals_regex(
    columns = c("text_integer_issue"),
    regex = "^[1-9]([0-9]+)?(.[0]+)?$",
    na_pass = TRUE,
  ) |> 
  dplyr::mutate(
    text_integer_issue = as.integer(.data[["text_integer_issue"]])
  ) |> 
  pointblank::col_vals_gt(
    columns = c("text_integer_issue"),
    value = 0,
    na_pass = TRUE,    
  ) |> 
  dplyr::rename(
    text_integer_fixed = "text_integer_issue"
  )

Check numeric in character

When we tried to read the excel data earlier (sample_excel_attempt_2) using col_types as numeric for the text_integer_issue column, we received a large chunk of error. Here are the relevant warning messages in relation to this column.

  • Warning: Coercing text to numeric in E4 / R4C5: ‘0.96’
  • Warning: Coercing text to numeric in E5 / R5C5: ‘0.02’
  • Warning: Coercing text to numeric in E6 / R6C5: ‘0.23’
  • Warning: Coercing text to numeric in E10 / R10C5: ‘0.06’

The warnings inform the user that it sees “0.96”, “0.02”, “0.23” in cells E4, E5 and E6 respectively as “text” and it is forced to be converted to numeric.

Similarly, we can use the same functions pointblank::col_vals_regex and pointblank::col_vals_gt to ensure that the text are positive numbers. Do note that when numeric are read as text in this dataset. It can be of the form

  • “0.140”
  • “7.07E-2”
numeric_check <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","text_numeric_issue")) |> 
  pointblank::col_vals_regex(
    columns = c("text_numeric_issue"),
    regex = "^[0-9]+((.[0-9]+)?(E(-)?[0-9]+)?)?$",
    na_pass = TRUE,
  ) |> 
  dplyr::mutate(
    text_numeric_issue = as.numeric(.data[["text_numeric_issue"]])
  ) |> 
  pointblank::col_vals_gt(
    columns = c("text_numeric_issue"),
    value = 0,
    na_pass = TRUE,    
  ) |> 
  dplyr::rename(
    text_numeric_fixed = "text_numeric_issue"
  )

Check integers in numeric

It may be necessary to check if a column only has integers. However, when using the function readxl::read_excel with col_types set to “numeric”. There is no way to check if the column truly contain integers, even when no warning is displayed. We also cannot use pointblank::col_vals_regex because the column is not in text.

We create some functions to check if a numeric vector only has integers. The is_integer_vector returns FALSE when at least one of its element is not an integer.

is_integer_value <- function(input_value,
                             allow_na = FALSE) {

  boolean_result <- FALSE

  # When input value is NA
  if (is.na(input_value)) {
    if (isTRUE(allow_na)) {
      boolean_result <- TRUE
      return(boolean_result)
    } else {
      return(boolean_result)
    }
  }

  # When input value is not numeric
  if (isTRUE(!is.numeric(input_value))) {
    return(boolean_result)
  }

  # When input value is numeric
  boolean_result <- isTRUE(input_value %% 1 == 0)

  return(boolean_result)
}


is_integer_vector <- function(input_vector,
                              allow_na = FALSE) {

  boolean_results <- input_vector |>
    purrr::map_lgl(
      .f = is_integer_value,
      allow_na = allow_na
    )
  return(boolean_results)
}

We use pointblank::col_vals_expr to do a validation using our custom made is_integer_vector function.

Here is a case when it fails.

numeric_check |>
  pointblank::expect_col_vals_expr(
    expr = ~ is_integer_vector(numeric_check[["text_numeric_fixed"]],
                               allow_na = TRUE)
)
Error: The `expect_col_vals_expr()` validation failed beyond the absolute threshold level (1).
* failure level (1049) >= failure threshold (1)

We apply the is_integer_vector function on the numeric_integer_issue column before converting the column to an integer column.

integer_check_from_numeric <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","numeric_integer_issue")) |> 
  pointblank::col_vals_expr(
    expr = ~ is_integer_vector(sample_excel_attempt_3[["numeric_integer_issue"]],
                               allow_na = TRUE)
  ) |> 
  dplyr::mutate(
    numeric_integer_issue = as.integer(.data[["numeric_integer_issue"]]),
  ) |> 
  dplyr::rename(
    numeric_integer_fixed = "numeric_integer_issue"
  )

Check values in set

For the column one_or_zero_issue, we use the function pointblank::col_vals_in_set to check if the column contains 0, 1 or NA.

one_or_zero_check <- sample_excel_attempt_3 |> 
  dplyr::select(c("id","one_or_zero_issue")) |> 
  pointblank::col_vals_in_set(
    columns = c("one_or_zero_issue"),
    set = c(NA, 0, 1)
  ) |> 
  dplyr::rename(
    one_or_zero_fixed = "one_or_zero_issue"
  )

Combine

With the issues resolved for the problematic columns mentioned earlier, we combine them together to give our cleaned dataset.

cleaned_data <- sample_excel_attempt_3 |> 
  dplyr::select("id") |> 
  dplyr::left_join(fixed_date, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(fixed_weight, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(integer_check_from_text, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(numeric_check, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(integer_check_from_numeric, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one") |> 
  dplyr::left_join(one_or_zero_check, 
                   by = dplyr::join_by("id"),
                   unmatched = "error",
                   relationship = "one-to-one")
Code
cleaned_data |>
  reactable::reactable(
    filterable = TRUE,
    defaultPageSize = 5,
    paginationType = "jump",
  )
id
date_fixed_yyyy_mm_dd
weight_fixed_kg
text_integer_fixed
text_numeric_fixed
numeric_integer_fixed
one_or_zero_fixed
ID0001
2017-11-11
92
74
1.6
1
ID0002
2017-10-18
88
54
0.14
55
ID0003
61
53
0.96
9
ID0004
2017-01-31
75
64
0.02
2
ID0005
1971-01-08
67
48
0.23
3
1–5 of 1053 rows
of 211