Tools for creating and applying dictionaries of value-replacement pairs, to clean non-valid values of numeric, categorical, or date-type variables within a dataset.
Install from GitHub with:
# install.packages("remotes")
remotes::install_github("epicentre-msf/dbc")
library(dbc)
data(ll1) # example messy dataset
data(dict_categ1) # example dictionary of categorical vars and allowed values
ll1
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? ten 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed <NA> 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 ? Days F suspect Not sure <NA> <NA> 43918 <NA>
#> 6 M550 39.. Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
dict_clean_numeric <- check_numeric(
ll1,
vars = c("age", "contacts"), # cols that should be numeric
fn = as.integer # values not coercible by `fn` are non-valid
)
dict_clean_numeric
#> # A tibble: 4 × 5
#> variable value replacement query new
#> <chr> <chr> <chr> <chr> <lgl>
#> 1 contacts ten <NA> Non-valid number TRUE
#> 2 age ? <NA> Non-valid number TRUE
#> 3 contacts Not sure <NA> Non-valid number TRUE
#> 4 age 39.. <NA> Non-valid number TRUE
2. Manually review non-valid values and give appropriate replacements, or use keyword “.na” to indicate that the value has been reviewed and cannot be corrected.
Normally one would do this step in a spreadsheet but we’ll do it in R here for simplicity.
dict_clean_numeric$replacement <- c(".na", "39", "10", ".na")
clean_numeric(
ll1,
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric,
fn = as.integer
)
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <int> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? NA 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed NA 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 39 Days F suspect 10 <NA> <NA> 43918 <NA>
#> 6 M550 NA Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
4. If the original dataset is updated, we can repeat the cleaning steps while retaining the corrections already made in the previous cleaning dictionary.
Check for new non-valid numeric values, after incorporating previous cleaning
dict_clean_numeric_update <- check_numeric(
ll2, # same as ll1 but with 3 additional entries
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric, # incorporate previous cleaning before checking
fn = as.integer,
return_all = TRUE # return original cleaning dict + new entries
)
dict_clean_numeric_update
#> # A tibble: 5 × 5
#> variable value replacement query new
#> <chr> <chr> <chr> <chr> <lgl>
#> 1 contacts ten .na Non-valid number NA
#> 2 age ? 39 Non-valid number NA
#> 3 contacts Not sure 10 Non-valid number NA
#> 4 age 39.. .na Non-valid number NA
#> 5 age 6 years <NA> Non-valid number TRUE
Manually specify replacement for new non-valid entry
dict_clean_numeric_update$replacement[5] <- "6"
Apply updated cleaning dictionary to updated dataset
clean_numeric(
ll2,
vars = c("age", "contacts"),
dict_clean = dict_clean_numeric_update,
fn = as.integer
)
#> # A tibble: 10 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <int> <chr> <chr> <chr> <int> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 months F ? NA 43924 April 2020 43940 SENT HOME
#> 3 M104 29 <NA> - confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Year - Confirmed NA 2022-04-10 2020-04-12 Unknown Cure
#> 5 M685 39 Days F suspect 10 <NA> <NA> 43918 <NA>
#> 6 M550 NA Ans Homme Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home
#> 8 M443 10 Months F Confirmed 26 <NA> 43900 43926 Cured
#> 9 M206 6 Years f Conf. 7 43921 43923 ? dead
#> 10 M701 56 Years M Suspected 39 17-03-2020 20-03-2020 05-04-2020 <NA>
dict_clean_categ <- check_categorical(
ll1,
dict_allowed = dict_categ1 # dictionary of categorical vars and their allowed values
)
dict_clean_categ
#> # A tibble: 7 × 4
#> variable value replacement new
#> <chr> <chr> <chr> <lgl>
#> 1 age_unit year <NA> TRUE
#> 2 age_unit ans <NA> TRUE
#> 3 exit_status cure <NA> TRUE
#> 4 sex - <NA> TRUE
#> 5 sex homme <NA> TRUE
#> 6 status ? <NA> TRUE
#> 7 status suspect <NA> TRUE
2. Manually review non-valid values and give appropriate replacements, or use keyword “.na” to indicate that the value has been reviewed and cannot be corrected.
Again, we would normally do this step in a spreadsheet but we do it in R here for simplicity.
dict_clean_categ$replacement <- c(
"Years",
"Years",
"Cured",
".na",
"M",
".na",
"Suspected"
)
clean_categorical(
ll1,
dict_allowed = dict_categ1,
dict_clean = dict_clean_categ
)
#> # A tibble: 7 × 10
#> id age age_unit sex status contacts date_onset date_admit date_exit exit_status
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 M143 14 Years M Suspected 22 43920 2020-04-01 2021.04.02 <NA>
#> 2 M345 8 Months F <NA> ten 43924 April 2020 43940 Sent home
#> 3 M104 29 <NA> <NA> Confirmed 15 <NA> 03_04_2020 43932 Died
#> 4 M623 91 Years <NA> Confirmed <NA> 2022-04-10 2020-04-12 Unknown Cured
#> 5 M685 ? Days F Suspected Not sure <NA> <NA> 43918 <NA>
#> 6 M550 39.. Years M Probable 31 43946 43951 43964 <NA>
#> 7 M190 66 Years M Not a case 17 24/04/2020 43952 43941 Sent home