# 1 Completing combinations of key variables

Consider the data below. Let a, b, and c each be unique keys from a variety of data sources, where some data sources contain more than one of the keys, but not necessarily all of them at the same time. The goal is to complete the keys best by cross-referencing all other matching pairs.

For example, a=1 and b=11 appear together, and a=1 and c=10 appear together, therefore these three key values should appear together on the first two rows.

library(tidyverse)

Data and best key variables completion.

x is a key variable but does not appear in this subset. The function handles empty keys, too.

# with data for key matching imputation
dat_data <-
tribble(
~a, ~b, ~c, ~x, ~d1, ~d2
,  1, 11, NA, NA,   1,  20
,  1, NA, 10, NA,   2,  21
,  2, NA, NA, NA,   3,  22
, NA, 22, 20, NA,   4,  23
,  2, NA, 20, NA,   5,  24
,  3, 33, NA, NA,   6,  25
,  4, NA, 40, NA,   7,  26
,  5, NA, NA, NA,   8,  27
,  6, NA, 60, NA,   9,  28
,  6, NA, 60, NA,  10,  29
, NA, 77, NA, NA,  11,  30
, NA, 88, 80, NA,  12,  31
, NA, 88, NA, NA,  13,  32
, NA, NA, NA, NA,  14,  33
)

# pre-determined best key variables completion
dat_keys_pre <-
tribble(
~a, ~b, ~c, ~x
,  1, 11, 10, NA
,  2, 22, 20, NA
,  3, 33, NA, NA
,  4, NA, 40, NA
,  5, NA, NA, NA
,  6, NA, 60, NA
, NA, 77, NA, NA
, NA, 88, 80, NA
, NA, NA, NA, NA
)

## 1.1 Functions

# Combine rows with different data values
f_coalesce_by_column <-
function(
dat
) {
# for matching group_by() keys, keeps first non-NA value for each column

# combine rows in data frame containing NA to make complete row
# https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row

## use:
## dat %>%
##   group_by(A) %>%
##   summarise_all(f_coalesce_by_column)

## TESTING
# dat <-
#   # A = 1, easy
#   # A = 2, easy
#   # A = 3, 3rd row has a dup and NA
#   # A = 4, 3rd row has conflucing info
#   tribble(
#    ~A, ~B, ~C, ~D, ~E
#   , 1, NA,  3, NA,  5
#   , 1,  2, NA,  2, NA
#   , 2, NA, NA,  3, NA
#   , 2,  4,  5, NA,  4
#   , 3, NA, NA,  3, NA
#   , 3,  4,  5, NA,  4
#   , 3,  4, NA, NA,  4
#   , 4, NA, NA,  3, NA
#   , 4,  4,  5, NA,  4
#   , 4, 99, 99, 99, NA
#   )

dplyr::coalesce(!!! as.list(dat)) %>%
return()
}

# For a set of key variables that are unique for participants,
#   identify sets of rows with unique key variables in common
#   and collapse them together to have a single "most complete" row
#   of key values by removing the NAs.
# Each key variable must have a unique value for each participant.
f_coalesce_column_set <-
function(
dat
) {

names_var <-
names(dat)

for (i_key in seq_along(names_var)) {
## i_key = 1

# rows for current column without NAs
dat_rows_process <-
dat[!is.na(dat[[i_key]]), ]

# bind to end to process later
dat_rows_skip_NA <-
dat[is.na(dat[[i_key]]), ]

# coalesce for this key column
if (nrow(dat_rows_process) > 0) {
dat_rows_process <-
dat_rows_process %>%
group_by_at(
i_key
) %>%
summarise_all(
f_coalesce_by_column
) %>%
ungroup()
}

# combine, assure original column order for iteration
dat <-
bind_rows(
dat_rows_process
, dat_rows_skip_NA
) %>%
select(
one_of(names_var)
)
}

return(dat)
}

# match and replace less complete rows with most complete row
f_replace_keys_less_with_most_complete <-
function(
dat_data
, dat_most_complete
, col_keys = c("a", "b", "c")
) {

# combination to match key variables, from most to least
dat_combination_var <-
expand.grid(
replicate(
length(col_keys)
, 0:1
, simplify = FALSE)
)
dat_combination_var\$s <-
rowSums(dat_combination_var)
dat_combination_var <-
dat_combination_var %>%
arrange(
-s
) %>%
select(
-s
)
colnames(dat_combination_var) <- col_keys

# dat_combination_var

# list to fill with matches, then bind at end
dat_data_out <- list()
dat_data_in  <- list()

for (i_combination in 1:nrow(dat_combination_var)) {
## i_combination = 1
## i_combination = 2

# keep track of which rows to include and exclude for this combination
ind_all      <- matrix(NA, nrow = nrow(dat_data), ncol = length(col_keys))

keys_include <- (dat_combination_var[i_combination, ] == 1)

for (i_col_keys in seq_along(col_keys)) {
## i_col_keys = 1

if (keys_include[i_col_keys]) {
# values for these keys
ind_all[, i_col_keys] <- !is.na(dat_data[[ col_keys[i_col_keys] ]])
}
if (!keys_include[i_col_keys]) {
# NAs for these col_keys
ind_all[, i_col_keys] <- is.na(dat_data[[ col_keys[i_col_keys] ]])
}

}

# identify rows for this combination
ind_combination <- rowSums(ind_all) == length(col_keys)

dat_data_in[[i_combination]] <-
dat_data[ind_combination,]

if(sum(keys_include) == 0) {
dat_data_out[[i_combination]] <-
dat_data_in[[i_combination]]

} else {

dat_data_out[[i_combination]] <-
right_join(
dat_most_complete
, dat_data_in[[i_combination]]
, by = col_keys[which(keys_include)]
, suffix = c("", ".EMPTY")
, keep = FALSE
) %>%
select(
-ends_with(".EMPTY")
)
}

}

# combine all results
dat_data_updated <-
bind_rows(
dat_data_out
)

return(dat_data_updated)
}

#' Complete multiple key ID variables
#'
#' When data are combined from multiple sources and multiple identifier keys (ID)
#' are used, sometimes in combinations, then we may want to provide the
#' most complete set of the IDs for each observation.
#' By definition, each key value is unique within its column to identify an observational unit.
#'
#' @param dat_data data frame with a set of variables that are keys
#' @param dat_keys optional data frame with only key variables, more complete than set in dat_data
#' @param col_keys key columns names
#'
#' @return dat_data with updated key columns
#' @export
#'
#' @examples
#'
#' dat_data <-
#'   tribble(
#'     ~a, ~b, ~c, ~d1, ~d2
#'   ,  1, 11, NA,   1,  20
#'   ,  1, NA, 10,   2,  21
#'   ,  2, NA, NA,   3,  22
#'   , NA, 22, 20,   4,  23
#'   ,  2, NA, 20,   5,  24
#'   ,  3, 33, NA,   6,  25
#'   ,  4, NA, 40,   7,  26
#'   ,  5, NA, NA,   8,  27
#'   ,  6, NA, 60,   9,  28
#'   ,  6, NA, 60,  10,  29
#'   , NA, 77, NA,  11,  30
#'   , NA, 88, 80,  12,  31
#'   , NA, 88, NA,  13,  32
#'   , NA, NA, NA,  14,  33
#'   )
#'
#' dat_data_updated <-
#'   f_complete_multiple_keys(
#'     dat_data
#'   , dat_keys = NULL
#'   , col_keys = c("a", "b", "c")
#'   )
#'
#' dat_data_updated %>% print(n=Inf)
#'
f_complete_multiple_keys <-
function(
dat_data
, dat_keys = NULL
, col_keys = c("a", "b", "c")
) {

# create ID to restore original row order at end
dat_data <-
dat_data %>%
mutate(
.ID. = 1:n()
)

# create complete keys
if(is.null(dat_keys)) {
dat_keys <-
f_coalesce_column_set(
dat_data %>%
select(
one_of(col_keys)
)
)
}

# update data with complete keys
dat_data_updated <-
f_replace_keys_less_with_most_complete(
dat_data          = dat_data
, dat_most_complete = dat_keys
, col_keys          = col_keys
) %>%
# restore original row order
arrange(
.ID.
) %>%
select(
-.ID.
)

return(dat_data_updated)

}

# 2 Results

## 2.1 Test key completion

Print data.

dat_data %>% print(n=Inf)
# A tibble: 14 x 6
a     b     c x        d1    d2
<dbl> <dbl> <dbl> <lgl> <dbl> <dbl>
1     1    11    NA NA        1    20
2     1    NA    10 NA        2    21
3     2    NA    NA NA        3    22
4    NA    22    20 NA        4    23
5     2    NA    20 NA        5    24
6     3    33    NA NA        6    25
7     4    NA    40 NA        7    26
8     5    NA    NA NA        8    27
9     6    NA    60 NA        9    28
10     6    NA    60 NA       10    29
11    NA    77    NA NA       11    30
12    NA    88    80 NA       12    31
13    NA    88    NA NA       13    32
14    NA    NA    NA NA       14    33

Computing the best key variables completion.

dat_keys <-
f_coalesce_column_set(dat_data %>% select(a, b, c, x))
dat_keys %>% print(n=Inf)
# A tibble: 9 x 4
a     b     c x
<dbl> <dbl> <dbl> <lgl>
1     1    11    10 NA
2     2    22    20 NA
3     4    NA    40 NA
4     6    NA    60 NA
5    NA    88    80 NA
6     3    33    NA NA
7    NA    77    NA NA
8     5    NA    NA NA
9    NA    NA    NA NA   

Pre-determined best key variables completion.

dat_keys_pre %>% print(n=Inf)
# A tibble: 9 x 4
a     b     c x
<dbl> <dbl> <dbl> <lgl>
1     1    11    10 NA
2     2    22    20 NA
3     3    33    NA NA
4     4    NA    40 NA
5     5    NA    NA NA
6     6    NA    60 NA
7    NA    77    NA NA
8    NA    88    80 NA
9    NA    NA    NA NA   

They are equal.

dat_keys      <- dat_keys     %>% arrange(a, b, c, x)
dat_keys_pre  <- dat_keys_pre %>% arrange(a, b, c, x)
all_equal(dat_keys_pre, dat_keys)
[1] TRUE

## 2.2 Best key variables completion with data

dat_data_updated <-
f_complete_multiple_keys(
dat_data
, dat_keys = NULL
, col_keys = c("a", "b", "c", "x")
)

dat_data_updated %>% print(n=Inf)
# A tibble: 14 x 6
a     b     c x        d1    d2
<dbl> <dbl> <dbl> <lgl> <dbl> <dbl>
1     1    11    10 NA        1    20
2     1    11    10 NA        2    21
3     2    22    20 NA        3    22
4     2    22    20 NA        4    23
5     2    22    20 NA        5    24
6     3    33    NA NA        6    25
7     4    NA    40 NA        7    26
8     5    NA    NA NA        8    27
9     6    NA    60 NA        9    28
10     6    NA    60 NA       10    29
11    NA    77    NA NA       11    30
12    NA    88    80 NA       12    31
13    NA    88    80 NA       13    32
14    NA    NA    NA NA       14    33