serial joins two data frames on as many columns as possible until every row in df1 is matched

I have two dataframes I need to join. the goal is to join on as many columns as possible and proceed serially from most to least number fo exact column matches. I need to solve this for K columns. for a step of matching j columns (j < k) the order of the column permutations doesn't matter. This example is for 3 columns with dplyr:

# 3-column toy example 
library(dplyr)
df1_rows = 50
df2_rows = 100
set.seed(1)

df1 <- data.frame(id1 = sample(LETTERS,df1_rows,replace = T),
                  A = sample(1:5,df1_rows,replace = T),
                  B = sample(1:5,df1_rows,replace = T),
                  C = sample(1:5,df1_rows,replace = T) )
df1
df2 <- data.frame(A = sample(1:5,df2_rows,replace = T),
                  B = sample(1:5,df2_rows,replace = T),
                  C = sample(1:5,df2_rows,replace = T) ) %>%
  distinct(A,B,C,.keep_all = T) %>%
  distinct(A,B,.keep_all = T) %>%
  distinct(B,C,.keep_all = T) %>%
  distinct(A,C,.keep_all = T) %>%
  rowwise() %>%
  mutate(id2 = sample(LETTERS,1,replace = T))

# match on 3 columns
col_match_3 = df1 %>%
  inner_join(df2 , by = c("A","B","C")) %>%
  distinct(id1, A,B,C,.keep_all = T)

#serial matches on two columns
col_match_AB = df1 %>%
  anti_join(col_match_3, by = c("A","B","C")) %>%
  inner_join(df2 %>% select(-C), by = c("A","B")) %>%
  distinct(id1, A,B,C,.keep_all = T)

col_match_BC = df1 %>%
  anti_join(col_match_3, by = c("A","B","C")) %>%
  anti_join(col_match_AB, by = c("A","B")) %>%
  inner_join(df2 %>% select(-A), by = c("B","C")) %>%
  distinct(id1, A,B,C,.keep_all = T)

col_match_AC = df1 %>%
  anti_join(col_match_3, by = c("A","B","C")) %>%
  anti_join(col_match_AB, by = c("A","B")) %>%
  anti_join(col_match_BC, by = c("B","C")) %>%
  inner_join(df2 %>% select(-B), by = c("A","C")) %>%
  distinct(id1, A,B,C,.keep_all = T)

#join on one col
col_match_A = df1 %>%
  anti_join(col_match_3, by = c("A","B","C")) %>%
  anti_join(col_match_AB, by = c("A","B")) %>%
  anti_join(col_match_BC, by = c("B","C")) %>%
  anti_join(col_match_AC, by = c("A","C")) %>%
  inner_join(df2 %>% select(id2,A), by = "A") %>%
  distinct(id1, A,B,C,.keep_all = T)

col_match_B = df1 %>%
  anti_join(col_match_3, by = c("A","B","C")) %>%
  anti_join(col_match_AB, by = c("A","B")) %>%
  anti_join(col_match_BC, by = c("B","C")) %>%
  anti_join(col_match_AC, by = c("A","C")) %>%
  anti_join(col_match_A, by = "A") %>%
  inner_join(df2 %>% select(id2,B), by = "B") %>%
  distinct(.keep_all = T)
  

res = 
  col_match_3 %>%
  bind_rows(col_match_AB) %>%
  bind_rows(col_match_BC) %>%
  bind_rows(col_match_AC) %>%
  bind_rows(col_match_A)

res


Read more here: https://stackoverflow.com/questions/67009364/serial-joins-two-data-frames-on-as-many-columns-as-possible-until-every-row-in-d

Content Attribution

This content was originally published by Eric at Recent Questions - Stack Overflow, and is syndicated here via their RSS feed. You can read the original post over there.

%d bloggers like this: