df1和df2之间的抗_join,但是如何将DF2中的所有不匹配更改为na

发布于 2025-02-12 03:13:47 字数 5791 浏览 1 评论 0 原文

以下是我的两个dataFrames,DF1和DF2,

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))

id     text response
1  632592651     asdf        y
2  633322173      cat        y
3  634703802      dog        y
4  634927873    mouse        n
5  635812953 elephant        n
6  636004739    goose        y
7  636101211      rat        y
8  636157799     mice        n
9  636263106    kitty        n
10 636752420   kitten        y

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))

 id           text volume
1   632592651       asdf_xyz   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  636809222      tiger_xyz      6
12 2004722036           lion    345
13 2004894388        leopard     23
14 2005045755        ostrich      2
15 2005535472       kangaroo   4778
16 2005630542       platypus    234
17 2005788781           fish   8675
18 2005809679        reptile   3459
19 2005838317        mammals      8
20 2005866692 amphibians_xyz      9

如何将非匹配项从DF2的行ID1:20更改为Na(即与DF1无匹配)以及ID1的列“文本”(IE ASDF_XYZ)到na?

我尝试过

library(dplyr)

df3 <- df2 %>%
  anti_join(df1, by=c("id"))

id           text volume
1   636809222      tiger_xyz      6
2  2004722036           lion    345
3  2004894388        leopard     23
4  2005045755        ostrich      2
5  2005535472       kangaroo   4778
6  2005630542       platypus    234
7  2005788781           fish   8675
8  2005809679        reptile   3459
9  2005838317        mammals      8
10 2005866692 amphibians_xyz      9

df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA

(一个一个一个,是因为我找不到解决方案如何将数据框架的全部值更改为na)

id text volume
1  <NA> <NA>   <NA>
2  <NA> <NA>   <NA>
3  <NA> <NA>   <NA>
4  <NA> <NA>   <NA>
5  <NA> <NA>   <NA>
6  <NA> <NA>   <NA>
7  <NA> <NA>   <NA>
8  <NA> <NA>   <NA>
9  <NA> <NA>   <NA>
10 <NA> <NA>   <NA>

和df4(),

inner_join(x = df1, 
           y = df2, 
           by = "id") %>%
  mutate_if(is.factor, as.character) %>%
  mutate(text = ifelse(test = text.x != text.y, 
                       yes = NA, 
                       no = text.x)) %>%
  select(id, text, response, volume)

id     text response volume
1  632592651     <NA>        y   1234
2  633322173      cat        y    432
3  634703802      dog        y    324
4  634927873    mouse        n    333
5  635812953 elephant        n   2223
6  636004739    goose        y 412346
7  636101211      rat        y   7456
8  636157799     mice        n   3456
9  636263106    kitty        n   2345
10 636752420   kitten        y   2345

但不确定如何用DF3和DF4替换DF2。所需的输出如下所示:

id           text volume
1   632592651       NA   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  NA               NA      NA
12  NA               NA      NA
13  NA               NA      NA
14  NA               NA      NA
15  NA               NA      NA
16  NA               NA      NA
17  NA               NA      NA
18  NA               NA      NA
19  NA               NA      NA
20  NA               NA      NA

有人可以帮忙吗? 如果可能的话,我还可以知道是否有手动方法可以根据DF3 $ ID选择DF2的子集并将所有值更改为NA?

第2部分:

在我的请求的第二部分中,我想从JOAD_DF创建另一个数据范围,该数据范围仅在DF1中出现(称为fund_in_df1)。输出的示例:

found_in_df1:

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345

该解决方案在如何返回df1和df2中匹配列'id”的行值,而不是列'text'',然后返回na中的不匹配列“文本”中的不匹配?但是,我正在寻找一种替代方法,即,可以使用df1从加入_df编写脚本来获得fund_in_df1,因为我们有df1并加入了_df?

Below are my two dataframes, df1 and df2

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))

id     text response
1  632592651     asdf        y
2  633322173      cat        y
3  634703802      dog        y
4  634927873    mouse        n
5  635812953 elephant        n
6  636004739    goose        y
7  636101211      rat        y
8  636157799     mice        n
9  636263106    kitty        n
10 636752420   kitten        y

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))

 id           text volume
1   632592651       asdf_xyz   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  636809222      tiger_xyz      6
12 2004722036           lion    345
13 2004894388        leopard     23
14 2005045755        ostrich      2
15 2005535472       kangaroo   4778
16 2005630542       platypus    234
17 2005788781           fish   8675
18 2005809679        reptile   3459
19 2005838317        mammals      8
20 2005866692 amphibians_xyz      9

How do I change the non-matching items from row id1:20 of df2 to NA (i.e. all of them as no matching with df1) and the column 'text' (i.e. asdf_xyz) of id1 to NA?

I have tried

library(dplyr)

df3 <- df2 %>%
  anti_join(df1, by=c("id"))

id           text volume
1   636809222      tiger_xyz      6
2  2004722036           lion    345
3  2004894388        leopard     23
4  2005045755        ostrich      2
5  2005535472       kangaroo   4778
6  2005630542       platypus    234
7  2005788781           fish   8675
8  2005809679        reptile   3459
9  2005838317        mammals      8
10 2005866692 amphibians_xyz      9

df3$id[df3$id != 0] <- NA
df3$text[df3$text != 0] <- NA
df3$volume[df3$volume != 0] <- NA

(Doing this one by one because I couldn't find solution how to change the entire value of the dataframe to NA)

id text volume
1  <NA> <NA>   <NA>
2  <NA> <NA>   <NA>
3  <NA> <NA>   <NA>
4  <NA> <NA>   <NA>
5  <NA> <NA>   <NA>
6  <NA> <NA>   <NA>
7  <NA> <NA>   <NA>
8  <NA> <NA>   <NA>
9  <NA> <NA>   <NA>
10 <NA> <NA>   <NA>

and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)

inner_join(x = df1, 
           y = df2, 
           by = "id") %>%
  mutate_if(is.factor, as.character) %>%
  mutate(text = ifelse(test = text.x != text.y, 
                       yes = NA, 
                       no = text.x)) %>%
  select(id, text, response, volume)

id     text response volume
1  632592651     <NA>        y   1234
2  633322173      cat        y    432
3  634703802      dog        y    324
4  634927873    mouse        n    333
5  635812953 elephant        n   2223
6  636004739    goose        y 412346
7  636101211      rat        y   7456
8  636157799     mice        n   3456
9  636263106    kitty        n   2345
10 636752420   kitten        y   2345

but not sure how to replace df2 with df3 and df4. The desired output is shown below:

id           text volume
1   632592651       NA   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  NA               NA      NA
12  NA               NA      NA
13  NA               NA      NA
14  NA               NA      NA
15  NA               NA      NA
16  NA               NA      NA
17  NA               NA      NA
18  NA               NA      NA
19  NA               NA      NA
20  NA               NA      NA

Can someone help please?
If possible, may I also know if there's a manual approach to select subset of df2 based on df3$id and change all values to NA?

Part 2:

For the second part of my request, I would like to create another dataframes from joined_df which appears only in df1 (call it found_in_df1). Example of output:

found_in_df1:

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345

The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2

雨后咖啡店 2025-02-19 03:13:48

data.table 版本使用!antijoin ,以及覆盖:= with df2 返回的所有列/行 NA (回收列表。(Na)到所有列)。
然后在所有常见变量上循环并覆盖与 ID :不匹配的任何值:

library(data.table)
setDT(df1)
setDT(df2)

df2[!df1, on=.(id), names(df2) := .(NA)]
idvars <- "id"
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
for (i in compvars) {
    df2[!df1, on=c(idvars,i), (i) := NA]
}

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345
#11:      <NA>     <NA>   <NA>
#12:      <NA>     <NA>   <NA>
#13:      <NA>     <NA>   <NA>
#14:      <NA>     <NA>   <NA>
#15:      <NA>     <NA>   <NA>
#16:      <NA>     <NA>   <NA>
#17:      <NA>     <NA>   <NA>
#18:      <NA>     <NA>   <NA>
#19:      <NA>     <NA>   <NA>
#20:      <NA>     <NA>   <NA>

data.table version using an !antijoin, and overwriting := all columns/rows returned in df2 with an NA (recycled list .(NA) to all columns).
Then looping over all the common variables and overwriting any values which don't match by id:

library(data.table)
setDT(df1)
setDT(df2)

df2[!df1, on=.(id), names(df2) := .(NA)]
idvars <- "id"
compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
for (i in compvars) {
    df2[!df1, on=c(idvars,i), (i) := NA]
}

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345
#11:      <NA>     <NA>   <NA>
#12:      <NA>     <NA>   <NA>
#13:      <NA>     <NA>   <NA>
#14:      <NA>     <NA>   <NA>
#15:      <NA>     <NA>   <NA>
#16:      <NA>     <NA>   <NA>
#17:      <NA>     <NA>   <NA>
#18:      <NA>     <NA>   <NA>
#19:      <NA>     <NA>   <NA>
#20:      <NA>     <NA>   <NA>
兮颜 2025-02-19 03:13:47

潜在解决方案是使用 powerjoin package

library(dplyr)

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                  text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                  response=c("y","y","y","n","n","y","y","n","n","y"))

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))

expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                               text = c(NA, "cat", "dog", "mouse", "elephant", "goose", 
                                        "rat", "mice", "kitty", "kitten", 
                                        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                               volume = c(1234, 432, 324, 333, 2223, 412346, 7456, 
                                          3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))

library(powerjoin)
joined_df <- power_full_join(df1, df2, by = c("id"),
                             conflict = rw ~ ifelse(.x != .y,
                                                    NA_integer_, 
                                                    .x))

final_df <- joined_df %>%
  mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
  select(id, text, volume)
final_df
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345
#> 11      <NA>     <NA>     NA
#> 12      <NA>     <NA>     NA
#> 13      <NA>     <NA>     NA
#> 14      <NA>     <NA>     NA
#> 15      <NA>     <NA>     NA
#> 16      <NA>     <NA>     NA
#> 17      <NA>     <NA>     NA
#> 18      <NA>     <NA>     NA
#> 19      <NA>     <NA>     NA
#> 20      <NA>     <NA>     NA

all_equal(final_df, expected_outcome)
#> [1] TRUE

# Part 2
found_in_df1 <- power_left_join(df1, df2, by = c("id"),
                                conflict = rw ~ ifelse(.x != .y,
                                                       NA_integer_, 
                                                       .x)) %>%
  select(id, text, volume)
found_in_df1
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345

处理冲突的一种 07-02由 peprex package (v2.0.1

) PowerJoin软件包(Mudskipper先生):这些操作是矢量化的,因此您无需执行命令“行”,即您可以删除“ RW”以简化和获得性能。在与DF1和DF2中包括和排除“ RW”之间没有实际区别,但是如果我们使用较大的数据范围,您可以看到性能明显增加,例如

library(dplyr)
library(powerjoin)

# define functions
power_full_join_func_rowwise <- function(df1, df2) {
  joined_df <- power_full_join(df1, df2, by = c("id"),
                               conflict = rw ~ ifelse(.x != .y,
                                                      NA_integer_, 
                                                      .x))
  
  final_df <- joined_df %>%
    mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
    select(id, text, volume)
  return(final_df)
}

power_full_join_func_not_rowwise <- function(df1, df2) {
  joined_df <- power_full_join(df1, df2, by = c("id"),
                               conflict = ~ifelse(.x != .y,
                                                      NA_integer_, 
                                                      .x))
  
  final_df <- joined_df %>%
    mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
    select(id, text, volume)
  return(final_df)
}

library(microbenchmark)
library(purrr)
library(ggplot2)

# make larger dfs (copy df1 and df2 X100)
df3 <- map_dfr(seq_len(100), ~ df1)
df4 <- map_dfr(seq_len(100), ~ df2)

# benchmark performance on the larger dataframes
res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
                      power_full_join_func_not_rowwise(df3, df4))
res
#> Unit: milliseconds
#>                                        expr       min        lq      mean
#>      power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
#>  power_full_join_func_not_rowwise(df3, df4)  71.85757  77.25344  90.36191
#>     median        uq      max neval cld
#>  446.41715 472.47817 587.3301   100   b
#>   81.18239  93.95103 191.1248   100  a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

”

# Is the result the same?
all_equal(power_full_join_func_rowwise(df3, df4),
          power_full_join_func_not_rowwise(df3, df4))
#> [1] TRUE

在2022-11-24上由 reprex package (v2)。 0.1)

One potential solution for dealing with conflicts is to use the powerjoin package, e.g.

library(dplyr)

df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                  text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                  response=c("y","y","y","n","n","y","y","n","n","y"))

df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                  text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                  volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))

expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
                                      NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                               text = c(NA, "cat", "dog", "mouse", "elephant", "goose", 
                                        "rat", "mice", "kitty", "kitten", 
                                        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                               volume = c(1234, 432, 324, 333, 2223, 412346, 7456, 
                                          3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))

library(powerjoin)
joined_df <- power_full_join(df1, df2, by = c("id"),
                             conflict = rw ~ ifelse(.x != .y,
                                                    NA_integer_, 
                                                    .x))

final_df <- joined_df %>%
  mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
  select(id, text, volume)
final_df
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345
#> 11      <NA>     <NA>     NA
#> 12      <NA>     <NA>     NA
#> 13      <NA>     <NA>     NA
#> 14      <NA>     <NA>     NA
#> 15      <NA>     <NA>     NA
#> 16      <NA>     <NA>     NA
#> 17      <NA>     <NA>     NA
#> 18      <NA>     <NA>     NA
#> 19      <NA>     <NA>     NA
#> 20      <NA>     <NA>     NA

all_equal(final_df, expected_outcome)
#> [1] TRUE

# Part 2
found_in_df1 <- power_left_join(df1, df2, by = c("id"),
                                conflict = rw ~ ifelse(.x != .y,
                                                       NA_integer_, 
                                                       .x)) %>%
  select(id, text, volume)
found_in_df1
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345

Created on 2022-07-02 by the reprex package (v2.0.1)

Edit

Per the comment below from the creator of the powerjoin package (Mr. Mudskipper): these operations are vectorised, so you don't need to perform the command 'rowwise', i.e. you can remove "rw" to simplify and gain performance. There is no practical difference between including and excluding "rw" with df1 and df2, but if we use larger dataframes you can see a clear increase in performance, e.g.

library(dplyr)
library(powerjoin)

# define functions
power_full_join_func_rowwise <- function(df1, df2) {
  joined_df <- power_full_join(df1, df2, by = c("id"),
                               conflict = rw ~ ifelse(.x != .y,
                                                      NA_integer_, 
                                                      .x))
  
  final_df <- joined_df %>%
    mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
    select(id, text, volume)
  return(final_df)
}

power_full_join_func_not_rowwise <- function(df1, df2) {
  joined_df <- power_full_join(df1, df2, by = c("id"),
                               conflict = ~ifelse(.x != .y,
                                                      NA_integer_, 
                                                      .x))
  
  final_df <- joined_df %>%
    mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
    select(id, text, volume)
  return(final_df)
}

library(microbenchmark)
library(purrr)
library(ggplot2)

# make larger dfs (copy df1 and df2 X100)
df3 <- map_dfr(seq_len(100), ~ df1)
df4 <- map_dfr(seq_len(100), ~ df2)

# benchmark performance on the larger dataframes
res <- microbenchmark(power_full_join_func_rowwise(df3, df4),
                      power_full_join_func_not_rowwise(df3, df4))
res
#> Unit: milliseconds
#>                                        expr       min        lq      mean
#>      power_full_join_func_rowwise(df3, df4) 397.32661 426.08117 449.88787
#>  power_full_join_func_not_rowwise(df3, df4)  71.85757  77.25344  90.36191
#>     median        uq      max neval cld
#>  446.41715 472.47817 587.3301   100   b
#>   81.18239  93.95103 191.1248   100  a
autoplot(res)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

# Is the result the same?
all_equal(power_full_join_func_rowwise(df3, df4),
          power_full_join_func_not_rowwise(df3, df4))
#> [1] TRUE

Created on 2022-11-24 by the reprex package (v2.0.1)

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文