仅当 R (dplyr) 中满足特定条件时，才将一列中的某些数据替换为另一列数据

发布于 2025-01-12 02:17:05 字数 580 浏览 0 评论 0原文

我有一个包含 30 多列和 10000 多行的广泛数据框。今天我想重点关注两列：languages 和 languages2：

languages             languages2

Spanish                  NA
Spanish                  NA
Other (specify)        French
Other (specify)        German
Other (specify)        Russian
English                  NA
Other (specify)        Portuguese
English                  NA
(...)

这就是我所需要的：

languages

Spanish
Spanish
French
German
Russian
English
Portuguese
English
(...)

我正在使用 mutate 函数寻找答案dplyr

原文

I have an extensive data frame with 30+ columns and 10000+ rows. Today I want to focus in two columns: languages and languages2:

languages             languages2

Spanish                  NA
Spanish                  NA
Other (specify)        French
Other (specify)        German
Other (specify)        Russian
English                  NA
Other (specify)        Portuguese
English                  NA
(...)

This is what I need:

languages

Spanish
Spanish
French
German
Russian
English
Portuguese
English
(...)

I am looking for an answer using mutate function from dplyr

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

留一抹残留的笑 2025-01-19 02:17:05

对于更大的数据，您可能需要控制其他场景，例如防止用 NA 替换数据并保留 Other 值。此外，如果解决第一列可能包含 Other() 或 Other, lang 的场景可能有好处。您可能需要考虑使用正则表达式或预处理第一列。

library("tidyverse")
dta <- tribble(
    ~lang1, ~lang2,
    "Spanish", NA,
    "Other", "English",
    "Other", NA
)

mutate(dta, lang1 = case_when(
    grepl("^Other,*", lang1) & !is.na(lang2) ~ lang2,
    TRUE ~ lang1
))

For a bigger data you may want to control for additional scenarios, like preventing replacing the data with NA and laving the Other value. Also if there may be a merit for addressing scenarios where the first column may contain Other() or Other, lang. You may want to consider using a regular expression or pre-processing the first column.

library("tidyverse")
dta <- tribble(
    ~lang1, ~lang2,
    "Spanish", NA,
    "Other", "English",
    "Other", NA
)

mutate(dta, lang1 = case_when(
    grepl("^Other,*", lang1) & !is.na(lang2) ~ lang2,
    TRUE ~ lang1
))

回复收藏 0 原文

中性美 2025-01-19 02:17:05

使用dplyr，我们可以将Other (specify)替换为NA，然后使用coalesce：

library(dplyr)

df %>%
  mutate(languages = coalesce(na_if(languages, "Other (specify)"), languages2)) %>%
  select(languages)

输出< /strong>

  languages
1    Spanish
2    Spanish
3     French
4     German
5    Russian
6    English
7 Portuguese
8    English

tidyverse 选项是使用 str_replace_all 将 Other (specify) 替换为 languages2 中的值。

library(tidyverse)

df %>%
  mutate(languages = str_replace_all(languages,"Other \\(specify\\)", languages2)) %>% 
  select(languages)

数据

df <- structure(list(languages = c("Spanish", "Spanish", "Other (specify)", 
"Other (specify)", "Other (specify)", "English", "Other (specify)", 
"English"), languages2 = c(NA, NA, "French", "German", "Russian", 
NA, "Portuguese", NA)), class = "data.frame", row.names = c(NA, 
-8L))

基准

但是，如果您有大量数据并且需要更快的速度，那么您可能会考虑基本 R，它比 dplyr 更快或data.table。

bm <- microbenchmark::microbenchmark(Konrad = mutate(df, languages = case_when(
  grepl("^Other,*", languages) & !is.na(languages2) ~ languages2,
  TRUE ~ languages
)),
langtang = df %>%
  mutate(languages=if_else(languages=="Other (specify)", languages2, languages)),
valentina = df %>%
  mutate(languages=if_else(!is.na(languages2), languages2, languages)),
andrew_stringr = df %>%
  mutate(languages = str_replace_all(languages,"Other \\(specify\\)", languages2)),
andrew_coalesce = df %>%
  mutate(languages = coalesce(na_if(languages, "Other (specify)"), languages2, 'Other (specify)')),
andrew_baseR = {df1 <- df; df1[df1$languages == "Other (specify)", "languages"] <- df1[df1$languages == "Other (specify)", "languages2" ]},
andrew_baseR_with = {df2 <- df; df2$languages <- with( df2, ifelse( languages == "Other (specify)", languages2, languages ) )},
andrew_datatable = {dt <- as.data.table(df); dt[languages == "Other (specify)", languages := languages2 ]},
times = 1000)

Using dplyr, we could replace Other (specify) with NA, then use coalesce:

library(dplyr)

df %>%
  mutate(languages = coalesce(na_if(languages, "Other (specify)"), languages2)) %>%
  select(languages)

Output

  languages
1    Spanish
2    Spanish
3     French
4     German
5    Russian
6    English
7 Portuguese
8    English

A tidyverse option is to use str_replace_all to replace Other (specify) with the value from languages2.

library(tidyverse)

df %>%
  mutate(languages = str_replace_all(languages,"Other \\(specify\\)", languages2)) %>% 
  select(languages)

Data

df <- structure(list(languages = c("Spanish", "Spanish", "Other (specify)", 
"Other (specify)", "Other (specify)", "English", "Other (specify)", 
"English"), languages2 = c(NA, NA, "French", "German", "Russian", 
NA, "Portuguese", NA)), class = "data.frame", row.names = c(NA, 
-8L))

Benchmark

However, if you have a lot of data and need something faster, then you might consider base R, which would be faster than dplyr or data.table.

bm <- microbenchmark::microbenchmark(Konrad = mutate(df, languages = case_when(
  grepl("^Other,*", languages) & !is.na(languages2) ~ languages2,
  TRUE ~ languages
)),
langtang = df %>%
  mutate(languages=if_else(languages=="Other (specify)", languages2, languages)),
valentina = df %>%
  mutate(languages=if_else(!is.na(languages2), languages2, languages)),
andrew_stringr = df %>%
  mutate(languages = str_replace_all(languages,"Other \\(specify\\)", languages2)),
andrew_coalesce = df %>%
  mutate(languages = coalesce(na_if(languages, "Other (specify)"), languages2, 'Other (specify)')),
andrew_baseR = {df1 <- df; df1[df1$languages == "Other (specify)", "languages"] <- df1[df1$languages == "Other (specify)", "languages2" ]},
andrew_baseR_with = {df2 <- df; df2$languages <- with( df2, ifelse( languages == "Other (specify)", languages2, languages ) )},
andrew_datatable = {dt <- as.data.table(df); dt[languages == "Other (specify)", languages := languages2 ]},
times = 1000)

回复收藏 0 原文

你又不是我 2025-01-19 02:17:05

data %>% 
  mutate(languages=if_else(languages=="Other (specify)", languages2, languages)

data %>% 
  mutate(languages=if_else(languages=="Other (specify)", languages2, languages)

回复收藏 0 原文

无妨# 2025-01-19 02:17:05

另一种可能性（如果您希望检查位于第二列而不是第一列）：

library(dplyr)

df <- data.frame(languages=c("Spanish","Spanish","Other (specify)","Other (specify)","Other (specify)","English","Other (specify)","English"),languages2=c(NA,NA,"French","German","Russian",NA,"Portuguese",NA))

df %>%
 mutate(languages=if_else(!is.na(languages2), languages2, languages))

   languages languages2
1    Spanish       <NA>
2    Spanish       <NA>
3     French     French
4     German     German
5    Russian    Russian
6    English       <NA>
7 Portuguese Portuguese
8    English       <NA>

Another possibility (if you prefer the check to be over the 2nd column instead of the 1st):

library(dplyr)

df <- data.frame(languages=c("Spanish","Spanish","Other (specify)","Other (specify)","Other (specify)","English","Other (specify)","English"),languages2=c(NA,NA,"French","German","Russian",NA,"Portuguese",NA))

df %>%
 mutate(languages=if_else(!is.na(languages2), languages2, languages))

   languages languages2
1    Spanish       <NA>
2    Spanish       <NA>
3     French     French
4     German     German
5    Russian    Russian
6    English       <NA>
7 Portuguese Portuguese
8    English       <NA>

回复收藏 0 原文

~没有更多了~