是否有一种简单的方法来重新编码因子变量的水平,使得低于给定频率出现的水平被重新编码为“其他”变量?

发布于 2025-01-18 00:41:00 字数 181 浏览 5 评论 0原文

theshold <- c(0.001, 0.5, 0.1)

df$a df$b df$c - 基于小于第一个阈值的级别频率重新编码级别

df$x df$y df$x - 根据小于第二阈值的级别频率重新编码级别

df$d df$e df$f - 根据小于第三阈值的级别频率重新编码级别

theshold <- c(0.001, 0.5, 0.1)

df$a df$b df$c - Recode levels based on level frequency of less than first threshold

df$x df$y df$x - Recode levels based on level frequency of less than second threshold

df$d df$e df$f - Recode levels based on level frequency of less than third threshold

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

世界和平 2025-01-25 00:41:00

您要找的是 forcats 包中的 fct_lump_prop()。

library(forcats)
library(dplyr)

# Toy data: "A" appears 3 times, "B" twice, "C" and "D" once each (7 rows).
dat <- data.frame(base = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 1)))

# Add one lumped copy of `base` per proportion cutoff and print the result.
mutate(
  dat,
  base0.2 = fct_lump_prop(base, prop = 0.2),
  base0.3 = fct_lump_prop(base, prop = 0.3)
)

输出

#>   base base0.2 base0.3
#> 1    A       A       A
#> 2    A       A       A
#> 3    A       A       A
#> 4    B       B   Other
#> 5    B       B   Other
#> 6    C   Other   Other
#> 7    D   Other   Other

You're looking for fct_lump_prop() from forcats.

library(forcats)
library(dplyr)

# Toy data: "A" appears 3 times, "B" twice, "C" and "D" once each (7 rows).
dat <- data.frame(base = rep(c("A", "B", "C", "D"), times = c(3, 2, 1, 1)))

# Add one lumped copy of `base` per proportion cutoff and print the result.
mutate(
  dat,
  base0.2 = fct_lump_prop(base, prop = 0.2),
  base0.3 = fct_lump_prop(base, prop = 0.3)
)

Output

#>   base base0.2 base0.3
#> 1    A       A       A
#> 2    A       A       A
#> 3    A       A       A
#> 4    B       B   Other
#> 5    B       B   Other
#> 6    C   Other   Other
#> 7    D   Other   Other

Created on 2022-03-31 by the reprex package (v2.0.0)

妖妓 2025-01-25 00:41:00

也许有更简便的 tidyverse 方式可以做到这一点,但您也可以编写一个小函数来实现:

# Fix the RNG seed so the sampled data is reproducible.
set.seed(519)
# Draw 1000 letters from A-E with unequal probabilities (A and B are the rarest).
x <- sample(LETTERS[1:5], 1000, prob=c(.01,.1,.29,.3,.3), replace=TRUE)
# Convert the character vector to a factor.
x <- as.factor(x)
#' Recode infrequent levels of a factor as "other"
#'
#' @param x A factor, or a vector that `as.factor()` can convert (for
#'   example a character vector).
#' @param threshold Proportion cutoff. Levels whose share of the non-missing
#'   values is strictly below this value are replaced by "other".
#' @return A factor the same length as `x`. Surviving levels keep their
#'   original relative order, "other" is appended after them when it was not
#'   already a level, and levels with no remaining values are dropped.
recode_thresh <- function(x, threshold = .15) {
  # Coerce up front: for non-factor input levels(x) is NULL, which used to
  # turn every kept value into NA in the final factor() call.
  if (!is.factor(x)) {
    x <- as.factor(x)
  }
  counts <- table(x)
  total <- sum(counts)
  levs <- c(levels(x), "other")
  x <- as.character(x)
  # With no non-missing values the proportions are NaN and cannot be compared
  # against the threshold, so there is nothing to recode in that case.
  if (total > 0) {
    rare <- names(counts)[which(counts / total < threshold)]
    x[x %in% rare] <- "other"
  }
  # Keep only the levels that still occur, preserving their original order.
  levs <- intersect(levs, unique(x))
  factor(x, levels = levs)
}
x2 <- recode_thresh(x, threshold=.15)
table(x)/1000
#> x
#>     A     B     C     D     E 
#> 0.014 0.106 0.294 0.276 0.310
table(x2)/1000
#> x2
#>     C     D     E other 
#> 0.294 0.276 0.310 0.120

由 reprex 包(v2.0.1)创建于 2022-03-31

There may be an easier tidy way of doing this, but you could write a little function that would implement this:

# Fix the RNG seed so the sampled data is reproducible.
set.seed(519)
# Draw 1000 letters from A-E with unequal probabilities (A and B are the rarest).
x <- sample(LETTERS[1:5], 1000, prob=c(.01,.1,.29,.3,.3), replace=TRUE)
# Convert the character vector to a factor.
x <- as.factor(x)
#' Recode infrequent levels of a factor as "other"
#'
#' @param x A factor, or a vector that `as.factor()` can convert (for
#'   example a character vector).
#' @param threshold Proportion cutoff. Levels whose share of the non-missing
#'   values is strictly below this value are replaced by "other".
#' @return A factor the same length as `x`. Surviving levels keep their
#'   original relative order, "other" is appended after them when it was not
#'   already a level, and levels with no remaining values are dropped.
recode_thresh <- function(x, threshold = .15) {
  # Coerce up front: for non-factor input levels(x) is NULL, which used to
  # turn every kept value into NA in the final factor() call.
  if (!is.factor(x)) {
    x <- as.factor(x)
  }
  counts <- table(x)
  total <- sum(counts)
  levs <- c(levels(x), "other")
  x <- as.character(x)
  # With no non-missing values the proportions are NaN and cannot be compared
  # against the threshold, so there is nothing to recode in that case.
  if (total > 0) {
    rare <- names(counts)[which(counts / total < threshold)]
    x[x %in% rare] <- "other"
  }
  # Keep only the levels that still occur, preserving their original order.
  levs <- intersect(levs, unique(x))
  factor(x, levels = levs)
}
x2 <- recode_thresh(x, threshold=.15)
table(x)/1000
#> x
#>     A     B     C     D     E 
#> 0.014 0.106 0.294 0.276 0.310
table(x2)/1000
#> x2
#>     C     D     E other 
#> 0.294 0.276 0.310 0.120

Created on 2022-03-31 by the reprex package (v2.0.1)

夜无邪 2025-01-25 00:41:00

根据 Andreas 的建议并进一步阅读之后,我想出了以下方案,效果非常好。谢谢

# Proportion cutoffs; the n-th value is used for factor_columns<n> below.
agg_cats_thresholds <- c(0.01, 0.05, 0.005, 0.001)
agg_cats_thresholds <- as.data.frame(agg_cats_thresholds)

# Create the lists of variables

factor_columns1 <- c("a", "b", "c", "d", "e")
factor_columns2 <- c("f")
factor_columns3 <- c("g")
factor_columns4 <- c("h", "i", "j", "k")

# Use fct_lump_prop to reduce the levels of the various factor variables.
# Each group of columns is lumped with its own cutoff, and the lumped
# levels are labelled "other".

churn.ml[factor_columns1] <- lapply(
  churn.ml[factor_columns1],
  fct_lump_prop,
  prop = agg_cats_thresholds[1, ],
  other_level = "other"
)

churn.ml[factor_columns2] <- lapply(
  churn.ml[factor_columns2],
  fct_lump_prop,
  prop = agg_cats_thresholds[2, ],
  other_level = "other"
)

churn.ml[factor_columns3] <- lapply(
  churn.ml[factor_columns3],
  fct_lump_prop,
  prop = agg_cats_thresholds[3, ],
  other_level = "other"
)

# The fourth column group and cutoff were defined but never applied;
# apply them so every declared group is actually lumped.
churn.ml[factor_columns4] <- lapply(
  churn.ml[factor_columns4],
  fct_lump_prop,
  prop = agg_cats_thresholds[4, ],
  other_level = "other"
)

With Andreas's suggestion and further reading, I came up with the following, which worked a treat. Thanks.

# Proportion cutoffs; the n-th value is used for factor_columns<n> below.
agg_cats_thresholds <- c(0.01, 0.05, 0.005, 0.001)
agg_cats_thresholds <- as.data.frame(agg_cats_thresholds)

# Create the lists of variables

factor_columns1 <- c("a", "b", "c", "d", "e")
factor_columns2 <- c("f")
factor_columns3 <- c("g")
factor_columns4 <- c("h", "i", "j", "k")

# Use fct_lump_prop to reduce the levels of the various factor variables.
# Each group of columns is lumped with its own cutoff, and the lumped
# levels are labelled "other".

churn.ml[factor_columns1] <- lapply(
  churn.ml[factor_columns1],
  fct_lump_prop,
  prop = agg_cats_thresholds[1, ],
  other_level = "other"
)

churn.ml[factor_columns2] <- lapply(
  churn.ml[factor_columns2],
  fct_lump_prop,
  prop = agg_cats_thresholds[2, ],
  other_level = "other"
)

churn.ml[factor_columns3] <- lapply(
  churn.ml[factor_columns3],
  fct_lump_prop,
  prop = agg_cats_thresholds[3, ],
  other_level = "other"
)

# The fourth column group and cutoff were defined but never applied;
# apply them so every declared group is actually lumped.
churn.ml[factor_columns4] <- lapply(
  churn.ml[factor_columns4],
  fct_lump_prop,
  prop = agg_cats_thresholds[4, ],
  other_level = "other"
)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文