在R中,将多列整数转换为因子的最佳方法是什么

发布于 2025-01-10 15:18:02 字数 1857 浏览 0 评论 0原文

这适用于一个玩具示例,但我认为对于更大的数据帧,必须有更好的方法来做到这一点(更快,更少的内存)。任何建议表示赞赏!

library(tidyverse)
library(tictoc)
cyl <- tibble(integer_value = unique(mtcars$cyl), 
              as_a_string = paste(unique(mtcars$cyl), " cylinders"))%>%
  mutate(variable = "cyl")
gear <- tibble(integer_value = unique(mtcars$gear), 
               as_a_string = paste(unique(mtcars$cyl), " gears"))%>%
  mutate(variable = "gear")
carb <- tibble(integer_value = unique(mtcars$carb), 
               as_a_string = paste(unique(mtcars$carb)," carburetors"))%>%
  mutate(variable = "carb")
vs <- tibble(integer_value = unique(mtcars$vs), 
             as_a_string = c("V shaped", "straight"))%>%
  mutate(variable = "vs")
am <- tibble(integer_value = unique(mtcars$vs), 
             as_a_string = c("Automatic", "Manual"))%>%
  mutate(variable = "am")

factor_info <- rbind(cyl,gear,carb,vs,am)%>%
  select(variable,everything())

df <- mtcars
tic()
for(var in unique(factor_info$variable)){
  col <- mtcars%>%
    select(all_of(var))%>%
    mutate(variable = all_of(var))%>%
    rename(integer_value = all_of(var))
  fac <- factor_info%>%
    filter(variable == all_of(var))
  df[[all_of(var)]] <- inner_join(col, fac)%>%
    select(as_a_string)%>%
    pull()
}
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
df <- df%>%
  as_tibble() %>%
  mutate(across(where(is.character), factor))
toc()
#> 0.172 sec elapsed

reprex 软件包 (v2.0.1) 创建于 2022 年 2 月 25 日

This works for a toy example, but I think there must be a better way to do this (faster, less memory) for larger dataframes. Any suggestions appreciated!

library(tidyverse)
library(tictoc)
cyl <- tibble(integer_value = unique(mtcars$cyl), 
              as_a_string = paste(unique(mtcars$cyl), " cylinders"))%>%
  mutate(variable = "cyl")
gear <- tibble(integer_value = unique(mtcars$gear), 
               as_a_string = paste(unique(mtcars$cyl), " gears"))%>%
  mutate(variable = "gear")
carb <- tibble(integer_value = unique(mtcars$carb), 
               as_a_string = paste(unique(mtcars$carb)," carburetors"))%>%
  mutate(variable = "carb")
vs <- tibble(integer_value = unique(mtcars$vs), 
             as_a_string = c("V shaped", "straight"))%>%
  mutate(variable = "vs")
am <- tibble(integer_value = unique(mtcars$vs), 
             as_a_string = c("Automatic", "Manual"))%>%
  mutate(variable = "am")

factor_info <- rbind(cyl,gear,carb,vs,am)%>%
  select(variable,everything())

df <- mtcars
tic()
for(var in unique(factor_info$variable)){
  col <- mtcars%>%
    select(all_of(var))%>%
    mutate(variable = all_of(var))%>%
    rename(integer_value = all_of(var))
  fac <- factor_info%>%
    filter(variable == all_of(var))
  df[[all_of(var)]] <- inner_join(col, fac)%>%
    select(as_a_string)%>%
    pull()
}
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
#> Joining, by = c("integer_value", "variable")
df <- df%>%
  as_tibble() %>%
  mutate(across(where(is.character), factor))
toc()
#> 0.172 sec elapsed

Created on 2022-02-25 by the reprex package (v2.0.1)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

我爱人 2025-01-17 15:18:02

我没有足够的代表来添加评论。这是 TarJae 答案下的另一个变体。使用 purrr::modify_if

library(tidyverse)
mtcars %>%
  modify_if(is.numeric, as.factor) %>%
  str()
#> 'data.frame':    32 obs. of  11 variables:
#>  $ mpg : Factor w/ 25 levels "10.4","13.3",..: 16 16 19 17 13 12 3 20 19 14 ...
#>  $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#>  $ disp: Factor w/ 27 levels "71.1","75.7",..: 13 13 6 16 23 15 23 12 10 14 ...
#>  $ hp  : Factor w/ 22 levels "52","62","65",..: 11 11 6 11 15 9 20 2 7 13 ...
#>  $ drat: Factor w/ 22 levels "2.76","2.93",..: 16 16 15 5 6 1 7 11 17 17 ...
#>  $ wt  : Factor w/ 29 levels "1.513","1.615",..: 9 12 7 16 18 19 21 15 13 18 ...
#>  $ qsec: Factor w/ 30 levels "14.5","14.6",..: 6 10 22 24 10 29 5 27 30 19 ...
#>  $ vs  : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#>  $ am  : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#>  $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#>  $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...

reprex 包于 2022 年 2 月 26 日创建< /a> (v2.0.1)

I don't have enough rep to add a comment. Here is another variant to put under TarJae's answer. Using purrr::modify_if:

library(tidyverse)
mtcars %>%
  modify_if(is.numeric, as.factor) %>%
  str()
#> 'data.frame':    32 obs. of  11 variables:
#>  $ mpg : Factor w/ 25 levels "10.4","13.3",..: 16 16 19 17 13 12 3 20 19 14 ...
#>  $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#>  $ disp: Factor w/ 27 levels "71.1","75.7",..: 13 13 6 16 23 15 23 12 10 14 ...
#>  $ hp  : Factor w/ 22 levels "52","62","65",..: 11 11 6 11 15 9 20 2 7 13 ...
#>  $ drat: Factor w/ 22 levels "2.76","2.93",..: 16 16 15 5 6 1 7 11 17 17 ...
#>  $ wt  : Factor w/ 29 levels "1.513","1.615",..: 9 12 7 16 18 19 21 15 13 18 ...
#>  $ qsec: Factor w/ 30 levels "14.5","14.6",..: 6 10 22 24 10 29 5 27 30 19 ...
#>  $ vs  : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#>  $ am  : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
#>  $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
#>  $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...

Created on 2022-02-26 by the reprex package (v2.0.1)

黯然#的苍凉 2025-01-17 15:18:02

新答案删除了第一个。我认为您需要来自 forcats 包的 fct_relabel:经过 0.04 秒:

library(forcats)
library(dplyr)
library(tictoc)

tic()
mtcars %>% 
  mutate(cyl = fct_relabel(as.factor(cyl), ~paste(unique(cyl), as.character("cylinder"))),
         vs = fct_relabel(as.factor(vs), ~paste(unique(vs), as.character(c("V shaped", "straight")))),
         am = fct_relabel(as.factor(am), ~paste(unique(am), as.character(c("Automatic", "Manual")))),
         gear = fct_relabel(as.factor(gear), ~paste(unique(gear), as.character("gears"))),
         carb = fct_relabel(as.factor(carb), ~paste(unique(carb), as.character("carburetors")))) 
toc()

输出:

   mpg cyl         disp    hp  drat    wt  qsec vs         am          gear    carb         
   <dbl> <fct>      <dbl> <dbl> <dbl> <dbl> <dbl> <fct>      <fct>       <fct>   <fct>        
 1  21   4 cylinder  160    110  3.9   2.62  16.5 0 V shaped 0 Manual    3 gears 3 carburetors
 2  21   4 cylinder  160    110  3.9   2.88  17.0 0 V shaped 0 Manual    3 gears 3 carburetors
 3  22.8 6 cylinder  108     93  3.85  2.32  18.6 1 straight 0 Manual    3 gears 4 carburetors
 4  21.4 4 cylinder  258    110  3.08  3.22  19.4 1 straight 1 Automatic 4 gears 4 carburetors
 5  18.7 8 cylinder  360    175  3.15  3.44  17.0 0 V shaped 1 Automatic 4 gears 1 carburetors
 6  18.1 4 cylinder  225    105  2.76  3.46  20.2 1 straight 1 Automatic 4 gears 4 carburetors
 7  14.3 8 cylinder  360    245  3.21  3.57  15.8 0 V shaped 1 Automatic 4 gears 3 carburetors
 8  24.4 6 cylinder  147.    62  3.69  3.19  20   1 straight 1 Automatic 3 gears 1 carburetors
 9  22.8 6 cylinder  141.    95  3.92  3.15  22.9 1 straight 1 Automatic 3 gears 1 carburetors
10  19.2 4 cylinder  168.   123  3.92  3.44  18.3 1 straight 1 Automatic 3 gears 3 carburetors
# ... with 22 more rows

> toc()
0.04 sec elapsed

New answer deleted the first one. I think you need fct_relabel from forcats package: elapsed 0.04 sec:

library(forcats)
library(dplyr)
library(tictoc)

tic()
mtcars %>% 
  mutate(cyl = fct_relabel(as.factor(cyl), ~paste(unique(cyl), as.character("cylinder"))),
         vs = fct_relabel(as.factor(vs), ~paste(unique(vs), as.character(c("V shaped", "straight")))),
         am = fct_relabel(as.factor(am), ~paste(unique(am), as.character(c("Automatic", "Manual")))),
         gear = fct_relabel(as.factor(gear), ~paste(unique(gear), as.character("gears"))),
         carb = fct_relabel(as.factor(carb), ~paste(unique(carb), as.character("carburetors")))) 
toc()

output:

   mpg cyl         disp    hp  drat    wt  qsec vs         am          gear    carb         
   <dbl> <fct>      <dbl> <dbl> <dbl> <dbl> <dbl> <fct>      <fct>       <fct>   <fct>        
 1  21   4 cylinder  160    110  3.9   2.62  16.5 0 V shaped 0 Manual    3 gears 3 carburetors
 2  21   4 cylinder  160    110  3.9   2.88  17.0 0 V shaped 0 Manual    3 gears 3 carburetors
 3  22.8 6 cylinder  108     93  3.85  2.32  18.6 1 straight 0 Manual    3 gears 4 carburetors
 4  21.4 4 cylinder  258    110  3.08  3.22  19.4 1 straight 1 Automatic 4 gears 4 carburetors
 5  18.7 8 cylinder  360    175  3.15  3.44  17.0 0 V shaped 1 Automatic 4 gears 1 carburetors
 6  18.1 4 cylinder  225    105  2.76  3.46  20.2 1 straight 1 Automatic 4 gears 4 carburetors
 7  14.3 8 cylinder  360    245  3.21  3.57  15.8 0 V shaped 1 Automatic 4 gears 3 carburetors
 8  24.4 6 cylinder  147.    62  3.69  3.19  20   1 straight 1 Automatic 3 gears 1 carburetors
 9  22.8 6 cylinder  141.    95  3.92  3.15  22.9 1 straight 1 Automatic 3 gears 1 carburetors
10  19.2 4 cylinder  168.   123  3.92  3.44  18.3 1 straight 1 Automatic 3 gears 3 carburetors
# ... with 22 more rows

> toc()
0.04 sec elapsed
百善笑为先 2025-01-17 15:18:02

x 的类型为 integer 时,as.factor(x)factor(x) 更快、更高效并且 length(x) 很大。 mtcars 中的分类变量是整数值,但存储为 double:

nms <- c("cyl", "vs", "am", "gear", "carb")
vapply(mtcars[nms], typeof, "")
##      cyl       vs       am     gear     carb 
## "double" "double" "double" "double" "double"

在这种情况下,您可以强制有效地分解因子,

dd <- mtcars
dd[nms] <- lapply(dd[nms], function(x) as.factor(as.integer(x)))

然后使用 FWIW 有效地重命名级别

lvs <- list(cyl = paste(levels(dd$cyl), "cylinders"),
            vs = c("V-shaped", "straight"),
            am = c("automatic", "manual"),
            gear = paste(levels(dd$gear), "gears"),
            carb = paste(levels(dd$carb), "carburetors"))

for (nm in nms) levels(dd[[nm]]) <- lvs[[nm]]
head(dd[nms])
##                           cyl       vs        am    gear          carb
## Mazda RX4         6 cylinders V-shaped    manual 4 gears 4 carburetors
## Mazda RX4 Wag     6 cylinders V-shaped    manual 4 gears 4 carburetors
## Datsun 710        4 cylinders straight    manual 4 gears 1 carburetors
## Hornet 4 Drive    6 cylinders straight automatic 3 gears 1 carburetors
## Hornet Sportabout 8 cylinders V-shaped automatic 3 gears 2 carburetors
## Valiant           6 cylinders straight automatic 3 gears 1 carburetors

head(mtcars[nms])
##                   cyl vs am gear carb
## Mazda RX4           6  0  1    4    4
## Mazda RX4 Wag       6  0  1    4    4
## Datsun 710          4  1  1    4    1
## Hornet 4 Drive      6  1  0    3    1
## Hornet Sportabout   8  0  0    3    2
## Valiant             6  1  0    3    1

,这整个操作在我的机器上花费不到一毫秒。

## Unit: microseconds
##  expr     min     lq     mean median      uq      max neval
##  expr 212.954 217.71 289.4744 230.42 236.693 6116.749   100

as.factor(x) is faster and more efficient than factor(x) when x is of type integer and length(x) is large. The categorical variables in mtcars are integer-valued but stored as double:

nms <- c("cyl", "vs", "am", "gear", "carb")
vapply(mtcars[nms], typeof, "")
##      cyl       vs       am     gear     carb 
## "double" "double" "double" "double" "double"

In this situation, you can coerce to factor efficiently with

dd <- mtcars
dd[nms] <- lapply(dd[nms], function(x) as.factor(as.integer(x)))

then rename the levels efficiently with

lvs <- list(cyl = paste(levels(dd$cyl), "cylinders"),
            vs = c("V-shaped", "straight"),
            am = c("automatic", "manual"),
            gear = paste(levels(dd$gear), "gears"),
            carb = paste(levels(dd$carb), "carburetors"))

for (nm in nms) levels(dd[[nm]]) <- lvs[[nm]]
head(dd[nms])
##                           cyl       vs        am    gear          carb
## Mazda RX4         6 cylinders V-shaped    manual 4 gears 4 carburetors
## Mazda RX4 Wag     6 cylinders V-shaped    manual 4 gears 4 carburetors
## Datsun 710        4 cylinders straight    manual 4 gears 1 carburetors
## Hornet 4 Drive    6 cylinders straight automatic 3 gears 1 carburetors
## Hornet Sportabout 8 cylinders V-shaped automatic 3 gears 2 carburetors
## Valiant           6 cylinders straight automatic 3 gears 1 carburetors

head(mtcars[nms])
##                   cyl vs am gear carb
## Mazda RX4           6  0  1    4    4
## Mazda RX4 Wag       6  0  1    4    4
## Datsun 710          4  1  1    4    1
## Hornet 4 Drive      6  1  0    3    1
## Hornet Sportabout   8  0  0    3    2
## Valiant             6  1  0    3    1

FWIW, this entire operation takes less than a millisecond on my machine.

## Unit: microseconds
##  expr     min     lq     mean median      uq      max neval
##  expr 212.954 217.71 289.4744 230.42 236.693 6116.749   100
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文