虚拟变量（哑变量）编码语法（一个独热编码问题）

发布于 2025-01-16 17:15:03 字数 1695 浏览 0 评论 0原文

我有如下所示的样本数据：

id <- c("1a","2c","3d","4f","5g","6e","7f","8q","9r","10v","11x","12l")
O <- c(1,1,0,1,1,0,0,1,0,1,0,1)
dg1 <- c("A02","A84","B12","C94","D37","D12","D68","E12","F48","H12","Z83","")
dg2 <- c("B18","N34","A02","M01","B12","J02","K52","","I10","","","B18")
df <- cbind.data.frame(id,O,dg1,dg2)

我正在尝试获取一个如下所示的数据框，以便我可以针对每个变量对 O 进行单变量逻辑回归。

A02 <- c(1,0,1,0,0,0,0,0,0,0,0,0)
A84 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
B12 <- c(0,0,1,0,1,0,0,0,0,0,0,0)
B18 <- c(1,0,0,0,0,0,0,0,0,0,0,1)
C94 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
D12 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
D37 <- c(0,0,0,0,1,0,0,0,0,0,0,0)
D68 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
E12 <- c(0,0,0,0,0,0,0,1,0,0,0,0)
F48 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
H12 <- c(0,0,0,0,0,0,0,0,0,1,0,0)
I10 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
J02 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
K52 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
M01 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
N34 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
Z83 <- c(0,0,0,0,0,0,0,0,0,0,1,0)

df <- cbind.data.frame(df,A02,A84,B12,B18,C94,D12,D37,D68,E12,F48,H12,I10,J02,K52,M01,N34,Z83)

我尝试参照此处和此处的代码，但遇到了不知道该如何解决的问题。谁能指出我的错误或误解之处？我更希望用 dplyr 或 base R 来解决，但也愿意尝试任何其他方法。

尝试：

dumbo <- model.matrix(id ~ dg1+dg2,df)

library(recipes)
dumber <- df %>% recipe(id ~ .) %>%
  step_dummy(dg1:dg2,
             one_hot = TRUE)  %>% 
  prep() %>% bake(new_data=NULL)

原文

I have sample data that looks like this:

id <- c("1a","2c","3d","4f","5g","6e","7f","8q","9r","10v","11x","12l")
O <- c(1,1,0,1,1,0,0,1,0,1,0,1)
dg1 <- c("A02","A84","B12","C94","D37","D12","D68","E12","F48","H12","Z83","")
dg2 <- c("B18","N34","A02","M01","B12","J02","K52","","I10","","","B18")
df <- cbind.data.frame(id,O,dg1,dg2)

I am trying to get a data frame that looks like this so that I can do a univariate logistic regression on O against each variable.

A02 <- c(1,0,1,0,0,0,0,0,0,0,0,0)
A84 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
B12 <- c(0,0,1,0,1,0,0,0,0,0,0,0)
B18 <- c(1,0,0,0,0,0,0,0,0,0,0,1)
C94 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
D12 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
D37 <- c(0,0,0,0,1,0,0,0,0,0,0,0)
D68 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
E12 <- c(0,0,0,0,0,0,0,1,0,0,0,0)
F48 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
H12 <- c(0,0,0,0,0,0,0,0,0,1,0,0)
I10 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
J02 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
K52 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
M01 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
N34 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
Z83 <- c(0,0,0,0,0,0,0,0,0,0,1,0)

df <- cbind.data.frame(df,A02,A84,B12,B18,C94,D12,D37,D68,E12,F48,H12,I10,J02,K52,M01,N34,Z83)

I've attempted to follow the code here and here but ran into issues that I wasn't sure how to fix. Can anyone point out my mistake/misunderstanding? I would prefer to have a solution in dplyr or base, but really willing to try anything.

Attempts:

dumbo <- model.matrix(id ~ dg1+dg2,df)

library(recipes)
dumber <- df %>% recipe(id ~ .) %>%
  step_dummy(dg1:dg2,
             one_hot = TRUE)  %>% 
  prep() %>% bake(new_data=NULL)

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫描二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

绳情 2025-01-23 17:15:03

df %>%
  pivot_longer(-c(id, O), names_to = NULL) %>%
  filter(value != "") %>% 
  pivot_wider(
    id_cols = c(id, O),
    names_from = value,
    values_from = value,
    values_fn = length, 
    values_fill = 0, 
    names_sort = TRUE
  )

# A tibble: 12 x 19
   id        O   A02   A84   B12   B18   C94   D12   D37   D68   E12   F48   H12   I10   J02   K52
   <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 1a        1     1     0     0     1     0     0     0     0     0     0     0     0     0     0
 2 2c        1     0     1     0     0     0     0     0     0     0     0     0     0     0     0
 3 3d        0     1     0     1     0     0     0     0     0     0     0     0     0     0     0
 4 4f        1     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 5 5g        1     0     0     1     0     0     0     1     0     0     0     0     0     0     0
 6 6e        0     0     0     0     0     0     1     0     0     0     0     0     0     1     0
 7 7f        0     0     0     0     0     0     0     0     1     0     0     0     0     0     1
 8 8q        1     0     0     0     0     0     0     0     0     1     0     0     0     0     0
 9 9r        0     0     0     0     0     0     0     0     0     0     1     0     1     0     0
10 10v       1     0     0     0     0     0     0     0     0     0     0     1     0     0     0
11 11x       0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
12 12l       1     0     0     0     1     0     0     0     0     0     0     0     0     0     0
# ... with 3 more variables: M01 <int>, N34 <int>, Z83 <int>

df %>%
  pivot_longer(-c(id, O), names_to = NULL) %>%
  filter(value != "") %>% 
  pivot_wider(
    id_cols = c(id, O),
    names_from = value,
    values_from = value,
    values_fn = length, 
    values_fill = 0, 
    names_sort = TRUE
  )

# A tibble: 12 x 19
   id        O   A02   A84   B12   B18   C94   D12   D37   D68   E12   F48   H12   I10   J02   K52
   <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 1a        1     1     0     0     1     0     0     0     0     0     0     0     0     0     0
 2 2c        1     0     1     0     0     0     0     0     0     0     0     0     0     0     0
 3 3d        0     1     0     1     0     0     0     0     0     0     0     0     0     0     0
 4 4f        1     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 5 5g        1     0     0     1     0     0     0     1     0     0     0     0     0     0     0
 6 6e        0     0     0     0     0     0     1     0     0     0     0     0     0     1     0
 7 7f        0     0     0     0     0     0     0     0     1     0     0     0     0     0     1
 8 8q        1     0     0     0     0     0     0     0     0     1     0     0     0     0     0
 9 9r        0     0     0     0     0     0     0     0     0     0     1     0     1     0     0
10 10v       1     0     0     0     0     0     0     0     0     0     0     1     0     0     0
11 11x       0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
12 12l       1     0     0     0     1     0     0     0     0     0     0     0     0     0     0
# ... with 3 more variables: M01 <int>, N34 <int>, Z83 <int>

回复收藏 0 原文

弥繁 2025-01-23 17:15:03

library(tidyverse)


df %>% 
  left_join(
  df %>% 
    pivot_longer(c(dg1, dg2)) %>% 
    filter(value != "") %>% 
    pivot_wider(c(id, O), names_from = value) %>% 
    mutate(across(c(A02:Z83), ~if_else(is.na(.x), 0, 1)))
  )

Joining, by = c("id", "O")
    id O dg1 dg2 A02 B18 A84 N34 B12 C94 M01 D37 D12 J02 D68 K52 E12 F48 I10 H12 Z83
1   1a 1 A02 B18   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
2   2c 1 A84 N34   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0
3   3d 0 B12 A02   1   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
4   4f 1 C94 M01   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0
5   5g 1 D37 B12   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0
6   6e 0 D12 J02   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0   0
7   7f 0 D68 K52   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0
8   8q 1 E12       0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
9   9r 0 F48 I10   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   0   0
10 10v 1 H12       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
11 11x 0 Z83       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
12 12l 1     B18   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

library(tidyverse)


df %>% 
  left_join(
  df %>% 
    pivot_longer(c(dg1, dg2)) %>% 
    filter(value != "") %>% 
    pivot_wider(c(id, O), names_from = value) %>% 
    mutate(across(c(A02:Z83), ~if_else(is.na(.x), 0, 1)))
  )

Joining, by = c("id", "O")
    id O dg1 dg2 A02 B18 A84 N34 B12 C94 M01 D37 D12 J02 D68 K52 E12 F48 I10 H12 Z83
1   1a 1 A02 B18   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
2   2c 1 A84 N34   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0
3   3d 0 B12 A02   1   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
4   4f 1 C94 M01   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0
5   5g 1 D37 B12   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0
6   6e 0 D12 J02   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0   0
7   7f 0 D68 K52   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0
8   8q 1 E12       0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
9   9r 0 F48 I10   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   0   0
10 10v 1 H12       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
11 11x 0 Z83       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
12 12l 1     B18   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

回复收藏 0 原文

明媚殇 2025-01-23 17:15:03

我在 GitHub 上有一个包 {dplyover}，它可以在不做数据重整（pivot 透视）的情况下解决这类问题。为了使其工作，我们首先需要把空单元格 "" 转换为 NA。然后可以使用 dplyover::dist_values 获取不含 NA 的唯一值，并遍历这些值来创建新列。由于某个值既可能出现在 dg1 中，也可能出现在 dg2 中，因此需要按行（rowwise）执行此操作。

library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/

df %>% 
  mutate(across(c(dg1, dg2), ~ ifelse(.x == "", NA_character_, .x))) %>% 
  rowwise %>% 
  mutate(over(dist_values(c(.$dg1, .$dg2)),
              ~ifelse(.x %in% c(dg1, dg2), 1, 0)))

#> # A tibble: 12 x 21
#> # Rowwise: 
#>    id        O dg1   dg2     A02   A84   B12   B18   C94   D12   D37   D68   E12
#>    <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 1a        1 A02   B18       1     0     0     1     0     0     0     0     0
#>  2 2c        1 A84   N34       0     1     0     0     0     0     0     0     0
#>  3 3d        0 B12   A02       1     0     1     0     0     0     0     0     0
#>  4 4f        1 C94   M01       0     0     0     0     1     0     0     0     0
#>  5 5g        1 D37   B12       0     0     1     0     0     0     1     0     0
#>  6 6e        0 D12   J02       0     0     0     0     0     1     0     0     0
#>  7 7f        0 D68   K52       0     0     0     0     0     0     0     1     0
#>  8 8q        1 E12   <NA>      0     0     0     0     0     0     0     0     1
#>  9 9r        0 F48   I10       0     0     0     0     0     0     0     0     0
#> 10 10v       1 H12   <NA>      0     0     0     0     0     0     0     0     0
#> 11 11x       0 Z83   <NA>      0     0     0     0     0     0     0     0     0
#> 12 12l       1 <NA>  B18       0     0     0     1     0     0     0     0     0
#> # … with 8 more variables: F48 <dbl>, H12 <dbl>, I10 <dbl>, J02 <dbl>,
#> #   K52 <dbl>, M01 <dbl>, N34 <dbl>, Z83 <dbl>

由 reprex 软件包 (v0.3.0) 创建于 2022 年 3 月 24 日

I have a package {dplyover} on GitHub which helps to solve this kind of problem without data rectangling (pivoting). To make it work we first need to convert the empty cells "" into NAs. Then we can use dplyover::dist_values to get the unique values without NAs and loop over them to create new columns. We need to do this rowwise, since the values can be either in dg1 or dg2.

library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/

df %>% 
  mutate(across(c(dg1, dg2), ~ ifelse(.x == "", NA_character_, .x))) %>% 
  rowwise %>% 
  mutate(over(dist_values(c(.$dg1, .$dg2)),
              ~ifelse(.x %in% c(dg1, dg2), 1, 0)))

#> # A tibble: 12 x 21
#> # Rowwise: 
#>    id        O dg1   dg2     A02   A84   B12   B18   C94   D12   D37   D68   E12
#>    <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 1a        1 A02   B18       1     0     0     1     0     0     0     0     0
#>  2 2c        1 A84   N34       0     1     0     0     0     0     0     0     0
#>  3 3d        0 B12   A02       1     0     1     0     0     0     0     0     0
#>  4 4f        1 C94   M01       0     0     0     0     1     0     0     0     0
#>  5 5g        1 D37   B12       0     0     1     0     0     0     1     0     0
#>  6 6e        0 D12   J02       0     0     0     0     0     1     0     0     0
#>  7 7f        0 D68   K52       0     0     0     0     0     0     0     1     0
#>  8 8q        1 E12   <NA>      0     0     0     0     0     0     0     0     1
#>  9 9r        0 F48   I10       0     0     0     0     0     0     0     0     0
#> 10 10v       1 H12   <NA>      0     0     0     0     0     0     0     0     0
#> 11 11x       0 Z83   <NA>      0     0     0     0     0     0     0     0     0
#> 12 12l       1 <NA>  B18       0     0     0     1     0     0     0     0     0
#> # … with 8 more variables: F48 <dbl>, H12 <dbl>, I10 <dbl>, J02 <dbl>,
#> #   K52 <dbl>, M01 <dbl>, N34 <dbl>, Z83 <dbl>

Created on 2022-03-24 by the reprex package (v0.3.0)

回复收藏 0 原文

~没有更多了~