R:计算矩阵的频率表,给定截止值的数据框

发布于 2025-01-17 20:07:36 字数 8 浏览 0 评论 0原文

continue

Say I have a matrix like the following, with marker values per id, 10 events per id (in this example):

set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
>   head(mymat)
              a          b          c          d          e          f          g          h          i          j
id1 -0.56047565  0.4264642  0.3796395  0.9935039  0.1176466  0.7877388 -1.0633261  0.1192452 -0.7886220  0.8450130
id1 -0.23017749 -0.2950715 -0.5023235  0.5483970 -0.9474746  0.7690422  1.2631852  0.2436874 -0.5021987  0.9625280
id1  1.55870831  0.8951257 -0.3332074  0.2387317 -0.4905574  0.3322026 -0.3496504  1.2324759  1.4960607  0.6843094
id1  0.07050839  0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1  0.12928774  0.8215811 -1.0717912  1.3606524  1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516  0.8496430
id1  1.71506499  0.6886403  0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759  1.6756969  1.9023618 -0.4465572

And an associated data frame of cutoff values (a min and a max cutoff per id and marker), like this one:

cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
>   head(cutoff_df)
   id marker       min      max
1 id1      a 0.4744594 6.518271
2 id1      b 1.3729807 6.689669
3 id1      c 0.4516368 5.915843
4 id1      d 0.6369892 6.459263
5 id1      e 0.3479676 5.208157
6 id1      f 1.6028592 5.439966

What I want to do here, is calculate a frequency table, so that I record the percentage of events per id and marker that fall into the cutoffs for that id and marker.

This is my attempt using some ugly nested loops... Wondering if there is a nicer and cleaner way to do this, ideally with base functions or data.table or tidyr...

My ugly code:

freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
  mk_freq <- NULL
  for (id in unique(rownames(mymat))){
    data <- mymat[rownames(mymat)==id,mk]
    min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
    max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
    ins <- length(data[data>=min & data<=max])
    freq <- ins/length(data)*100
    mk_freq <- c(mk_freq, freq)
  }
  mk_freq <- as.data.frame(mk_freq)
  names(mk_freq) <- mk
  freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
    freq_mat  a  b  c  d  e  f  g  h  i  j
id1       NA 20  0 20 40 10  0 30 10 20 30
id2       NA 10 30 30  0 20 10 10  0  0 70
id3       NA  0  0  0  0 30 10 30 10 30 60

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

狼性发作 2025-01-24 20:07:36

这样的东西?在这里,所有单元格的总和均为100。

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

在2022-03-30创建的 reprex package (v2.0.0)

Something like this? Here, the sum of all cells is 100.

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

Created on 2022-03-30 by the reprex package (v2.0.0)

丢了幸福的猪 2025-01-24 20:07:36

这是一个基于 purrr 包的解决方案。我不确定它是否更干净,但它更短。

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##> + # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6

Here is a solution based on the purrr package. I'm not sure that it is cleaner but it is shorter.

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##> + # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6
盛装女皇 2025-01-24 20:07:36

基于dplyr的另一个可能的解决方案:

library(dplyr)

data.frame(id = rownames(mymat), mymat) %>% 
 group_by(id) %>% 
 summarise(across(everything(),
 ~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
      .x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>% 
 mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)

#> # A tibble: 3 x 11
#>   id        a     b     c     d     e     f     g     h     i     j
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  0     3.77  7.55  1.89  0     5.66  1.89  3.77  5.66
#> 2 id2    1.89  5.66  5.66  0     3.77  1.89  1.89  0     0    13.2 
#> 3 id3    0     0     0     0     5.66  1.89  5.66  1.89  5.66 11.3

Another possible solution, based on dplyr:

library(dplyr)

data.frame(id = rownames(mymat), mymat) %>% 
 group_by(id) %>% 
 summarise(across(everything(),
 ~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
      .x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>% 
 mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)

#> # A tibble: 3 x 11
#>   id        a     b     c     d     e     f     g     h     i     j
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  0     3.77  7.55  1.89  0     5.66  1.89  3.77  5.66
#> 2 id2    1.89  5.66  5.66  0     3.77  1.89  1.89  0     0    13.2 
#> 3 id3    0     0     0     0     5.66  1.89  5.66  1.89  5.66 11.3
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文