R：计算矩阵的频率表，给定截止值的数据框

发布于 2025-01-17 20:07:36 字数 8 浏览 0 评论 0原文

continue

Say I have a matrix like the following, with marker values per id, 10 events per id (in this example):

set.seed(123)
mymat <- matrix(rnorm(300), nrow=30)
rownames(mymat) <- paste0('id',rep(1:3,each=10))
colnames(mymat) <- letters[1:10]
>   head(mymat)
              a          b          c          d          e          f          g          h          i          j
id1 -0.56047565  0.4264642  0.3796395  0.9935039  0.1176466  0.7877388 -1.0633261  0.1192452 -0.7886220  0.8450130
id1 -0.23017749 -0.2950715 -0.5023235  0.5483970 -0.9474746  0.7690422  1.2631852  0.2436874 -0.5021987  0.9625280
id1  1.55870831  0.8951257 -0.3332074  0.2387317 -0.4905574  0.3322026 -0.3496504  1.2324759  1.4960607  0.6843094
id1  0.07050839  0.8781335 -1.0185754 -0.6279061 -0.2560922 -1.0083766 -0.8655129 -0.5160638 -1.1373036 -1.3952743
id1  0.12928774  0.8215811 -1.0717912  1.3606524  1.8438620 -0.1194526 -0.2362796 -0.9925072 -0.1790516  0.8496430
id1  1.71506499  0.6886403  0.3035286 -0.6002596 -0.6519499 -0.2803953 -0.1971759  1.6756969  1.9023618 -0.4465572

And an associated data frame of cutoff values (a min and a max cutoff per id and marker), like this one:

cutoff_df <- data.frame(id=paste0('id',rep(1:3,each=10)), marker=rep(letters[1:10],3), min=runif(30, 0, 2), max=runif(30, 5, 7))
>   head(cutoff_df)
   id marker       min      max
1 id1      a 0.4744594 6.518271
2 id1      b 1.3729807 6.689669
3 id1      c 0.4516368 5.915843
4 id1      d 0.6369892 6.459263
5 id1      e 0.3479676 5.208157
6 id1      f 1.6028592 5.439966

What I want to do here, is calculate a frequency table, so that I record the percentage of events per id and marker that fall into the cutoffs for that id and marker.

This is my attempt using some ugly nested loops... Wondering if there is a nicer and cleaner way to do this, ideally with base functions or data.table or tidyr...

My ugly code:

freq_mat <- matrix(nrow=length(unique(rownames(mymat))))
rownames(freq_mat) <- unique(rownames(mymat))
for (mk in colnames(mymat)){
  mk_freq <- NULL
  for (id in unique(rownames(mymat))){
    data <- mymat[rownames(mymat)==id,mk]
    min <- cutoff_df$min[cutoff_df$id==id & cutoff_df$marker==mk]
    max <- cutoff_df$max[cutoff_df$id==id & cutoff_df$marker==mk]
    ins <- length(data[data>=min & data<=max])
    freq <- ins/length(data)*100
    mk_freq <- c(mk_freq, freq)
  }
  mk_freq <- as.data.frame(mk_freq)
  names(mk_freq) <- mk
  freq_mat <- cbind(freq_mat, mk_freq)
}
> freq_mat
    freq_mat  a  b  c  d  e  f  g  h  i  j
id1       NA 20  0 20 40 10  0 30 10 20 30
id2       NA 10 30 30  0 20 10 10  0  0 70
id3       NA  0  0  0  0 30 10 30 10 30 60

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

狼性发作 2025-01-24 20:07:36

这样的东西？在这里，所有单元格的总和均为100。

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

^{在2022-03-30创建的 reprex package （v2.0.0）}

Something like this? Here, the sum of all cells is 100.

library(tidyverse)

set.seed(123)
mymat <- matrix(rnorm(300), nrow = 30)
rownames(mymat) <- paste0("id", rep(1:3, each = 10))
colnames(mymat) <- letters[1:10]
cutoff_df <- data.frame(
  id = paste0("id", rep(1:3, each = 10)),
  marker = rep(letters[1:10], 3), min = runif(30, 0, 2), max = runif(30, 5, 7)
)

mymat %>%
  as_tibble(rownames = "id") %>%
  pivot_longer(-id, names_to = "marker") %>%
  left_join(cutoff_df) %>%
  filter(value <= max & value >= min) %>%
  count(id, marker) %>%
  # group_by(marker) %>% # e.g. to make sum of 100 per marker
  mutate(n = n / sum(n) * 100) %>%
  pivot_wider(names_from = marker, values_from = n, values_fill = list(n = 0))
#> Joining, by = c("id", "marker")
#> # A tibble: 3 × 11
#>   id        a     c     d     e     g     h     i     j     b     f
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  3.77  7.55  1.89  5.66  1.89  3.77  5.66  0     0   
#> 2 id2    1.89  5.66  0     3.77  1.89  0     0    13.2   5.66  1.89
#> 3 id3    0     0     0     5.66  5.66  1.89  5.66 11.3   0     1.89

^{Created on 2022-03-30 by the reprex package (v2.0.0)}

回复收藏 0 原文

丢了幸福的猪 2025-01-24 20:07:36

这是一个基于 purrr 包的解决方案。我不确定它是否更干净，但它更短。

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##> + # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6

Here is a solution based on the purrr package. I'm not sure that it is cleaner but it is shorter.

library(purrr)
asplit(mymat,2) |>
  imap(~{
    with(filter(cutoff_df, marker == .y),
         outer(.x, min, ">=") &
         outer(.x, max, "<") &
         outer(names(.x), id, "=="))
  }) |>
  map(rowSums) |>
  map_dfr(~tapply(.x, names(.x), FUN = sum),
          .id = "marker")

##> + # A tibble: 10 × 4
##>    marker   id1   id2   id3
##>    <chr>  <dbl> <dbl> <dbl>
##>  1 a          2     1     0
##>  2 b          0     3     0
##>  3 c          2     3     0
##>  4 d          4     0     0
##>  5 e          1     2     3
##>  6 f          0     1     1
##>  7 g          3     1     3
##>  8 h          1     0     1
##>  9 i          2     0     3
##> 10 j          3     7     6

回复收藏 0 原文

盛装女皇 2025-01-24 20:07:36

基于dplyr的另一个可能的解决方案：

library(dplyr)

data.frame(id = rownames(mymat), mymat) %>% 
 group_by(id) %>% 
 summarise(across(everything(),
 ~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
      .x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>% 
 mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)

#> # A tibble: 3 x 11
#>   id        a     b     c     d     e     f     g     h     i     j
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  0     3.77  7.55  1.89  0     5.66  1.89  3.77  5.66
#> 2 id2    1.89  5.66  5.66  0     3.77  1.89  1.89  0     0    13.2 
#> 3 id3    0     0     0     0     5.66  1.89  5.66  1.89  5.66 11.3

Another possible solution, based on dplyr:

library(dplyr)

data.frame(id = rownames(mymat), mymat) %>% 
 group_by(id) %>% 
 summarise(across(everything(),
 ~ sum(.x >= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 3] &
      .x <= cutoff_df[(cutoff_df$id == cur_group()$id[1]) & (cur_column() == cutoff_df$marker), 4]))) %>% 
 mutate(aux = sum(cur_data()[,-1]), across(-id, ~ .x*100/aux[1]), aux = NULL)

#> # A tibble: 3 x 11
#>   id        a     b     c     d     e     f     g     h     i     j
#>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 id1    3.77  0     3.77  7.55  1.89  0     5.66  1.89  3.77  5.66
#> 2 id2    1.89  5.66  5.66  0     3.77  1.89  1.89  0     0    13.2 
#> 3 id3    0     0     0     0     5.66  1.89  5.66  1.89  5.66 11.3

回复收藏 0 原文

~没有更多了~