Dplyr:使用 summarize 和 across 计算列的平均值时,仅包含行值 > 0 的数据
我有一个基因表达分数的数据框(细胞 x 基因)。我还将每个细胞所属的簇存储为一列。
我想计算一组基因(列)的每个簇的平均表达值,但是,我只想在这些计算中包含 > 0 的值。
我的尝试如下:
test <- gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters) %>%
summarize(across(c(1:13), ~mean(. > 0)))
这会产生以下 tibble:
# A tibble: 16 x 14
Clusters SLC17A7 GAD1 GAD2 SLC32A1 GLI3 TNC PROX1 SCGN LHX6 NXPH1 MEIS2 ZFHX3 C3
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 C1 0.611 0.605 0.817 0.850 0.979 0.590 0.725 0.434 0.275 0.728 0.949 0.886 0.332
2 C10 0.484 0.401 0.434 0.401 0.791 0.387 0.431 0.362 0.204 0.652 0.715 0.580 0.186
3 C11 0.495 0.5 0.538 0.412 0.847 0.437 0.516 0.453 0.187 0.764 0.804 0.640 0.160
4 C12 0.807 0.626 0.559 0.703 0.942 0.448 0.644 0.366 0.403 0.702 0.917 0.859 0.228
5 C13 0.489 0.578 0.709 0.719 0.796 0.409 0.565 0.371 0.367 0.773 0.716 0.776 0.169
6 C14 0.541 0.347 0.330 0.388 0.731 0.281 0.438 0.279 0.198 0.577 0.777 0.633 0.128
7 C15 0.152 0.306 0.337 0.198 0.629 0.304 0.331 0.179 0.132 0.496 0.509 0.405 0.0556
8 C16 0.402 0.422 0.542 0.418 0.813 0.514 0.614 0.287 0.267 0.729 0.574 0.737 0.279
9 C2 0.152 0.480 0.458 0.297 0.883 0.423 0.511 0.195 0.152 0.722 0.692 0.598 0.0632
10 C3 0.585 0.679 0.659 0.711 0.996 0.886 0.801 0.297 0.305 0.789 0.992 0.963 0.346
11 C4 0.567 0.756 0.893 0.940 0.892 0.334 0.797 0.750 0.376 0.686 0.897 0.885 0.240
12 C5 0.220 0.516 0.560 0.625 0.673 0.250 0.466 0.275 0.358 0.590 0.571 0.641 0.112
13 C6 0.558 0.908 0.836 0.973 0.725 0.280 0.830 0.642 0.871 0.927 0.830 0.916 0.202
14 C7 0.380 0.743 0.749 0.772 0.825 0.415 0.480 0.211 0.199 0.614 0.860 0.901 0.135
15 C8 0.616 0.348 0.312 0.334 0.749 0.271 0.451 0.520 0.129 0.542 0.743 0.735 0.147
16 C9 0.406 0.381 0.400 0.265 0.679 0.266 0.465 0.233 0.0820 0.648 0.565 0.557 0.119
但是,当我对照(我假设的)单列上的类似过程进行检查时,我会得到不同的平均值。
这是 SLC17A7 的代码:
gene_scores_df2 %>%
select(SLC17A7, Clusters) %>%
group_by(Clusters) %>%
filter(SLC17A7 > 0) %>%
summarize(SLC17A7 = mean(SLC17A7))
结果:
# A tibble: 16 x 2
Clusters SLC17A7
<chr> <dbl>
1 C1 0.780
2 C10 1.42
3 C11 1.21
4 C12 1.64
5 C13 1.09
6 C14 1.83
7 C15 1.61
8 C16 0.968
9 C2 1.09
10 C3 0.512
11 C4 0.920
12 C5 1.53
13 C6 0.814
14 C7 1.22
15 C8 2.24
16 C9 1.72
我不确定上面的第一次尝试到底出了什么问题。
任何建议将不胜感激。
原始 df 的代码片段
# First 20 rows of:
gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters)
structure(list(SLC17A7 = c(0.273, 0.722, 0.699, 0.71, 0.333,
0.674, 0.63, 0.481, 0.274, 0.981, 0.586, 0.401, 0.325, 0.583,
0, 0.348, 0.287, 0, 0.295, 0.351), GAD1 = c(0.355, 0.392, 0.455,
0.34, 0.108, 1.169, 0, 0.426, 2.219, 0.099, 1.16, 0.332, 0.404,
0.284, 0, 5.297, 0.518, 0.027, 1.19, 0.346), GAD2 = c(0.12, 0.562,
0.337, 0.49, 0.095, 0.958, 0.09, 1.518, 1.464, 0.175, 0.419,
0.536, 0.501, 1.103, 0.343, 0, 0.247, 0, 0.635, 0.906), SLC32A1 = c(0,
0.97, 0.067, 0.999, 0.224, 1.04, 0, 2.569, 1.544, 0.059, 2.177,
3.227, 3.603, 1.229, 0.102, 2.421, 0.055, 0.826, 2.646, 0.228
), GLI3 = c(1.527, 0.487, 0.341, 3.352, 0.346, 0.694, 1.395,
0.767, 1.334, 1.373, 1.7, 2.216, 0.394, 1.029, 1.235, 0.55, 2.043,
4.469, 2.901, 4.139), TNC = c(0, 0, 0.448, 0.03, 1.377, 0.045,
0, 0.169, 0.123, 0, 0.188, 0.075, 0, 1.074, 0, 1.272, 0.124,
0.505, 0.173, 0.889), PROX1 = c(0, 0.075, 0.167, 0.782, 0.802,
0.561, 0.098, 0.734, 0.448, 1.645, 0.735, 0.795, 0.102, 0.317,
0.124, 0.324, 0.352, 0.236, 0.826, 0.308), SCGN = c(0.696, 0.234,
0, 0.202, 0.059, 0.162, 0, 0.653, 0.383, 0.42, 0.094, 0.779,
0.228, 0.248, 0.171, 0.089, 0.081, 0.026, 0.159, 0), LHX6 = c(0,
0, 0.134, 0.1, 0.829, 1.489, 0, 0.38, 0.526, 0.117, 0, 0.205,
0.299, 2.235, 0, 1.335, 0, 0.115, 0.454, 0.108), NXPH1 = c(0.792,
0.143, 0.175, 0.658, 0, 1.034, 1.798, 0.219, 0.896, 0.249, 1.336,
1.507, 0.26, 0.242, 1.235, 2.16, 0.235, 0.349, 1.297, 2.234),
MEIS2 = c(4.337, 0.559, 0.978, 1.972, 0.964, 0.657, 0.162,
0.827, 0.882, 0.157, 1.494, 1.171, 2.524, 2.458, 0.205, 0.448,
2.027, 4.767, 1.514, 2.077), ZFHX3 = c(1.48, 1.38, 2.323,
1.039, 1.343, 1.354, 0.238, 1.224, 1.676, 0.811, 0.316, 2.012,
2.298, 1.869, 0.201, 0.176, 1.829, 1.081, 0.522, 0.959),
C3 = c(0.52, 0.527, 0, 0.073, 0, 0.15, 0.094, 0.315, 0.174,
0, 0, 0.17, 0.165, 0, 0.237, 0, 0.091, 0.095, 0, 0.081),
Clusters = c("C12", "C5", "C13", "C4", "C12", "C13", "C13",
"C4", "C6", "C8", "C4", "C4", "C4", "C12", "C5", "C6", "C1",
"C3", "C4", "C3")), row.names = c(NA, -20L), groups = structure(list(
Clusters = c("C1", "C12", "C13", "C3", "C4", "C5", "C6",
"C8"), .rows = structure(list(17L, c(1L, 5L, 14L), c(3L,
6L, 7L), c(18L, 20L), c(4L, 8L, 11L, 12L, 13L, 19L), c(2L,
15L), c(9L, 16L), 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I have a dataframe of gene expression scores (cells x genes). I also have the cluster that each cell belongs to stored as a column.
I want to calculate the mean expression values per cluster for a group of genes (columns), however, I only want to include values > 0 in these calculations.
My attempt at this is as follows:
test <- gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters) %>%
summarize(across(c(1:13), ~mean(. > 0)))
This produces the following tibble:
# A tibble: 16 x 14
Clusters SLC17A7 GAD1 GAD2 SLC32A1 GLI3 TNC PROX1 SCGN LHX6 NXPH1 MEIS2 ZFHX3 C3
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 C1 0.611 0.605 0.817 0.850 0.979 0.590 0.725 0.434 0.275 0.728 0.949 0.886 0.332
2 C10 0.484 0.401 0.434 0.401 0.791 0.387 0.431 0.362 0.204 0.652 0.715 0.580 0.186
3 C11 0.495 0.5 0.538 0.412 0.847 0.437 0.516 0.453 0.187 0.764 0.804 0.640 0.160
4 C12 0.807 0.626 0.559 0.703 0.942 0.448 0.644 0.366 0.403 0.702 0.917 0.859 0.228
5 C13 0.489 0.578 0.709 0.719 0.796 0.409 0.565 0.371 0.367 0.773 0.716 0.776 0.169
6 C14 0.541 0.347 0.330 0.388 0.731 0.281 0.438 0.279 0.198 0.577 0.777 0.633 0.128
7 C15 0.152 0.306 0.337 0.198 0.629 0.304 0.331 0.179 0.132 0.496 0.509 0.405 0.0556
8 C16 0.402 0.422 0.542 0.418 0.813 0.514 0.614 0.287 0.267 0.729 0.574 0.737 0.279
9 C2 0.152 0.480 0.458 0.297 0.883 0.423 0.511 0.195 0.152 0.722 0.692 0.598 0.0632
10 C3 0.585 0.679 0.659 0.711 0.996 0.886 0.801 0.297 0.305 0.789 0.992 0.963 0.346
11 C4 0.567 0.756 0.893 0.940 0.892 0.334 0.797 0.750 0.376 0.686 0.897 0.885 0.240
12 C5 0.220 0.516 0.560 0.625 0.673 0.250 0.466 0.275 0.358 0.590 0.571 0.641 0.112
13 C6 0.558 0.908 0.836 0.973 0.725 0.280 0.830 0.642 0.871 0.927 0.830 0.916 0.202
14 C7 0.380 0.743 0.749 0.772 0.825 0.415 0.480 0.211 0.199 0.614 0.860 0.901 0.135
15 C8 0.616 0.348 0.312 0.334 0.749 0.271 0.451 0.520 0.129 0.542 0.743 0.735 0.147
16 C9 0.406 0.381 0.400 0.265 0.679 0.266 0.465 0.233 0.0820 0.648 0.565 0.557 0.119
However, when I check this against (what I assume is) a similar procedure on a single column I get different mean values.
Here is the code for SLC17A7:
gene_scores_df2 %>%
select(SLC17A7, Clusters) %>%
group_by(Clusters) %>%
filter(SLC17A7 > 0) %>%
summarize(SLC17A7 = mean(SLC17A7))
And the result:
# A tibble: 16 x 2
Clusters SLC17A7
<chr> <dbl>
1 C1 0.780
2 C10 1.42
3 C11 1.21
4 C12 1.64
5 C13 1.09
6 C14 1.83
7 C15 1.61
8 C16 0.968
9 C2 1.09
10 C3 0.512
11 C4 0.920
12 C5 1.53
13 C6 0.814
14 C7 1.22
15 C8 2.24
16 C9 1.72
I'm unsure what exactly is wrong with the first attempt above.
Any suggestions would be greatly appreciated.
Code snippet for the original df:
# First 20 rows of:
gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters)
structure(list(SLC17A7 = c(0.273, 0.722, 0.699, 0.71, 0.333,
0.674, 0.63, 0.481, 0.274, 0.981, 0.586, 0.401, 0.325, 0.583,
0, 0.348, 0.287, 0, 0.295, 0.351), GAD1 = c(0.355, 0.392, 0.455,
0.34, 0.108, 1.169, 0, 0.426, 2.219, 0.099, 1.16, 0.332, 0.404,
0.284, 0, 5.297, 0.518, 0.027, 1.19, 0.346), GAD2 = c(0.12, 0.562,
0.337, 0.49, 0.095, 0.958, 0.09, 1.518, 1.464, 0.175, 0.419,
0.536, 0.501, 1.103, 0.343, 0, 0.247, 0, 0.635, 0.906), SLC32A1 = c(0,
0.97, 0.067, 0.999, 0.224, 1.04, 0, 2.569, 1.544, 0.059, 2.177,
3.227, 3.603, 1.229, 0.102, 2.421, 0.055, 0.826, 2.646, 0.228
), GLI3 = c(1.527, 0.487, 0.341, 3.352, 0.346, 0.694, 1.395,
0.767, 1.334, 1.373, 1.7, 2.216, 0.394, 1.029, 1.235, 0.55, 2.043,
4.469, 2.901, 4.139), TNC = c(0, 0, 0.448, 0.03, 1.377, 0.045,
0, 0.169, 0.123, 0, 0.188, 0.075, 0, 1.074, 0, 1.272, 0.124,
0.505, 0.173, 0.889), PROX1 = c(0, 0.075, 0.167, 0.782, 0.802,
0.561, 0.098, 0.734, 0.448, 1.645, 0.735, 0.795, 0.102, 0.317,
0.124, 0.324, 0.352, 0.236, 0.826, 0.308), SCGN = c(0.696, 0.234,
0, 0.202, 0.059, 0.162, 0, 0.653, 0.383, 0.42, 0.094, 0.779,
0.228, 0.248, 0.171, 0.089, 0.081, 0.026, 0.159, 0), LHX6 = c(0,
0, 0.134, 0.1, 0.829, 1.489, 0, 0.38, 0.526, 0.117, 0, 0.205,
0.299, 2.235, 0, 1.335, 0, 0.115, 0.454, 0.108), NXPH1 = c(0.792,
0.143, 0.175, 0.658, 0, 1.034, 1.798, 0.219, 0.896, 0.249, 1.336,
1.507, 0.26, 0.242, 1.235, 2.16, 0.235, 0.349, 1.297, 2.234),
MEIS2 = c(4.337, 0.559, 0.978, 1.972, 0.964, 0.657, 0.162,
0.827, 0.882, 0.157, 1.494, 1.171, 2.524, 2.458, 0.205, 0.448,
2.027, 4.767, 1.514, 2.077), ZFHX3 = c(1.48, 1.38, 2.323,
1.039, 1.343, 1.354, 0.238, 1.224, 1.676, 0.811, 0.316, 2.012,
2.298, 1.869, 0.201, 0.176, 1.829, 1.081, 0.522, 0.959),
C3 = c(0.52, 0.527, 0, 0.073, 0, 0.15, 0.094, 0.315, 0.174,
0, 0, 0.17, 0.165, 0, 0.237, 0, 0.091, 0.095, 0, 0.081),
Clusters = c("C12", "C5", "C13", "C4", "C12", "C13", "C13",
"C4", "C6", "C8", "C4", "C4", "C4", "C12", "C5", "C6", "C1",
"C3", "C4", "C3")), row.names = c(NA, -20L), groups = structure(list(
Clusters = c("C1", "C12", "C13", "C3", "C4", "C5", "C6",
"C8"), .rows = structure(list(17L, c(1L, 5L, 14L), c(3L,
6L, 7L), c(18L, 20L), c(4L, 8L, 11L, 12L, 13L, 19L), c(2L,
15L), c(9L, 16L), 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
您想要的是:
~mean(.[. > 0])
而 ~mean(. > 0) 会检查每个元素是否大于 0,从而返回 TRUE/FALSE,然后给出这些底层 0/1 值的平均值(即大于 0 的值所占的比例)。相反,您想要过滤每一列,这可以通过通常的 []
方法来实现。
What you want is:
~mean(.[. > 0])
By contrast, ~mean(. > 0) checks whether each element is greater than 0 and thus returns TRUE/FALSE, and then gives you the mean of the underlying 0/1's (i.e. the proportion of values above 0). Instead, you want to filter each column, which can be achieved with the usual []
approach.