R：如何绘制散点图进行统计检验？

发布于 2025-01-14 11:08:41 字数 3458 浏览 3 评论 0原文

我想创建一个基因散点图，其中 x 轴为突变患者的数量，y 轴为统计检验的结果（以探索是否有任何突变基因在有反应或无反应的患者中富集）。

本质上，x 轴代表 res.sig 数据帧的 Group1 列中值为 Responder 的患者数量，并用 Hugo_Symbol 信息进行标注。y 轴表示 res.sig 数据帧中 fdr 值的 -log10。

# Clinical enrichment of mutated genes against the "Response" clinical feature.
# NOTE(review): clinicalEnrichment() and `d` are defined outside this snippet
# (presumably maftools and a MAF object) -- confirm.
response.ce = clinicalEnrichment(maf=d, clinicalFeature="Response")

# Keep only the group-wise comparisons whose p-value is below 0.05.
res.sig <- response.ce$groupwise_comparision[p_value < 0.05]

# x: for the Responder rows, the first character of n_mutated_group1.
# NOTE(review): substr(..., 1, 1) returns a one-character string, so x is a
# character vector (not numeric), and a count of 10 or more would be cut down
# to its first digit.
library(dplyr)
x <- res.sig %>%
  filter(Group1=='Responder') %>%
  mutate(first_letter = substr(n_mutated_group1, 1, 1)) %>%
  pull(first_letter)

# ann: gene symbols of the same Responder rows, used as point labels.
ann <- res.sig %>%
  filter(Group1=="Responder") %>%
  pull(Hugo_Symbol)

# y: raw fdr values of the same Responder rows (not yet -log10 transformed).
y <- res.sig %>%
  filter(Group1=="Responder") %>%
  pull(fdr)


# Scatter plot written to Rplot06.png: points are drawn at (x, -log10(y)).
png("Rplot06.png")
plot(x, -log10(y), type="p", xlim=c(0,25), main="Scatterplot of statistically significant mutated genes", xlab="Number of mutated patients", ylab="-log10(fdr)", pch=19)
# NOTE(review): labels are positioned at y - 1 using the raw fdr, while the
# points use -log10(y); for fdr values below 1 this is negative, so the labels
# are not placed next to the points.
text(x, y-1, labels=ann)
dev.off()

我的图仅显示三个点（没有标签），尽管我期望有 7 个带有标签的点（对应于 x 和 y 轴中的 7 个值）。

res.sig

> dput(res.sig)
structure(list(Hugo_Symbol = c("ERCC2", "ERCC2", "AKAP9", "AKAP9", 
"HERC1", "HERC1", "HECTD1", "HECTD1", "MACF1", "MACF1", "MROH2B", 
"MROH2B", "KMT2C", "KMT2C"), Group1 = c("Non-Responder", "Responder", 
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder", 
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder", 
"Non-Responder", "Responder"), Group2 = c("Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", "9", 
"0 of 25", "6", "0 of 25", "6", "0 of 25", "6", "0 of 25", "6", 
"0 of 25", "6", "1 of 25", "7"), n_mutated_group2 = c("9 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "7 of 25", 
"1 of 25"), p_value = c(0.00163083541184905, 0.00163083541184905, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.0487971536957187, 0.0487971536957187
), OR = c(0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0.111488645279478, 
8.96952328636894), OR_low = c(0, 2.56647319276964, 0, 1.33358819424024, 
0, 1.33358819424024, 0, 1.33358819424024, 0, 1.33358819424024, 
0, 1.33358819424024, 0.00228988507629356, 1.0079479819766), OR_high = c(0.38963976043749, 
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.749856668137133, 
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.992114690322592, 
436.703138665198), fdr = c(0.109265972593886, 0.109265972593886, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.467058471087594, 0.467058471087594
)), row.names = c(NA, -14L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000002adab171ef0>, index = structure(integer(0), "`__Group1`" = c(1L, 
3L, 5L, 7L, 9L, 11L, 13L, 2L, 4L, 6L, 8L, 10L, 12L, 14L)))

原文

I want to create a scatter plot of genes with the number of mutated patients on the x-axis and the results of a statistical test (to explore if any mutated genes are enriched in patients who either responded or not) on the y-axis.

Essentially, the x-axis represents the number of patients with the Responder value in Group1 column of the res.sig dataframe and is labelled with the Hugo_Symbol information. The y-axis represents the -log10 fdr value in the res.sig dataframe.

# Clinical enrichment of mutated genes against the "Response" clinical feature.
# NOTE(review): clinicalEnrichment() and `d` are defined outside this snippet
# (presumably maftools and a MAF object) -- confirm.
response.ce = clinicalEnrichment(maf=d, clinicalFeature="Response")

# Keep only the group-wise comparisons whose p-value is below 0.05.
res.sig <- response.ce$groupwise_comparision[p_value < 0.05]

# x: for the Responder rows, the first character of n_mutated_group1.
# NOTE(review): substr(..., 1, 1) returns a one-character string, so x is a
# character vector (not numeric), and a count of 10 or more would be cut down
# to its first digit.
library(dplyr)
x <- res.sig %>%
  filter(Group1=='Responder') %>%
  mutate(first_letter = substr(n_mutated_group1, 1, 1)) %>%
  pull(first_letter)

# ann: gene symbols of the same Responder rows, used as point labels.
ann <- res.sig %>%
  filter(Group1=="Responder") %>%
  pull(Hugo_Symbol)

# y: raw fdr values of the same Responder rows (not yet -log10 transformed).
y <- res.sig %>%
  filter(Group1=="Responder") %>%
  pull(fdr)


# Scatter plot written to Rplot06.png: points are drawn at (x, -log10(y)).
png("Rplot06.png")
plot(x, -log10(y), type="p", xlim=c(0,25), main="Scatterplot of statistically significant mutated genes", xlab="Number of mutated patients", ylab="-log10(fdr)", pch=19)
# NOTE(review): labels are positioned at y - 1 using the raw fdr, while the
# points use -log10(y); for fdr values below 1 this is negative, so the labels
# are not placed next to the points.
text(x, y-1, labels=ann)
dev.off()

My plot only shows three points (no labels), although I'm expecting 7 points with labels (corresponding to the 7 values on the x and y axes).

res.sig

> dput(res.sig)
structure(list(Hugo_Symbol = c("ERCC2", "ERCC2", "AKAP9", "AKAP9", 
"HERC1", "HERC1", "HECTD1", "HECTD1", "MACF1", "MACF1", "MROH2B", 
"MROH2B", "KMT2C", "KMT2C"), Group1 = c("Non-Responder", "Responder", 
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder", 
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder", 
"Non-Responder", "Responder"), Group2 = c("Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", 
"Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", "9", 
"0 of 25", "6", "0 of 25", "6", "0 of 25", "6", "0 of 25", "6", 
"0 of 25", "6", "1 of 25", "7"), n_mutated_group2 = c("9 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25", 
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "7 of 25", 
"1 of 25"), p_value = c(0.00163083541184905, 0.00163083541184905, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618, 
0.022289766970618, 0.022289766970618, 0.0487971536957187, 0.0487971536957187
), OR = c(0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0.111488645279478, 
8.96952328636894), OR_low = c(0, 2.56647319276964, 0, 1.33358819424024, 
0, 1.33358819424024, 0, 1.33358819424024, 0, 1.33358819424024, 
0, 1.33358819424024, 0.00228988507629356, 1.0079479819766), OR_high = c(0.38963976043749, 
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.749856668137133, 
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.992114690322592, 
436.703138665198), fdr = c(0.109265972593886, 0.109265972593886, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568, 
0.248902397838568, 0.248902397838568, 0.467058471087594, 0.467058471087594
)), row.names = c(NA, -14L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000002adab171ef0>, index = structure(integer(0), "`__Group1`" = c(1L, 
3L, 5L, 7L, 9L, 11L, 13L, 2L, 4L, 6L, 8L, 10L, 12L, 14L)))

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫描二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

挽梦忆笙歌 2025-01-21 11:08:41

我的图仅显示三个点（没有标签），尽管我期望有 7 个带有标签的点（对应于 x 和 y 轴中的 7 个值）。

所有 7 个点都在图上，只是其中 5 个点的坐标完全相同，因此它们被绘制在彼此之上，看起来像是“缺失”了。不重复的点只有 3 个。

如果您使用 ggplot ，那么您可以“抖动”这些点以添加一些噪音：

# Jittered scatter plot of the Responder genes: identical (x, -log10(fdr))
# pairs are nudged apart so every point is visible, and each point carries
# its gene symbol. Relies on x, y and ann as built in the question's code.
library(ggplot2)  # the question's code only attaches dplyr

# One shared, seeded jitter: the same seed gives points and labels identical
# offsets, so each label stays next to its own point (and the plot is
# reproducible without touching the global RNG).
jit <- position_jitter(height = 0.05, seed = 6)

data.frame(x = as.numeric(x), y = -log10(y), gene = ann) %>%
  ggplot(aes(x = x, y = y, label = gene)) +
    geom_point(position = jit, alpha = 0.5, size = 3) +
    geom_text(position = jit, vjust = -1, size = 3) +
    xlim(0, 25) +
    labs(x = "Number of mutated patients", y = "-log10(fdr)")

My plot only shows three points (no labels), although I'm expecting 7 points with labels (corresponding to the 7 values on the x and y axes).

All 7 points are there, it's just that 5 of them are identical and therefore they are plotted on top of each other resulting in "missing" points. There are only 3 unique points.

If you use ggplot then you can "jitter" the points to add some noise:

# Jittered scatter plot of the Responder genes: identical (x, -log10(fdr))
# pairs are nudged apart so every point is visible, and each point carries
# its gene symbol. Relies on x, y and ann as built in the question's code.
library(ggplot2)  # the question's code only attaches dplyr

# One shared, seeded jitter: the same seed gives points and labels identical
# offsets, so each label stays next to its own point (and the plot is
# reproducible without touching the global RNG).
jit <- position_jitter(height = 0.05, seed = 6)

data.frame(x = as.numeric(x), y = -log10(y), gene = ann) %>%
  ggplot(aes(x = x, y = y, label = gene)) +
    geom_point(position = jit, alpha = 0.5, size = 3) +
    geom_text(position = jit, vjust = -1, size = 3) +
    xlim(0, 25) +
    labs(x = "Number of mutated patients", y = "-log10(fdr)")

回复收藏 0 原文

~没有更多了~