R:如何绘制散点图进行统计检验?
我想创建一个基因散点图,其中 x 轴为突变患者的数量,y 轴为统计测试的结果(以探索是否有任何突变基因在有反应或无反应的患者中富集)。
本质上,x 轴代表 res.sig
数据帧的 Group1
列中具有 Responder
值的患者数量,并用Hugo_Symbol
信息。 y 轴表示 res.sig
数据帧中的 -log10 fdr
值。
# Clinical enrichment
response.ce = clinicalEnrichment(maf=d, clinicalFeature="Response")
# Significant associations p-value < 0.05
res.sig <- response.ce$groupwise_comparision[p_value < 0.05]
# Genes with the number of mutated patients
library(dplyr)
x <- res.sig %>%
filter(Group1=='Responder') %>%
mutate(first_letter = substr(n_mutated_group1, 1, 1)) %>%
pull(first_letter)
ann <- res.sig %>%
filter(Group1=="Responder") %>%
pull(Hugo_Symbol)
# FDR
y <- res.sig %>%
filter(Group1=="Responder") %>%
pull(fdr)
# Scatter plot
png("Rplot06.png")
plot(x, -log10(y), type="p", xlim=c(0,25), main="Scatterplot of statistically significant mutated genes", xlab="Number of mutated patients", ylab="-log10(fdr)", pch=19)
text(x, y-1, labels=ann)
dev.off()
我的图仅显示三个点(没有标签),尽管我期望有 7 个带有标签的点(对应于 x 和 y 轴中的 7 个值)。
res.sig
> dput(res.sig)
structure(list(Hugo_Symbol = c("ERCC2", "ERCC2", "AKAP9", "AKAP9",
"HERC1", "HERC1", "HECTD1", "HECTD1", "MACF1", "MACF1", "MROH2B",
"MROH2B", "KMT2C", "KMT2C"), Group1 = c("Non-Responder", "Responder",
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder",
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder",
"Non-Responder", "Responder"), Group2 = c("Rest", "Rest", "Rest",
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest",
"Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", "9",
"0 of 25", "6", "0 of 25", "6", "0 of 25", "6", "0 of 25", "6",
"0 of 25", "6", "1 of 25", "7"), n_mutated_group2 = c("9 of 25",
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25",
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "7 of 25",
"1 of 25"), p_value = c(0.00163083541184905, 0.00163083541184905,
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618,
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618,
0.022289766970618, 0.022289766970618, 0.0487971536957187, 0.0487971536957187
), OR = c(0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0.111488645279478,
8.96952328636894), OR_low = c(0, 2.56647319276964, 0, 1.33358819424024,
0, 1.33358819424024, 0, 1.33358819424024, 0, 1.33358819424024,
0, 1.33358819424024, 0.00228988507629356, 1.0079479819766), OR_high = c(0.38963976043749,
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.749856668137133,
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.992114690322592,
436.703138665198), fdr = c(0.109265972593886, 0.109265972593886,
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568,
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568,
0.248902397838568, 0.248902397838568, 0.467058471087594, 0.467058471087594
)), row.names = c(NA, -14L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000002adab171ef0>, index = structure(integer(0), "`__Group1`" = c(1L,
3L, 5L, 7L, 9L, 11L, 13L, 2L, 4L, 6L, 8L, 10L, 12L, 14L)))
I want to create a scatter plot of genes with the number of mutated patients on the x-axis and the results of a statistical test (to explore if any mutated genes are enriched in patients who either responded or not) on the y-axis.
Essentially, the x-axis represents the number of patients with the Responder
value in Group1
column of the res.sig
dataframe and is labelled with the Hugo_Symbol
information. The y-axis represents the -log10 fdr
value in the res.sig
dataframe.
# Clinical enrichment
response.ce = clinicalEnrichment(maf=d, clinicalFeature="Response")
# Significant associations p-value < 0.05
res.sig <- response.ce$groupwise_comparision[p_value < 0.05]
# Genes with the number of mutated patients
library(dplyr)
x <- res.sig %>%
filter(Group1=='Responder') %>%
mutate(first_letter = substr(n_mutated_group1, 1, 1)) %>%
pull(first_letter)
ann <- res.sig %>%
filter(Group1=="Responder") %>%
pull(Hugo_Symbol)
# FDR
y <- res.sig %>%
filter(Group1=="Responder") %>%
pull(fdr)
# Scatter plot
png("Rplot06.png")
plot(x, -log10(y), type="p", xlim=c(0,25), main="Scatterplot of statistically significant mutated genes", xlab="Number of mutated patients", ylab="-log10(fdr)", pch=19)
text(x, y-1, labels=ann)
dev.off()
My plot only show three points (no labels), although I'm expecting 7 points with labels (corresponding with the 7 values in the x and y axis).
res.sig
> dput(res.sig)
structure(list(Hugo_Symbol = c("ERCC2", "ERCC2", "AKAP9", "AKAP9",
"HERC1", "HERC1", "HECTD1", "HECTD1", "MACF1", "MACF1", "MROH2B",
"MROH2B", "KMT2C", "KMT2C"), Group1 = c("Non-Responder", "Responder",
"Non-Responder", "Responder", "Non-Responder", "Responder", "Non-Responder",
"Responder", "Non-Responder", "Responder", "Non-Responder", "Responder",
"Non-Responder", "Responder"), Group2 = c("Rest", "Rest", "Rest",
"Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest", "Rest",
"Rest", "Rest", "Rest"), n_mutated_group1 = c("0 of 25", "9",
"0 of 25", "6", "0 of 25", "6", "0 of 25", "6", "0 of 25", "6",
"0 of 25", "6", "1 of 25", "7"), n_mutated_group2 = c("9 of 25",
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "6 of 25",
"0 of 25", "6 of 25", "0 of 25", "6 of 25", "0 of 25", "7 of 25",
"1 of 25"), p_value = c(0.00163083541184905, 0.00163083541184905,
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618,
0.022289766970618, 0.022289766970618, 0.022289766970618, 0.022289766970618,
0.022289766970618, 0.022289766970618, 0.0487971536957187, 0.0487971536957187
), OR = c(0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0, Inf, 0.111488645279478,
8.96952328636894), OR_low = c(0, 2.56647319276964, 0, 1.33358819424024,
0, 1.33358819424024, 0, 1.33358819424024, 0, 1.33358819424024,
0, 1.33358819424024, 0.00228988507629356, 1.0079479819766), OR_high = c(0.38963976043749,
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.749856668137133,
Inf, 0.749856668137133, Inf, 0.749856668137133, Inf, 0.992114690322592,
436.703138665198), fdr = c(0.109265972593886, 0.109265972593886,
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568,
0.248902397838568, 0.248902397838568, 0.248902397838568, 0.248902397838568,
0.248902397838568, 0.248902397838568, 0.467058471087594, 0.467058471087594
)), row.names = c(NA, -14L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x000002adab171ef0>, index = structure(integer(0), "`__Group1`" = c(1L,
3L, 5L, 7L, 9L, 11L, 13L, 2L, 4L, 6L, 8L, 10L, 12L, 14L)))
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
所有 7 个点都在那里,只是其中 5 个点是相同的,因此它们被绘制在彼此之上,导致“缺失”点。独特点只有3个。
如果您使用 ggplot ,那么您可以“抖动”这些点以添加一些噪音:
All 7 points are there, it's just that 5 of them are identical and therefore they are plotted on top of each other resulting in "missing" points. There are only 3 unique points.
If you use
ggplot
then you can "jitter" the points to add some noise: