R:计算数据集子集的相关性?

发布于 2024-10-26 17:02:48 字数 4072 浏览 7 评论 0原文

我有一个包含 20 个变量 v1 - v20 的数据集。现在我想使用 cor(...) 计算 v2 与 v10 至 v15 之间,以及 v3 与 v10 至 v15 之间的相关性。最好的方法是什么?我是否必须对每个变量对执行此操作,即使用

cor(v2, v10)
cor(v2, v11)
cor(v2, v12)
and so on?

以下是实际数据集:

   > dput(dataset)
structure(list(Number = 1:15, Question.1.1 = c(3L, 4L, 5L, 5L, 
4L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L), Question.1.2 = c(1L, 
2L, 1L, 1L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.2.1 = c(5L, 
3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.2.2 = c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.3.1 = c(2L, 
NA, 4L, 5L, 4L, 3L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Question.3.2 = c(2L, 
NA, 1L, 1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 1L), Question.4.1 = c(3L, 
2L, 5L, 2L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 5L, 2L), Question.4.2 = c(2L, 
2L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 4L), Question.5.1 = c(5L, 
3L, 5L, 3L, 4L, 4L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 4L, 4L), Question.5.2 = c(2L, 
2L, 1L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), Question.6.1 = c(5L, 
2L, 2L, 2L, 3L, 2L, 3L, 1L, 3L, 3L, 5L, 4L, 3L, 3L, 1L), Question.6.2 = c(2L, 
3L, 2L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 2L, 2L, 1L), Question.7.1 = c(5L, 
2L, 5L, 5L, 5L, 3L, 5L, 5L, 2L, 4L, 5L, 5L, 5L, 4L, 5L), Question.7.2 = c(1L, 
4L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.8.1 = c(4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.8.2 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.9.1 = c(5L, 
3L, 5L, 4L, 4L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), Question.9.2 = c(1L, 
1L, 1L, 2L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 2L, 2L, 1L, 2L), AQ.1 = c(5L, 
5L, 5L, 1L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 5L, 5L, 3L), AQ.2 = c(2L, 
5L, 2L, 1L, 2L, 5L, 2L, 1L, 5L, 1L, 1L, 4L, 2L, 3L, 3L), Task.1 = c(5L, 
2L, 5L, 1L, 4L, 5L, 5L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 5L), Task.2 = c(4L, 
3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.3 = c(4L, 
3L, 4L, 1L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Task.4 = c(5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.5 = c(5L, 
4L, 5L, 4L, 4L, 5L, 4L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L), GQ.1 = c(4L, 
2L, 2L, 5L, 4L, 4L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 5L, 4L), GQ.2 = c(4L, 
4L, 4L, 5L, 5L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 5L, 4L, 3L), GQ.3 = c(5L, 
3L, 2L, 5L, 3L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 4L), GQ.4 = c(5L, 
2L, 1L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 4L, 4L, 1L), GQ.5 = c(4L, 
3L, 4L, 5L, 5L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), GQ.6 = c(2L, 
2L, 1L, 1L, 2L, 1L, 4L, 1L, 4L, 5L, 5L, 1L, 5L, 1L, 5L), GQ.7 = c(4L, 
5L, 5L, 5L, 4L, 2L, 3L, 5L, 3L, 5L, 5L, 2L, 5L, 3L, 2L), GQ.8 = c(2L, 
4L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), GQ.9 = c(3L, 
5L, 2L, 3L, 4L, 4L, 5L, 3L, 4L, 4L, 3L, 3L, 4L, 2L, 2L), GQ.10 = c(3L, 
4L, 1L, 2L, 3L, 4L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 2L), Feature.1 = c(4L, 
4L, 2L, 3L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 4L, 5L, 3L, 4L), Feature.2 = c(4L, 
4L, 2L, 1L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 3L, 5L, 3L, 2L), Feature.3 = c(3L, 
2L, 1L, 2L, 5L, 5L, 2L, 4L, 2L, 4L, 4L, 5L, 2L, 4L, 2L), Feature.4 = c(3L, 
3L, 3L, 4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 3L, 4L, 3L, 3L), Feature.5 = c(2L, 
2L, 3L, 3L, 4L, 3L, 4L, 4L, 2L, 4L, 3L, 4L, 5L, 3L, 1L), Feature.6 = c(5L, 
5L, 1L, 1L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 4L), Feature.7 = c(5L, 
3L, 2L, 5L, 4L, 5L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 4L)), .Names = c("Number", 
"Question.1.1", "Question.1.2", "Question.2.1", "Question.2.2", 
"Question.3.1", "Question.3.2", "Question.4.1", "Question.4.2", 
"Question.5.1", "Question.5.2", "Question.6.1", "Question.6.2", 
"Question.7.1", "Question.7.2", "Question.8.1", "Question.8.2", 
"Question.9.1", "Question.9.2", "AQ.1", "AQ.2", "Task.1", "Task.2", 
"Task.3", "Task.4", "Task.5", "GQ.1", "GQ.2", "GQ.3", "GQ.4", 
"GQ.5", "GQ.6", "GQ.7", "GQ.8", "GQ.9", "GQ.10", "Feature.1", 
"Feature.2", "Feature.3", "Feature.4", "Feature.5", "Feature.6", 
"Feature.7"), class = "data.frame", row.names = c(NA, -15L))

I have a dataset with 20 variables v1 - v20. Now I'd like to use cor(...) to calculate the correlation between v2 and v10 until v15 and v3 and v10 until v15. What's the best way to do this? Do I have to do this for each variable pair using

cor(v2, v10)
cor(v2, v11)
cor(v2, v12)
and so on?

Here is the actual dataset:

   > dput(dataset)
structure(list(Number = 1:15, Question.1.1 = c(3L, 4L, 5L, 5L, 
4L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L), Question.1.2 = c(1L, 
2L, 1L, 1L, 4L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.2.1 = c(5L, 
3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.2.2 = c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.3.1 = c(2L, 
NA, 4L, 5L, 4L, 3L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Question.3.2 = c(2L, 
NA, 1L, 1L, 2L, 2L, 1L, 4L, 3L, 1L, 1L, 1L, 2L, 2L, 1L), Question.4.1 = c(3L, 
2L, 5L, 2L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 5L, 2L), Question.4.2 = c(2L, 
2L, 1L, 2L, 2L, 1L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 4L), Question.5.1 = c(5L, 
3L, 5L, 3L, 4L, 4L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 4L, 4L), Question.5.2 = c(2L, 
2L, 1L, 1L, 3L, 2L, 1L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 1L), Question.6.1 = c(5L, 
2L, 2L, 2L, 3L, 2L, 3L, 1L, 3L, 3L, 5L, 4L, 3L, 3L, 1L), Question.6.2 = c(2L, 
3L, 2L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 2L, 2L, 1L), Question.7.1 = c(5L, 
2L, 5L, 5L, 5L, 3L, 5L, 5L, 2L, 4L, 5L, 5L, 5L, 4L, 5L), Question.7.2 = c(1L, 
4L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), Question.8.1 = c(4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Question.8.2 = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Question.9.1 = c(5L, 
3L, 5L, 4L, 4L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), Question.9.2 = c(1L, 
1L, 1L, 2L, 2L, 1L, 2L, 1L, 4L, 2L, 1L, 2L, 2L, 1L, 2L), AQ.1 = c(5L, 
5L, 5L, 1L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 5L, 5L, 3L), AQ.2 = c(2L, 
5L, 2L, 1L, 2L, 5L, 2L, 1L, 5L, 1L, 1L, 4L, 2L, 3L, 3L), Task.1 = c(5L, 
2L, 5L, 1L, 4L, 5L, 5L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 5L), Task.2 = c(4L, 
3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.3 = c(4L, 
3L, 4L, 1L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L, 4L, 4L), Task.4 = c(5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Task.5 = c(5L, 
4L, 5L, 4L, 4L, 5L, 4L, 5L, 4L, 5L, 5L, 5L, 5L, 5L, 4L), GQ.1 = c(4L, 
2L, 2L, 5L, 4L, 4L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 5L, 4L), GQ.2 = c(4L, 
4L, 4L, 5L, 5L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 5L, 4L, 3L), GQ.3 = c(5L, 
3L, 2L, 5L, 3L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 4L), GQ.4 = c(5L, 
2L, 1L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 5L, 5L, 4L, 4L, 1L), GQ.5 = c(4L, 
3L, 4L, 5L, 5L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 3L), GQ.6 = c(2L, 
2L, 1L, 1L, 2L, 1L, 4L, 1L, 4L, 5L, 5L, 1L, 5L, 1L, 5L), GQ.7 = c(4L, 
5L, 5L, 5L, 4L, 2L, 3L, 5L, 3L, 5L, 5L, 2L, 5L, 3L, 2L), GQ.8 = c(2L, 
4L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L), GQ.9 = c(3L, 
5L, 2L, 3L, 4L, 4L, 5L, 3L, 4L, 4L, 3L, 3L, 4L, 2L, 2L), GQ.10 = c(3L, 
4L, 1L, 2L, 3L, 4L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 2L), Feature.1 = c(4L, 
4L, 2L, 3L, 4L, 4L, 5L, 5L, 4L, 5L, 5L, 4L, 5L, 3L, 4L), Feature.2 = c(4L, 
4L, 2L, 1L, 5L, 4L, 5L, 5L, 5L, 4L, 4L, 3L, 5L, 3L, 2L), Feature.3 = c(3L, 
2L, 1L, 2L, 5L, 5L, 2L, 4L, 2L, 4L, 4L, 5L, 2L, 4L, 2L), Feature.4 = c(3L, 
3L, 3L, 4L, 3L, 4L, 5L, 5L, 4L, 4L, 4L, 3L, 4L, 3L, 3L), Feature.5 = c(2L, 
2L, 3L, 3L, 4L, 3L, 4L, 4L, 2L, 4L, 3L, 4L, 5L, 3L, 1L), Feature.6 = c(5L, 
5L, 1L, 1L, 5L, 5L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 4L, 4L), Feature.7 = c(5L, 
3L, 2L, 5L, 4L, 5L, 3L, 5L, 4L, 5L, 5L, 5L, 5L, 4L, 4L)), .Names = c("Number", 
"Question.1.1", "Question.1.2", "Question.2.1", "Question.2.2", 
"Question.3.1", "Question.3.2", "Question.4.1", "Question.4.2", 
"Question.5.1", "Question.5.2", "Question.6.1", "Question.6.2", 
"Question.7.1", "Question.7.2", "Question.8.1", "Question.8.2", 
"Question.9.1", "Question.9.2", "AQ.1", "AQ.2", "Task.1", "Task.2", 
"Task.3", "Task.4", "Task.5", "GQ.1", "GQ.2", "GQ.3", "GQ.4", 
"GQ.5", "GQ.6", "GQ.7", "GQ.8", "GQ.9", "GQ.10", "Feature.1", 
"Feature.2", "Feature.3", "Feature.4", "Feature.5", "Feature.6", 
"Feature.7"), class = "data.frame", row.names = c(NA, -15L))

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(4)

错爱 2024-11-02 17:02:48

我可能误解了这个问题......但你为什么不在数据框上运行 cor 呢?

例如:

data <- data.frame(q1=sample(1:5, 15, rep=1), 
          q2=sample(1:5, 15, rep=1), 
          q3=sample(1:5, 15, rep=1), 
          q4=sample(1:5, 15, rep=1), 
          q5=sample(1:5, 15, rep=1), 
          q6=sample(1:5, 15, rep=1), 
          q7=sample(1:5, 15, rep=1), 
          q8=sample(1:5, 15, rep=1), 
          q9=sample(1:5, 15, rep=1), 
          q10=sample(1:5, 15, rep=1))

print(cor(data))

您甚至可以

image(cor(data), x=1:10, y=1:10, zlim=c(-1,1))

如果您只需要某些相关值,只需将 cor 的结果放入变量中并提取您需要的结果。

例如,如果我们想要第 2 列与第 5 至 10 列的相关性,可以这样做:

corrs <- cor(data)
print(corrs[2, 5:10]) # or corrs[5:10, 2], the correlation matrix is symmetric

I may have misunderstood the problem... but why don't you just run cor on the data frame?

For instance:

data <- data.frame(q1=sample(1:5, 15, rep=1), 
          q2=sample(1:5, 15, rep=1), 
          q3=sample(1:5, 15, rep=1), 
          q4=sample(1:5, 15, rep=1), 
          q5=sample(1:5, 15, rep=1), 
          q6=sample(1:5, 15, rep=1), 
          q7=sample(1:5, 15, rep=1), 
          q8=sample(1:5, 15, rep=1), 
          q9=sample(1:5, 15, rep=1), 
          q10=sample(1:5, 15, rep=1))

print(cor(data))

You can even

image(cor(data), x=1:10, y=1:10, zlim=c(-1,1))

If you just need certain correlation values, just put the result of cor in a variable and pull out the results you need.

For instance, if we want the correlation of column 2 with columns 5 to 10, we do:

corrs <- cor(data)
print(corrs[2, 5:10]) # or corrs[5:10, 2], the correlation matrix is symmetric
花落人断肠 2024-11-02 17:02:48

显式对数据集进行子集化并在该数据集上运行相关命令。假设您的变量排序正确,请尝试如下操作:

cor(dat[,c(2, 10:15)])[,1]
cor(dat[,c(3, 10:15)])[,1]

如果它们未排序,您只需对它们进行排序或用引号命名变量即可。例如:

cor(dat[,c('v3', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15')])[,1]

Subset the dataset explicitly and run the correlation command on that dataset. Assuming your variables are ordered properly, try something like this:

cor(dat[,c(2, 10:15)])[,1]
cor(dat[,c(3, 10:15)])[,1]

If they are not ordered, you'll just need to order them or name the variables in quotes instead. E.g.:

cor(dat[,c('v3', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15')])[,1]
冷血 2024-11-02 17:02:48

使用 subset 命令:

dtf <- subset(mtcars, select = c(mpg, hp, wt))
cor(dtf)
         mpg         hp         wt
mpg  1.0000000 -0.7761684 -0.8676594
hp  -0.7761684  1.0000000  0.6587479
wt  -0.8676594  0.6587479  1.0000000

或者使用 psych 包和 corr.test 函数:

library(psych)
corr.test(dtf)
Call:corr.test(x = dtf)
Correlation matrix 
      mpg    hp    wt
mpg  1.00 -0.78 -0.87
hp  -0.78  1.00  0.66
wt  -0.87  0.66  1.00
Sample Size 
    mpg hp wt
mpg  32 32 32
hp   32 32 32
wt   32 32 32
Probability value 
    mpg hp wt
mpg   0  0  0
hp    0  0  0
wt    0  0  0

Use subset command:

dtf <- subset(mtcars, select = c(mpg, hp, wt))
cor(dtf)
         mpg         hp         wt
mpg  1.0000000 -0.7761684 -0.8676594
hp  -0.7761684  1.0000000  0.6587479
wt  -0.8676594  0.6587479  1.0000000

Or use psych package and corr.test function:

library(psych)
corr.test(dtf)
Call:corr.test(x = dtf)
Correlation matrix 
      mpg    hp    wt
mpg  1.00 -0.78 -0.87
hp  -0.78  1.00  0.66
wt  -0.87  0.66  1.00
Sample Size 
    mpg hp wt
mpg  32 32 32
hp   32 32 32
wt   32 32 32
Probability value 
    mpg hp wt
mpg   0  0  0
hp    0  0  0
wt    0  0  0
妞丶爷亲个 2024-11-02 17:02:48

这个问题似乎是由在整个数据框上运行 cor 导致的信息过载引起的。我用得不多,但以 ggplot 闻名的 Hadley Wickham 所写的 plyr 包似乎为子集化和管理输出提供了一些优雅的解决方案。

The question seems motivated by the information overload that results from running cor on the entire data frame. I have not used it much, but the plyr package by Hadley Wickham, of ggplot fame, seems to offer some elegant solutions for subsetting and managing the output.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文