通过多元正态分布模拟25个变量的高度相关数据

发布于 2025-01-22 21:45:18 字数 1731 浏览 0 评论 0原文

我尝试使用4,000个观测值模拟高度相关(平均绝对相关性〜0.7)变量。

但是,我无法获得Sigma正确定,因此我需要使用技巧来获得Sigma PD。因此,我失去了大部分初始相关性,从〜0.7降至〜0.25。

原因可能是我需要正面的负相关变量。如果所有相关性仅为正相关,我就可以模拟25个变量的高度相关变量,但是我平均需要50/50的正/负数。

我在《守则》中进行了区分,以获取信息丰富的(相关)&非信息(嘈杂)变量。

有解决这个问题的解决方案吗?

谢谢你的宝贵时间。

library("matrixcalc")
library("MASS")
library("Matrix")

p_inf <- 4
p_unf <- 21
N <- 4000

# Number of unique correlations in correlation matrix
n_inf_sample = (p_inf^2-p_inf)/2
n_unf_unf_and_inf_unf = p_inf*p_unf+(p_unf^2-p_unf)/2

Corr_inf_sample <- sample(runif(n_inf_sample,-0.3,0.3))

Corr_unf_sample <- sample(c(runif(n_unf_unf_and_inf_unf, -0.99,0.99),
                            runif(n_unf_unf_and_inf_unf,0.7,0.9),
                            runif(n_unf_unf_and_inf_unf,-0.9,-0.7)
                            ), n_unf_unf_and_inf_unf)

Corr_matrix <- matrix(ncol = p_inf+p_unf, nrow = p_inf+p_unf,0)
Corr_matrix[upper.tri(Corr_matrix, diag= FALSE)] <- c(Corr_inf_sample,Corr_unf_sample)
Corr_matrix[lower.tri(Corr_matrix, diag= FALSE)] <- t(Corr_matrix)[lower.tri(t(Corr_matrix))]
diag(Corr_matrix) <- 1

#  Mu and Sigma 
mu<- c(runif(p_inf+p_unf, -3,3)) 
sigma<- Corr_matrix

if (is.positive.definite(as.matrix(sigma))) {
  df <- as.data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma))
  print("Sigma is Positive Definite")
}
if (!is.positive.definite(as.matrix(sigma))) {
  sigma2 <- nearPD(sigma)
  df<-as.data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma2$mat)) 
  print("Sigma is not Positive Definite, we create PD matrix")
}

print(c("Mean absolute accorelation sigma: ", round(mean(abs(sigma)),3)))
print(c("Mean absolute correlation variables: " , round(mean(abs(cor(df))),3)))

I try to simulate highly correlated (average absolute correlation of ~0.7) variables for 25 variables using 4,000 observations.

However, I can't get Sigma positive definite, and thus I need to use a trick to get sigma PD. And hereby I lose most of the initial correlation, I go from ~0.7 to ~0.25.

The reason might be that I need positive ánd negative correlated variables. I am able to simulate highly correlated variables for 25 variables if all correlations are only positive, however I need 50/50 positive/negative on average.

I make a distinction in the code for informative (relevant) & uninformative (noisy) variables.

Is there a solution to this issue?

Thank you for your time.

library("matrixcalc")
library("MASS")
library("Matrix")

p_inf <- 4
p_unf <- 21
N <- 4000

# Number of unique correlations in correlation matrix
n_inf_sample = (p_inf^2-p_inf)/2
n_unf_unf_and_inf_unf = p_inf*p_unf+(p_unf^2-p_unf)/2

Corr_inf_sample <- sample(runif(n_inf_sample,-0.3,0.3))

Corr_unf_sample <- sample(c(runif(n_unf_unf_and_inf_unf, -0.99,0.99),
                            runif(n_unf_unf_and_inf_unf,0.7,0.9),
                            runif(n_unf_unf_and_inf_unf,-0.9,-0.7)
                            ), n_unf_unf_and_inf_unf)

Corr_matrix <- matrix(ncol = p_inf+p_unf, nrow = p_inf+p_unf,0)
Corr_matrix[upper.tri(Corr_matrix, diag= FALSE)] <- c(Corr_inf_sample,Corr_unf_sample)
Corr_matrix[lower.tri(Corr_matrix, diag= FALSE)] <- t(Corr_matrix)[lower.tri(t(Corr_matrix))]
diag(Corr_matrix) <- 1

#  Mu and Sigma 
mu<- c(runif(p_inf+p_unf, -3,3)) 
sigma<- Corr_matrix

if (is.positive.definite(as.matrix(sigma))) {
  df <- as.data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma))
  print("Sigma is Positive Definite")
}
if (!is.positive.definite(as.matrix(sigma))) {
  sigma2 <- nearPD(sigma)
  df<-as.data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma2$mat)) 
  print("Sigma is not Positive Definite, we create PD matrix")
}

print(c("Mean absolute accorelation sigma: ", round(mean(abs(sigma)),3)))
print(c("Mean absolute correlation variables: " , round(mean(abs(cor(df))),3)))

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文