计算 df 中每个变量的皮尔逊相关性和 p 值,以便选择变量在 R 中执行逻辑回归

发布于 2025-01-10 02:12:22 字数 1103 浏览 0 评论 0原文

我有一个 df,我正在尝试用逻辑回归来预测变量 var12。在此之前,我想先选择在该模型中使用哪些变量。我想计算每个变量与 var12 之间的皮尔逊(Pearson)相关系数和 p 值,并可能把它们绘制成图,以便从视觉上检查线性关系。有人可以帮忙吗?非常感谢

# Example data as dput() output: a data.frame with 10 rows and the columns
# id and var1..var12. The expression is not assigned to a name here.
# Note: var12 (the outcome) is 1 in every row, and var1 and var8 are constant too.
structure(list(id = c(1, 3, 5, 10, 11, 13, 15, 17, 18, 21), 
    var1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), var2 = c(0.1, 
    0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15), var3 = c("D", 
    "D", "B", "B", "B", "E", "E", "C", "C", "B"), var4 = c(5L, 
    5L, 6L, 7L, 7L, 6L, 6L, 7L, 7L, 7L), var5 = c(0L, 0L, 2L, 
    0L, 0L, 2L, 2L, 0L, 0L, 0L), var6 = c(55L, 55L, 52L, 46L, 
    46L, 38L, 38L, 33L, 33L, 41L), var7 = c(50L, 50L, 50L, 
    50L, 50L, 50L, 50L, 68L, 68L, 50L), var8 = c("B12", "B12", 
    "B12", "B12", "B12", "B12", "B12", "B12", "B12", "B12"), 
    var9 = c("Regular", "Regular", "Diesel", "Diesel", "Diesel", 
    "Regular", "Regular", "Diesel", "Diesel", "Diesel"), var10 = c(1217L, 
    1217L, 54L, 76L, 76L, 3003L, 3003L, 137L, 137L, 60L), var11 = c("R82", 
    "R82", "R22", "R72", "R72", "R31", "R31", "R91", "R91", "R52"
    ), var12 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, 
10L), class = "data.frame")

I have a df and I am trying to perform logistic regression to predict the variable var12. Before that, I want to choose which variables I will use in this model. I want to calculate the Pearson correlation and p-value of every variable against var12, and perhaps plot them in order to check linearity visually as well. Can anyone help? Thank you very much.

# Example data as dput() output: a data.frame with 10 rows and the columns
# id and var1..var12. The expression is not assigned to a name here.
# Note: var12 (the outcome) is 1 in every row, and var1 and var8 are constant too.
structure(list(id = c(1, 3, 5, 10, 11, 13, 15, 17, 18, 21), 
    var1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), var2 = c(0.1, 
    0.77, 0.75, 0.09, 0.84, 0.52, 0.45, 0.27, 0.71, 0.15), var3 = c("D", 
    "D", "B", "B", "B", "E", "E", "C", "C", "B"), var4 = c(5L, 
    5L, 6L, 7L, 7L, 6L, 6L, 7L, 7L, 7L), var5 = c(0L, 0L, 2L, 
    0L, 0L, 2L, 2L, 0L, 0L, 0L), var6 = c(55L, 55L, 52L, 46L, 
    46L, 38L, 38L, 33L, 33L, 41L), var7 = c(50L, 50L, 50L, 
    50L, 50L, 50L, 50L, 68L, 68L, 50L), var8 = c("B12", "B12", 
    "B12", "B12", "B12", "B12", "B12", "B12", "B12", "B12"), 
    var9 = c("Regular", "Regular", "Diesel", "Diesel", "Diesel", 
    "Regular", "Regular", "Diesel", "Diesel", "Diesel"), var10 = c(1217L, 
    1217L, 54L, 76L, 76L, 3003L, 3003L, 137L, 137L, 60L), var11 = c("R82", 
    "R82", "R22", "R72", "R72", "R31", "R31", "R91", "R91", "R52"
    ), var12 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, 
10L), class = "data.frame")

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

仅此而已 2025-01-17 02:12:22

您需要把字符型变量转换成虚拟变量(dummy variables)。注意:我在您的示例中重新设置了 var12 的值,因为您给出的数据里 var12 每一行都是 1,无法计算它与其他变量的相关性。

image

                                                                                                                                                                                                                                                                                                   
#reset var12 to have better data to play with
#set.seed() fixes the random number generator so the draw is reproducible
#(base R has no seed() function)
set.seed(100)
#draw one 0/1 outcome per row of df instead of hard-coding the row count
df$var12 <- sample(c(0, 1), nrow(df), replace = TRUE)

#libraries needed
library(dplyr)
library(tidyverse)
library(fastDummies)

#isolate the predictors and create dummy variables
cor_data <- df %>%
    select(!c("id", "var12")) %>%   #drop the row id and the outcome
    dummy_cols() %>%                #create dummy variables
    select(where(is.numeric))       #isolate just numeric variables
                                    #(select_if() is superseded by where())


#collect variables and the pairs to a dataframe
#one row per predictor: `vars` holds the column name and `data` is a
#list-column holding a two-column tibble (data.x = predictor, data.y = var12),
#which is the layout myModel() reads
data_all <- names(cor_data) %>%
    map(function(colname) {
        pair <- tibble(data.x = cor_data[[colname]], data.y = df[["var12"]])
        tibble(vars = colname, data = list(pair))
    }) %>%
    bind_rows() %>%     #bind once instead of growing with rbind() in a loop
    group_by(vars)      #one group per variable, as group_by() + nest() produced

#define model
#Runs a correlation test between the paired columns of one nested data set.
#  df:     data frame with the columns `data.x` (predictor) and `data.y` (outcome)
#  method: correlation method handed to cor.test(); defaults to "spearman" as
#          before, pass "pearson" for the Pearson correlation
#Returns the object produced by cor.test().
myModel <- function(df, method = c("spearman", "pearson", "kendall")){
    method <- match.arg(method)
    #[[ ]] matches column names exactly, while $ allows partial matching on data frames
    cor.test(df[["data.x"]], df[["data.y"]], method = method)
}

#apply myModel to every nested x/y pair and store each test result in `model`
data_all <- mutate(data_all, model = lapply(data, myModel))

#use broom::glance to add the results to the dataframe
#each test result becomes a one-row summary that is spread into regular columns
glance <- data_all %>%
    mutate(glance = map(model, broom::glance)) %>%
    unnest(glance)      #`.drop` was deprecated in tidyr 1.0.0, so it is omitted

#extract the stats you want and graph
#NOTE(review): `statistic` is the test statistic reported for each test, not the
#correlation coefficient itself (that should be in `estimate`) - confirm which
#one the plot is meant to show
glance %>%
    ungroup() %>%
    #keep `vars` explicitly: the plot needs it, and select() only retains it
    #implicitly while the data is still grouped by it
    select(vars, statistic, p.value) %>%
    ggplot(aes(x = reorder(vars, desc(statistic)), y = statistic)) +
    geom_col() +
    geom_text(aes(label = paste0("p.value = ", round(p.value, 5))),
              color = "white", hjust = 1.2, angle = 90) +
    labs(x = "variable", title = "Correlation with var12") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))


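You'll want to make dummy variables out of the character vars. Note that I reset the values of var12 in your example, because in the posted data var12 is 1 in every row, so no correlation with it can be computed.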

image

                                                                                                                                                                                                                                                                                                   
#reset var12 to have better data to play with
#set.seed() fixes the random number generator so the draw is reproducible
#(base R has no seed() function)
set.seed(100)
#draw one 0/1 outcome per row of df instead of hard-coding the row count
df$var12 <- sample(c(0, 1), nrow(df), replace = TRUE)

#libraries needed
library(dplyr)
library(tidyverse)
library(fastDummies)

#isolate the predictors and create dummy variables
cor_data <- df %>%
    select(!c("id", "var12")) %>%   #drop the row id and the outcome
    dummy_cols() %>%                #create dummy variables
    select(where(is.numeric))       #isolate just numeric variables
                                    #(select_if() is superseded by where())


#collect variables and the pairs to a dataframe
#one row per predictor: `vars` holds the column name and `data` is a
#list-column holding a two-column tibble (data.x = predictor, data.y = var12),
#which is the layout myModel() reads
data_all <- names(cor_data) %>%
    map(function(colname) {
        pair <- tibble(data.x = cor_data[[colname]], data.y = df[["var12"]])
        tibble(vars = colname, data = list(pair))
    }) %>%
    bind_rows() %>%     #bind once instead of growing with rbind() in a loop
    group_by(vars)      #one group per variable, as group_by() + nest() produced

#define model
#Runs a correlation test between the paired columns of one nested data set.
#  df:     data frame with the columns `data.x` (predictor) and `data.y` (outcome)
#  method: correlation method handed to cor.test(); defaults to "spearman" as
#          before, pass "pearson" for the Pearson correlation
#Returns the object produced by cor.test().
myModel <- function(df, method = c("spearman", "pearson", "kendall")){
    method <- match.arg(method)
    #[[ ]] matches column names exactly, while $ allows partial matching on data frames
    cor.test(df[["data.x"]], df[["data.y"]], method = method)
}

#apply myModel to every nested x/y pair and store each test result in `model`
data_all <- mutate(data_all, model = lapply(data, myModel))

#use broom::glance to add the results to the dataframe
#each test result becomes a one-row summary that is spread into regular columns
glance <- data_all %>%
    mutate(glance = map(model, broom::glance)) %>%
    unnest(glance)      #`.drop` was deprecated in tidyr 1.0.0, so it is omitted

#extract the stats you want and graph
#NOTE(review): `statistic` is the test statistic reported for each test, not the
#correlation coefficient itself (that should be in `estimate`) - confirm which
#one the plot is meant to show
glance %>%
    ungroup() %>%
    #keep `vars` explicitly: the plot needs it, and select() only retains it
    #implicitly while the data is still grouped by it
    select(vars, statistic, p.value) %>%
    ggplot(aes(x = reorder(vars, desc(statistic)), y = statistic)) +
    geom_col() +
    geom_text(aes(label = paste0("p.value = ", round(p.value, 5))),
              color = "white", hjust = 1.2, angle = 90) +
    labs(x = "variable", title = "Correlation with var12") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))


~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文