对排名数据使用“cor.test()”

发布于 2025-01-19 03:35:36 字数 1660 浏览 0 评论 0原文

我想使用排名数据进行 Spearman 相关性测试。我如何使用cor.test()来做到这一点？我不希望该函数对数据进行重新排序。

此外，数据需要采用什么形式？从帮助来看，与相关矩阵相比，它似乎是原始数据。

考虑这个例子，

## Hollander & Wolfe (1973), p. 187f.
## Assessment of tuna quality.  We compare the Hunter L measure of
##  lightness to the averages of consumer panel scores (recoded as
##  integer values from 1 to 6 and averaged over 80 such values) in
##  9 lots of canned tuna.
library(tidyverse)
A <- tibble(
x = c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1),
y = c( 2.6,  3.1,  2.5,  5.0,  3.6,  4.0,  5.2,  2.8,  3.8)
) %>% 
mutate(rank_x = rank(x),
rank_y = rank(y)
)

Spearman 相关系数被定义为排序变量之间的 Pearson 相关性，


cor(A$x, A$y, method = "spearman")
#[1] 0.6
cor(A$rank_x, A$rank_y, method = "pearson")
#[1] 0.6

那么 cor.test() 又如何呢？我可以使用排名数据作为其输入吗？

 x1 <-  cor.test(A$x, A$y, method = "spearman")
 x1
#   Spearman's rank correlation rho
#
# data:  A$x and A$y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho 
# 0.6 

 x2 <-  cor.test(A$rank_x, A$rank_y, method = "pearson")
x2
# Pearson's product-moment correlation
# data:  A$rank_x and A$rank_y
# t = 2, df = 7, p-value = 0.09
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.11  0.90
# sample estimates:
# cor 
# 0.6 

x3 <- cor.test(A$rank_x, A$rank_y, method = "spearman")
# Spearman's rank correlation rho
#
# data:  A$rank_x and A$rank_y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho 
# 0.6

原文

I would like to do a Spearman correlation test using rank data. How can I do this with cor.test()? I don't want the function to rerank the data.

Additionally, what form does the data need to be in? From the help, it seems to be the raw data as compared to a correlation matrix.

Consider this example

## Hollander & Wolfe (1973), p. 187f.
## Assessment of tuna quality.  We compare the Hunter L measure of
##  lightness to the averages of consumer panel scores (recoded as
##  integer values from 1 to 6 and averaged over 80 such values) in
##  9 lots of canned tuna.
library(tidyverse)
A <- tibble(
x = c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1),
y = c( 2.6,  3.1,  2.5,  5.0,  3.6,  4.0,  5.2,  2.8,  3.8)
) %>% 
mutate(rank_x = rank(x),
rank_y = rank(y)
)

Spearman's correlation coefficient is defined as Pearson's correlation between ranked variables


cor(A$x, A$y, method = "spearman")
#[1] 0.6
cor(A$rank_x, A$rank_y, method = "pearson")
#[1] 0.6

what about cor.test()? Can I use the rank data as its input?

 x1 <-  cor.test(A$x, A$y, method = "spearman")
 x1
#   Spearman's rank correlation rho
#
# data:  A$x and A$y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho 
# 0.6 

 x2 <-  cor.test(A$rank_x, A$rank_y, method = "pearson")
x2
# Pearson's product-moment correlation
# data:  A$rank_x and A$rank_y
# t = 2, df = 7, p-value = 0.09
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.11  0.90
# sample estimates:
# cor 
# 0.6 

x3 <- cor.test(A$rank_x, A$rank_y, method = "spearman")
# Spearman's rank correlation rho
#
# data:  A$rank_x and A$rank_y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho 
# 0.6

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

相思故 2025-01-26 03:35:36

是的，您应该使用 method = Spearman 来获取排名数据或原始数据。如果使用排名数据，则不会在函数中对数据重新排名。

正如帮助文件所暗示的，使用 method=Pearson 和排名数据对排名进行 Pearson 相关性检验，该检验遵循 t 分布。然而，由于排名不是连续变量，因此这种方法是不正确的。

getAnywhere(cor.test.default)
A single object matching ‘cor.test.default’ was found
It was found in the following places
  registered S3 method for cor.test from namespace stats
  namespace:stats
with value

function (x, y, alternative = c("two.sided", "less", 
    "greater"), method = c("pearson", "kendall", 
    "spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE, 
    ...) 
{
    alternative <- match.arg(alternative)
    method <- match.arg(method)
    DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
    if (!is.numeric(x)) 
        stop("'x' must be a numeric vector")
    if (!is.numeric(y)) 
        stop("'y' must be a numeric vector")
    if (length(x) != length(y)) 
        stop("'x' and 'y' must have the same length")
    OK <- complete.cases(x, y)
    x <- x[OK]
    y <- y[OK]
    n <- length(x)
    NVAL <- 0
    conf.int <- FALSE
    if (method == "pearson") {
        if (n < 3L) 
            stop("not enough finite observations")
        method <- "Pearson's product-moment correlation"
        names(NVAL) <- "correlation"
        r <- cor(x, y)
        df <- n - 2L
        ESTIMATE <- c(cor = r)
        PARAMETER <- c(df = df)
        STATISTIC <- c(t = sqrt(df) * r/sqrt(1 - r^2))
        if (n > 3) {
            if (!missing(conf.level) && (length(conf.level) != 
                1 || !is.finite(conf.level) || conf.level < 0 || 
                conf.level > 1)) 
                stop("'conf.level' must be a single number between 0 and 1")
            conf.int <- TRUE
            z <- atanh(r)
            sigma <- 1/sqrt(n - 3)
            cint <- switch(alternative, less = c(-Inf, z + sigma * 
                qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level), 
                Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 + 
                conf.level)/2))
            cint <- tanh(cint)
            attr(cint, "conf.level") <- conf.level
        }
        PVAL <- switch(alternative, less = pt(STATISTIC, df), 
            greater = pt(STATISTIC, df, lower.tail = FALSE), 
            two.sided = 2 * min(pt(STATISTIC, df), pt(STATISTIC, 
                df, lower.tail = FALSE)))
    }
    else {
        if (n < 2) 
            stop("not enough finite observations")
        PARAMETER <- NULL
        TIES <- (min(length(unique(x)), length(unique(y))) < 
            n)
        if (method == "kendall") {
            method <- "Kendall's rank correlation tau"
            names(NVAL) <- "tau"
            r <- cor(x, y, method = "kendall")
            ESTIMATE <- c(tau = r)
            if (!is.finite(ESTIMATE)) {
                ESTIMATE[] <- NA
                STATISTIC <- c(T = NA)
                PVAL <- NA
            }
            else {
                if (is.null(exact)) 
                  exact <- (n < 50)
                if (exact && !TIES) {
                  q <- round((r + 1) * n * (n - 1)/4)
                  STATISTIC <- c(T = q)
                  pkendall <- function(q, n) .Call(C_pKendall, 
                    q, n)
                  PVAL <- switch(alternative, two.sided = {
                    if (q > n * (n - 1)/4) p <- 1 - pkendall(q - 
                      1, n) else p <- pkendall(q, n)
                    min(2 * p, 1)
                  }, greater = 1 - pkendall(q - 1, n), less = pkendall(q, 
                    n))
                }
                else {
                  xties <- table(x[duplicated(x)]) + 1
                  yties <- table(y[duplicated(y)]) + 1
                  T0 <- n * (n - 1)/2
                  T1 <- sum(xties * (xties - 1))/2
                  T2 <- sum(yties * (yties - 1))/2
                  S <- r * sqrt((T0 - T1) * (T0 - T2))
                  v0 <- n * (n - 1) * (2 * n + 5)
                  vt <- sum(xties * (xties - 1) * (2 * xties + 
                    5))
                  vu <- sum(yties * (yties - 1) * (2 * yties + 
                    5))
                  v1 <- sum(xties * (xties - 1)) * sum(yties * 
                    (yties - 1))
                  v2 <- sum(xties * (xties - 1) * (xties - 2)) * 
                    sum(yties * (yties - 1) * (yties - 2))
                  var_S <- (v0 - vt - vu)/18 + v1/(2 * n * (n - 
                    1)) + v2/(9 * n * (n - 1) * (n - 2))
                  if (exact && TIES) 
                    warning("Cannot compute exact p-value with ties")
                  if (continuity) 
                    S <- sign(S) * (abs(S) - 1)
                  STATISTIC <- c(z = S/sqrt(var_S))
                  PVAL <- switch(alternative, less = pnorm(STATISTIC), 
                    greater = pnorm(STATISTIC, lower.tail = FALSE), 
                    two.sided = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC, 
                      lower.tail = FALSE)))
                }
            }
        }
        else {
            method <- "Spearman's rank correlation rho"
            if (is.null(exact)) 
                exact <- TRUE
            names(NVAL) <- "rho"
            r <- cor(rank(x), rank(y))
            ESTIMATE <- c(rho = r)
            if (!is.finite(ESTIMATE)) {
                ESTIMATE[] <- NA
                STATISTIC <- c(S = NA)
                PVAL <- NA
            }
            else {
                pspearman <- function(q, n, lower.tail = TRUE) {
                  if (n <= 1290 && exact) 
                    .Call(C_pRho, round(q) + 2 * lower.tail, 
                      n, lower.tail)
                  else {
                    den <- (n * (n^2 - 1))/6
                    if (continuity) 
                      den <- den + 1
                    r <- 1 - q/den
                    pt(r/sqrt((1 - r^2)/(n - 2)), df = n - 2, 
                      lower.tail = !lower.tail)
                  }
                }
                q <- (n^3 - n) * (1 - r)/6
                STATISTIC <- c(S = q)
                if (TIES && exact) {
                  exact <- FALSE
                  warning("Cannot compute exact p-value with ties")
                }
                PVAL <- switch(alternative, two.sided = {
                  p <- if (q > (n^3 - n)/6) pspearman(q, n, lower.tail = FALSE) else pspearman(q, 
                    n, lower.tail = TRUE)
                  min(2 * p, 1)
                }, greater = pspearman(q, n, lower.tail = TRUE), 
                  less = pspearman(q, n, lower.tail = FALSE))
            }
        }
    }
    RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, 
        p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL, 
        alternative = alternative, method = method, data.name = DNAME)
    if (conf.int) 
        RVAL <- c(RVAL, list(conf.int = cint))
    class(RVAL) <- "htest"
    RVAL
}
<bytecode: 0x0000018603fa9418>
<environment: namespace:stats>

Yes, you should use method = Spearman for ranked or original data. If rank data is used, the data is not reranked in the function.

As the help file implies, using method=Pearson with rank data conducts a Pearson's correlation test on the ranks, which would follow a t-distribution. However, since the ranks are not continuous variables, this approach is not correct.

getAnywhere(cor.test.default)
A single object matching ‘cor.test.default’ was found
It was found in the following places
  registered S3 method for cor.test from namespace stats
  namespace:stats
with value

function (x, y, alternative = c("two.sided", "less", 
    "greater"), method = c("pearson", "kendall", 
    "spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE, 
    ...) 
{
    alternative <- match.arg(alternative)
    method <- match.arg(method)
    DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
    if (!is.numeric(x)) 
        stop("'x' must be a numeric vector")
    if (!is.numeric(y)) 
        stop("'y' must be a numeric vector")
    if (length(x) != length(y)) 
        stop("'x' and 'y' must have the same length")
    OK <- complete.cases(x, y)
    x <- x[OK]
    y <- y[OK]
    n <- length(x)
    NVAL <- 0
    conf.int <- FALSE
    if (method == "pearson") {
        if (n < 3L) 
            stop("not enough finite observations")
        method <- "Pearson's product-moment correlation"
        names(NVAL) <- "correlation"
        r <- cor(x, y)
        df <- n - 2L
        ESTIMATE <- c(cor = r)
        PARAMETER <- c(df = df)
        STATISTIC <- c(t = sqrt(df) * r/sqrt(1 - r^2))
        if (n > 3) {
            if (!missing(conf.level) && (length(conf.level) != 
                1 || !is.finite(conf.level) || conf.level < 0 || 
                conf.level > 1)) 
                stop("'conf.level' must be a single number between 0 and 1")
            conf.int <- TRUE
            z <- atanh(r)
            sigma <- 1/sqrt(n - 3)
            cint <- switch(alternative, less = c(-Inf, z + sigma * 
                qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level), 
                Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 + 
                conf.level)/2))
            cint <- tanh(cint)
            attr(cint, "conf.level") <- conf.level
        }
        PVAL <- switch(alternative, less = pt(STATISTIC, df), 
            greater = pt(STATISTIC, df, lower.tail = FALSE), 
            two.sided = 2 * min(pt(STATISTIC, df), pt(STATISTIC, 
                df, lower.tail = FALSE)))
    }
    else {
        if (n < 2) 
            stop("not enough finite observations")
        PARAMETER <- NULL
        TIES <- (min(length(unique(x)), length(unique(y))) < 
            n)
        if (method == "kendall") {
            method <- "Kendall's rank correlation tau"
            names(NVAL) <- "tau"
            r <- cor(x, y, method = "kendall")
            ESTIMATE <- c(tau = r)
            if (!is.finite(ESTIMATE)) {
                ESTIMATE[] <- NA
                STATISTIC <- c(T = NA)
                PVAL <- NA
            }
            else {
                if (is.null(exact)) 
                  exact <- (n < 50)
                if (exact && !TIES) {
                  q <- round((r + 1) * n * (n - 1)/4)
                  STATISTIC <- c(T = q)
                  pkendall <- function(q, n) .Call(C_pKendall, 
                    q, n)
                  PVAL <- switch(alternative, two.sided = {
                    if (q > n * (n - 1)/4) p <- 1 - pkendall(q - 
                      1, n) else p <- pkendall(q, n)
                    min(2 * p, 1)
                  }, greater = 1 - pkendall(q - 1, n), less = pkendall(q, 
                    n))
                }
                else {
                  xties <- table(x[duplicated(x)]) + 1
                  yties <- table(y[duplicated(y)]) + 1
                  T0 <- n * (n - 1)/2
                  T1 <- sum(xties * (xties - 1))/2
                  T2 <- sum(yties * (yties - 1))/2
                  S <- r * sqrt((T0 - T1) * (T0 - T2))
                  v0 <- n * (n - 1) * (2 * n + 5)
                  vt <- sum(xties * (xties - 1) * (2 * xties + 
                    5))
                  vu <- sum(yties * (yties - 1) * (2 * yties + 
                    5))
                  v1 <- sum(xties * (xties - 1)) * sum(yties * 
                    (yties - 1))
                  v2 <- sum(xties * (xties - 1) * (xties - 2)) * 
                    sum(yties * (yties - 1) * (yties - 2))
                  var_S <- (v0 - vt - vu)/18 + v1/(2 * n * (n - 
                    1)) + v2/(9 * n * (n - 1) * (n - 2))
                  if (exact && TIES) 
                    warning("Cannot compute exact p-value with ties")
                  if (continuity) 
                    S <- sign(S) * (abs(S) - 1)
                  STATISTIC <- c(z = S/sqrt(var_S))
                  PVAL <- switch(alternative, less = pnorm(STATISTIC), 
                    greater = pnorm(STATISTIC, lower.tail = FALSE), 
                    two.sided = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC, 
                      lower.tail = FALSE)))
                }
            }
        }
        else {
            method <- "Spearman's rank correlation rho"
            if (is.null(exact)) 
                exact <- TRUE
            names(NVAL) <- "rho"
            r <- cor(rank(x), rank(y))
            ESTIMATE <- c(rho = r)
            if (!is.finite(ESTIMATE)) {
                ESTIMATE[] <- NA
                STATISTIC <- c(S = NA)
                PVAL <- NA
            }
            else {
                pspearman <- function(q, n, lower.tail = TRUE) {
                  if (n <= 1290 && exact) 
                    .Call(C_pRho, round(q) + 2 * lower.tail, 
                      n, lower.tail)
                  else {
                    den <- (n * (n^2 - 1))/6
                    if (continuity) 
                      den <- den + 1
                    r <- 1 - q/den
                    pt(r/sqrt((1 - r^2)/(n - 2)), df = n - 2, 
                      lower.tail = !lower.tail)
                  }
                }
                q <- (n^3 - n) * (1 - r)/6
                STATISTIC <- c(S = q)
                if (TIES && exact) {
                  exact <- FALSE
                  warning("Cannot compute exact p-value with ties")
                }
                PVAL <- switch(alternative, two.sided = {
                  p <- if (q > (n^3 - n)/6) pspearman(q, n, lower.tail = FALSE) else pspearman(q, 
                    n, lower.tail = TRUE)
                  min(2 * p, 1)
                }, greater = pspearman(q, n, lower.tail = TRUE), 
                  less = pspearman(q, n, lower.tail = FALSE))
            }
        }
    }
    RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, 
        p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL, 
        alternative = alternative, method = method, data.name = DNAME)
    if (conf.int) 
        RVAL <- c(RVAL, list(conf.int = cint))
    class(RVAL) <- "htest"
    RVAL
}
<bytecode: 0x0000018603fa9418>
<environment: namespace:stats>

回复收藏 0 原文

~没有更多了~