在r中抽样

发布于 2025-01-24 16:01:01 字数 921 浏览 1 评论 0原文

给定一个矩阵，例如MAT

> set.seed(1)
> mat <- matrix(rbinom(100,1,0.5),10,10)
> rownames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:nrow(mat)))
> colnames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:ncol(mat)))
> mat
    A1 A2 A3 B4 B5 B6 B7 A8 B9 B10
B1   0  0  1  0  1  0  1  0  0   0
B2   0  0  0  1  1  1  0  1  1   0
B3   1  1  1  0  1  0  0  0  0   1
A4   1  0  0  0  1  0  0  0  0   1
A5   0  1  0  1  1  0  1  0  1   1
A6   1  0  0  1  1  0  0  1  0   1
A7   1  1  0  1  0  0  0  1  1   0
B8   1  1  0  0  0  1  1  0  0   0
A9   1  0  1  1  1  1  0  1  0   1
A10  0  1  0  0  1  0  1  1  0   1

我想采样表单的子播放：

   A  B
A  0  1
B  1  0

编辑：具体来说，submatrix应在a-row/b column中包含一个1，b row/a中的1个 - 列和其他两个单元格。

我可以通过随机选择一个A行和一个B行，然后随机选择一个A柱和一个B柱，然后检查其是否具有所需模式来做到这一点。但是，我试图找到一种更有效的方法，即使在大/稀疏的矩阵中也可以工作。谢谢！

原文

Given a matrix like mat

> set.seed(1)
> mat <- matrix(rbinom(100,1,0.5),10,10)
> rownames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:nrow(mat)))
> colnames(mat) <- paste0(sample(LETTERS[1:2],10,replace=T),c(1:ncol(mat)))
> mat
    A1 A2 A3 B4 B5 B6 B7 A8 B9 B10
B1   0  0  1  0  1  0  1  0  0   0
B2   0  0  0  1  1  1  0  1  1   0
B3   1  1  1  0  1  0  0  0  0   1
A4   1  0  0  0  1  0  0  0  0   1
A5   0  1  0  1  1  0  1  0  1   1
A6   1  0  0  1  1  0  0  1  0   1
A7   1  1  0  1  0  0  0  1  1   0
B8   1  1  0  0  0  1  1  0  0   0
A9   1  0  1  1  1  1  0  1  0   1
A10  0  1  0  0  1  0  1  1  0   1

I want to sample submatrices of the form:

   A  B
A  0  1
B  1  0

EDIT: Specifically, the submatrix should contain a 1 in the A-row/B-column, a 1 in the B-row/A-column, and 0s in the other two cells.

I can do this by randomly selecting one A-row and one B-row, then randomly selecting one A-column and one B-column, then checking whether it has the desired pattern. But, I'm trying to find a more efficient method that will work even in large/sparse matrices. Thanks!

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

温柔少女心 2025-01-31 16:01:01

您可以在维度名称上进行示例：

set.seed(1)
dims <- lapply(dimnames(mat), \(x) c(sample(which(grepl("A", x)), 1), sample(which(grepl("B", x)), 1)))

mat[dims[[1]], dims[[2]]]

   A3 B4
A4  0  0
B8  0  0

You can sample on the dimension names:

set.seed(1)
dims <- lapply(dimnames(mat), \(x) c(sample(which(grepl("A", x)), 1), sample(which(grepl("B", x)), 1)))

mat[dims[[1]], dims[[2]]]

   A3 B4
A4  0  0
B8  0  0

回复收藏 0 原文

旧伤慢歌 2025-01-31 16:01:01

一个人可以列举包含值1的元素的所有可能的成对组合，然后消除共享一个行或列和对的对，该对不会导致0 submatrix main的元素对角线。来自每个剩余对的结果行和列将定义满足所需模式的所有可能的子膜，这些都将是微不足道的样品。对于1元素（例如，在可用的内存上挖掘）的矩阵，这是可行的。

对于具有大量1元素的稀疏矩阵，获得高效，矢量化解决方案的一种直接方法也是拒绝的：1的示例对元素> 元素的元素元素如果相应的主角元素不是0，则拒绝。下面的解决方案假定<代码> 0 比1更多的元素。（如果相反的话，则应修改为主要对角线的两个0元素进行采样，如果抗异构元素不是1。）拒绝率将取决于主要在稀疏基质的密度上。示例矩阵相当致密，因此其排斥率相对较高。

library(data.table)
library(Matrix)

set.seed(1)
m <- matrix(rbinom(100,1,0.5),10,10)
n <- 20L # sample 20 pairs (before rejection)
m <- as(m, "ngTMatrix")
mIdx <- matrix(sample(length(m@i), 2L*n, TRUE), ncol = 2)
(data.table(
  row1 = m@i[mIdx[,1]],
  col1 = m@j[mIdx[,2]],
  row2 = m@i[mIdx[,2]],
  col2 = m@j[mIdx[,1]]
) + 1L)[
  row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), n)] + m[matrix(c(row2, col2), n)])
]
#>    row1 col1 row2 col2
#> 1:    1    4    2    7
#> 2:   10    6    2   10
#> 3:    7    7    8    9

在这里，它作为一个函数实现，该函数返回有或没有替换的指定数量的样本。

sampleSubMat <- function(m, n, replace = FALSE, maxIter = 10L) {
  # convert m to a sparse matrix in triplet format if it's not already
  if (!grepl("TMatrix", class(m))) m <- as(1*m, "dgTMatrix")
  nLeft <- n
  # over-sample based on dimensions and density of the matrix
  mult <- 1.1/(1 - length(m@i)/prod(dim(m)))^2/prod(1 - 1/(dim(m - 1)))
  iter <- 1L
  
  if (replace) { # sampling with replacement (duplicates allowed)
    # more efficient to store individual data.table objects from each
    # iteration in a list, then rbindlist at the end
    lDT <- vector("list", maxIter)
    
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        # print("Max iterations reached")
        return(rbindlist(lDT[1:(iter - 1L)])[1:n])
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      lDT[[iter]] <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(lDT[[iter]])) {
        mult <- 1.1*mult*nLeft/nrow(lDT[[iter]])
        nLeft <- nLeft - nrow(lDT[[iter]])
        iter <- iter + 1L
      } else {
        # no pattern found, double the samples for the next iteration
        mult <- mult*2
      }
    }
    rbindlist(lDT[1:(iter - 1L)])[1:n]
  } else { # sampling without replacement (no duplicates allowed)
    # rbindlist on each iteration to check for duplicates
    dtOut <- data.table(
      row1 = integer(0), col1 = integer(0),
      row2 = integer(0), col2 = integer(0)
    )
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        return(dtOut)
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      dt <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(dt)) {
        dtOut <- unique(rbindlist(list(dtOut, dt)))
        mult <- 1.1*mult*nLeft/(nrow(dtOut) - n + nLeft)
        nLeft <- nLeft - nrow(dtOut)
      } else {
        mult <- mult*2
      }
    }
    dtOut[1:n]
  }
}

(dtSamples1 <- sampleSubMat(m, 10L))
#>     row1 col1 row2 col2
#>  1:    3    6    2   10
#>  2:    3    8    7    5
#>  3:    9    7    5    1
#>  4:    5    8   10    9
#>  5:    7    7    1    8
#>  6:    3    8    9    2
#>  7:    5    1    3    9
#>  8:    5    1    8    5
#>  9:    4    7    1    1
#> 10:   10    6    8    8
(dtSamples2 <- sampleSubMat(m, 10L, TRUE))
#>     row1 col1 row2 col2
#>  1:    6    7    5    8
#>  2:    2   10    3    4
#>  3:    7    7   10    1
#>  4:    5    1    4    9
#>  5:    1    8    9    7
#>  6:   10    1    8    8
#>  7:    8    8    7    6
#>  8:    7   10    3    9
#>  9:    2   10    3    9
#> 10:    2    1    3    6
# timing 1k samples from a random 10k-square matrix with 1M elements
idx <- sample(1e8, 1e6)
m <- new("ngTMatrix", i = as.integer(((idx - 1) %% 1e4)), j = as.integer(((idx - 1) %/% 1e4)), Dim = c(1e4L, 1e4L))
system.time(dtSamples3 <- sampleSubMat(m, 1e3L)) # without replacement
#>    user  system elapsed 
#>    1.08    0.31    1.40
system.time(dtSamples4 <- sampleSubMat(m, 1e3L, TRUE)) # with replacement
#>    user  system elapsed 
#>    0.89    0.32    1.21
Created on 2022-04-29 by the reprex package (v2.0.1)

One could enumerate all possible pairwise combinations of elements containing the value 1, then eliminate pairs that share a row or column and pairs that would not result in 0 elements for the submatrix main diagonal. The resulting rows and columns from each remaining pair would define all possible submatrices that meet the desired pattern and these would be trivial to sample. This is feasible for matrices with a relatively small number of 1 elements (e.g., <100K--depending on available memory).

For sparse matrices with a large number of 1 elements, a straightforward way to get an efficient, vectorized solution is also with rejection: sample pairs of 1 elements for each submatrix's antidiagonal and reject if the corresponding main diagonal elements are not 0. The below solution assumes more elements are 0 than 1. (If the opposite is true, it should be modified to sample two 0 elements for the main diagonal and reject if the antidiagonal elements are not 1.) The rejection rate will depend mainly on the density of the sparse matrix. The example matrix is rather dense, so it has a relatively high rejection rate.

library(data.table)
library(Matrix)

set.seed(1)
m <- matrix(rbinom(100,1,0.5),10,10)
n <- 20L # sample 20 pairs (before rejection)
m <- as(m, "ngTMatrix")
mIdx <- matrix(sample(length(m@i), 2L*n, TRUE), ncol = 2)
(data.table(
  row1 = m@i[mIdx[,1]],
  col1 = m@j[mIdx[,2]],
  row2 = m@i[mIdx[,2]],
  col2 = m@j[mIdx[,1]]
) + 1L)[
  row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), n)] + m[matrix(c(row2, col2), n)])
]
#>    row1 col1 row2 col2
#> 1:    1    4    2    7
#> 2:   10    6    2   10
#> 3:    7    7    8    9

Here it is implemented as a function that returns a specified number of samples either with or without replacement.

sampleSubMat <- function(m, n, replace = FALSE, maxIter = 10L) {
  # convert m to a sparse matrix in triplet format if it's not already
  if (!grepl("TMatrix", class(m))) m <- as(1*m, "dgTMatrix")
  nLeft <- n
  # over-sample based on dimensions and density of the matrix
  mult <- 1.1/(1 - length(m@i)/prod(dim(m)))^2/prod(1 - 1/(dim(m - 1)))
  iter <- 1L
  
  if (replace) { # sampling with replacement (duplicates allowed)
    # more efficient to store individual data.table objects from each
    # iteration in a list, then rbindlist at the end
    lDT <- vector("list", maxIter)
    
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        # print("Max iterations reached")
        return(rbindlist(lDT[1:(iter - 1L)])[1:n])
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      lDT[[iter]] <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(lDT[[iter]])) {
        mult <- 1.1*mult*nLeft/nrow(lDT[[iter]])
        nLeft <- nLeft - nrow(lDT[[iter]])
        iter <- iter + 1L
      } else {
        # no pattern found, double the samples for the next iteration
        mult <- mult*2
      }
    }
    rbindlist(lDT[1:(iter - 1L)])[1:n]
  } else { # sampling without replacement (no duplicates allowed)
    # rbindlist on each iteration to check for duplicates
    dtOut <- data.table(
      row1 = integer(0), col1 = integer(0),
      row2 = integer(0), col2 = integer(0)
    )
    while (nLeft > 0L) {
      if (iter > maxIter) {
        message(sprintf("Max iterations (%i) reached", maxIter))
        return(dtOut)
      }
      nCurr <- ceiling(nLeft*mult)
      mIdx <- matrix(sample(length(m@i), 2L*nCurr, TRUE), ncol = 2)
      dt <- (data.table(
        row1 = m@i[mIdx[,1]],
        col1 = m@j[mIdx[,2]],
        row2 = m@i[mIdx[,2]],
        col2 = m@j[mIdx[,1]]
      ) + 1L)[
        row1 != row2 & col1 != col2 & !(m[matrix(c(row1, col1), nCurr)] + m[matrix(c(row2, col2), nCurr)])
      ]
      if (nrow(dt)) {
        dtOut <- unique(rbindlist(list(dtOut, dt)))
        mult <- 1.1*mult*nLeft/(nrow(dtOut) - n + nLeft)
        nLeft <- nLeft - nrow(dtOut)
      } else {
        mult <- mult*2
      }
    }
    dtOut[1:n]
  }
}

(dtSamples1 <- sampleSubMat(m, 10L))
#>     row1 col1 row2 col2
#>  1:    3    6    2   10
#>  2:    3    8    7    5
#>  3:    9    7    5    1
#>  4:    5    8   10    9
#>  5:    7    7    1    8
#>  6:    3    8    9    2
#>  7:    5    1    3    9
#>  8:    5    1    8    5
#>  9:    4    7    1    1
#> 10:   10    6    8    8
(dtSamples2 <- sampleSubMat(m, 10L, TRUE))
#>     row1 col1 row2 col2
#>  1:    6    7    5    8
#>  2:    2   10    3    4
#>  3:    7    7   10    1
#>  4:    5    1    4    9
#>  5:    1    8    9    7
#>  6:   10    1    8    8
#>  7:    8    8    7    6
#>  8:    7   10    3    9
#>  9:    2   10    3    9
#> 10:    2    1    3    6
# timing 1k samples from a random 10k-square matrix with 1M elements
idx <- sample(1e8, 1e6)
m <- new("ngTMatrix", i = as.integer(((idx - 1) %% 1e4)), j = as.integer(((idx - 1) %/% 1e4)), Dim = c(1e4L, 1e4L))
system.time(dtSamples3 <- sampleSubMat(m, 1e3L)) # without replacement
#>    user  system elapsed 
#>    1.08    0.31    1.40
system.time(dtSamples4 <- sampleSubMat(m, 1e3L, TRUE)) # with replacement
#>    user  system elapsed 
#>    0.89    0.32    1.21
Created on 2022-04-29 by the reprex package (v2.0.1)

回复收藏 0 原文

~没有更多了~