如何在数据上并行化明确的循环。表（通过参考）

发布于 2025-01-24 04:16:04 字数 4437 浏览 4 评论 0原文

问题语句

我有此函数访问三个数据表：snp_pairs，zygosity和ae。

我正在迭代snp_pairs，并用两个新列对其进行更新。

数据表zygosity和ae仅读取而根本没有修改。

我想知道在共享这三个数据表的同时是否有一种平行的方式，即不要将它们复制到线程（我认为是什么Eg Parallel :: mclapply（） /代码>会做）。

函数`daeqtl_mapping（）`

daeqtl_mapping_ <- function(snp_pairs, zygosity, ae) {

  # Ensure that the data tables are all keyed.
  # `snp_pairs` keys: first two columns
  data.table::setkeyv(snp_pairs, colnames(snp_pairs)[1:2])
  # `zygosity` keys: first column
  data.table::setkeyv(zygosity, colnames(zygosity)[1])
  # `ae` keys: first column
  data.table::setkeyv(ae, colnames(ae)[1])

  n <- nrow(snp_pairs)
  for(i in seq_len(n)) {

    candidate_snp <- snp_pairs[[i, 'candidate_snp']]
    dae_snp <- snp_pairs[[i, 'dae_snp']]

    if(dae_snp %is_not_in% snp_pairs) {
      datatable::set(snp_pairs, i = i, j = 'pvalue', value = NA)
      datatable::set(snp_pairs, i = i, j = 'case', value = 0L)
      break
    }

    if(dae_snp %is_not_in% ae) {
      datatable::set(snp_pairs, i = i, j = 'pvalue', value = NA)
      datatable::set(snp_pairs, i = i, j = 'case', value = 0L)
      break
    }

    # `csnp_hom`: lgl indicating homozygous samples for the candidate snp
    csnp_hom <- is_hom(dt = zygosity, snp = candidate_snp)

    # `csnp_het`: lgl indicating heterozygous samples for the candidate snp
    csnp_het <- is_het(dt = zygosity, snp = candidate_snp)

    # `dsnp_het`: lgl indicating heterozygous samples for the dae snp
    dsnp_het <- is_het(dt = zygosity, snp = dae_snp)

    # `ae_hom`: dbl with allelic ratios of homozygous samples (candidate snp)
    ae_hom <- as.numeric(ae[dae_snp][, 1 := NULL][, dsnp_het & csnp_hom, with = FALSE])

    # `ae_het`: dbl with allelic ratios of heterozygous samples (candidate snp)
    ae_het <- as.numeric(ae[dae_snp][, 1 := NULL][, dsnp_het & csnp_het, with = FALSE])


    df <- daeqtl_test(ae_hom = ae_hom, ae_het = ae_het)
    for (col in names(df)) data.table::set(snp_pairs, i = i, j = col, value = df[[col]])
  }
}

mwe

请先安装daeqtlr，其中远程>: install_github（“ maialab/daeqtlr”）。

library(daeqtlr)

# Download the RDS files from this google drive folder
# https://drive.google.com/drive/folders/1lefDRPW2W6jBMnwUQIy4VGD9ssks4R-E?usp=sharing

snp_pairs <- data.table::setDT(readRDS(file = '~/dwl/snp_pairs.rds'))
zygosity <- data.table::setDT(readRDS(file = '~/dwl/zygosity.rds'))
ae <- data.table::setDT(readRDS(file = '~/dwl/ae.rds'))

daeqtlr:::daeqtl_mapping_(snp_pairs, zygosity, ae)
snp_pairs
#>          dae_snp candidate_snp chromosome dae_snp_position
#>    1: rs10035235     rs1001420          5         40798542
#>    2: rs10035235     rs1001684          5         40798542
#>    3: rs10035235     rs1002424          5         40798542
#>    4: rs10035235     rs1002922          5         40798542
#>    5: rs10035235    rs10043093          5         40798542
#>   ---                                                     
#> 1482: rs10035418     rs9324855          5        141437664
#> 1483: rs10035418     rs9324858          5        141437664
#> 1484: rs10035418     rs9324859          5        141437664
#> 1485: rs10035418     rs9686896          5        141437664
#> 1486: rs10035418      rs998794          5        141437664
#>       candidate_snnp_position    pvalue case
#>    1:                41000241 0.1155707    4
#>    2:                40810324 0.4341583    4
#>    3:                40767295 0.2739109    4
#>    4:                40386453 0.9556968    4
#>    5:                40324118 0.3911479    4
#>   ---                                       
#> 1482:               141693599 0.9098420    4
#> 1483:               141884896 0.8291068    4
#> 1484:               141899785 0.7482426    4
#> 1485:               141731340 0.9556116    4
#> 1486:               141187135 0.6369524    4

^{在2022-04-26创建的 reprex package （v2.0.1）}

原文

Problem statement

I have this function daeqtl_mapping_() that takes in three data tables: snp_pairs, zygosity and ae.

I am iterating over each row of snp_pairs, and updating it with two new columns.

The data tables zygosity and ae are only read and not modified at all.

I'd like to know if there is a way of parallelizing the for-loop while sharing these three data tables, i.e. not copy them over to threads (which is, I think, what e.g. parallel::mclapply() would do).

Function `daeqtl_mapping()`

daeqtl_mapping_ <- function(snp_pairs, zygosity, ae) {

  # Ensure that the data tables are all keyed.
  # `snp_pairs` keys: first two columns
  data.table::setkeyv(snp_pairs, colnames(snp_pairs)[1:2])
  # `zygosity` keys: first column
  data.table::setkeyv(zygosity, colnames(zygosity)[1])
  # `ae` keys: first column
  data.table::setkeyv(ae, colnames(ae)[1])

  n <- nrow(snp_pairs)
  for(i in seq_len(n)) {

    candidate_snp <- snp_pairs[[i, 'candidate_snp']]
    dae_snp <- snp_pairs[[i, 'dae_snp']]

    if(dae_snp %is_not_in% snp_pairs) {
      datatable::set(snp_pairs, i = i, j = 'pvalue', value = NA)
      datatable::set(snp_pairs, i = i, j = 'case', value = 0L)
      break
    }

    if(dae_snp %is_not_in% ae) {
      datatable::set(snp_pairs, i = i, j = 'pvalue', value = NA)
      datatable::set(snp_pairs, i = i, j = 'case', value = 0L)
      break
    }

    # `csnp_hom`: lgl indicating homozygous samples for the candidate snp
    csnp_hom <- is_hom(dt = zygosity, snp = candidate_snp)

    # `csnp_het`: lgl indicating heterozygous samples for the candidate snp
    csnp_het <- is_het(dt = zygosity, snp = candidate_snp)

    # `dsnp_het`: lgl indicating heterozygous samples for the dae snp
    dsnp_het <- is_het(dt = zygosity, snp = dae_snp)

    # `ae_hom`: dbl with allelic ratios of homozygous samples (candidate snp)
    ae_hom <- as.numeric(ae[dae_snp][, 1 := NULL][, dsnp_het & csnp_hom, with = FALSE])

    # `ae_het`: dbl with allelic ratios of heterozygous samples (candidate snp)
    ae_het <- as.numeric(ae[dae_snp][, 1 := NULL][, dsnp_het & csnp_het, with = FALSE])


    df <- daeqtl_test(ae_hom = ae_hom, ae_het = ae_het)
    for (col in names(df)) data.table::set(snp_pairs, i = i, j = col, value = df[[col]])
  }
}

MWE

Please first install daeqtlr with remotes::install_github("maialab/daeqtlr").

library(daeqtlr)

# Download the RDS files from this google drive folder
# https://drive.google.com/drive/folders/1lefDRPW2W6jBMnwUQIy4VGD9ssks4R-E?usp=sharing

snp_pairs <- data.table::setDT(readRDS(file = '~/dwl/snp_pairs.rds'))
zygosity <- data.table::setDT(readRDS(file = '~/dwl/zygosity.rds'))
ae <- data.table::setDT(readRDS(file = '~/dwl/ae.rds'))

daeqtlr:::daeqtl_mapping_(snp_pairs, zygosity, ae)
snp_pairs
#>          dae_snp candidate_snp chromosome dae_snp_position
#>    1: rs10035235     rs1001420          5         40798542
#>    2: rs10035235     rs1001684          5         40798542
#>    3: rs10035235     rs1002424          5         40798542
#>    4: rs10035235     rs1002922          5         40798542
#>    5: rs10035235    rs10043093          5         40798542
#>   ---                                                     
#> 1482: rs10035418     rs9324855          5        141437664
#> 1483: rs10035418     rs9324858          5        141437664
#> 1484: rs10035418     rs9324859          5        141437664
#> 1485: rs10035418     rs9686896          5        141437664
#> 1486: rs10035418      rs998794          5        141437664
#>       candidate_snnp_position    pvalue case
#>    1:                41000241 0.1155707    4
#>    2:                40810324 0.4341583    4
#>    3:                40767295 0.2739109    4
#>    4:                40386453 0.9556968    4
#>    5:                40324118 0.3911479    4
#>   ---                                       
#> 1482:               141693599 0.9098420    4
#> 1483:               141884896 0.8291068    4
#> 1484:               141899785 0.7482426    4
#> 1485:               141731340 0.9556116    4
#> 1486:               141187135 0.6369524    4

^{Created on 2022-04-26 by the reprex package (v2.0.1)}

分享到QQ

分享到微博