Time-saving alternatives to for loops for manually sampling from logger timelines in R


So I have sampled a set of lakes at x timepoints throughout the year. I have also deployed loggers etc. in the water, and I want to use averages from these loggers over the window from x days/hours before each visit up to the visit itself. Sometimes I also just grab a point sample at the timepoint of the visit.

Below is my solution. It works just fine, but since I experiment a lot with model assumptions and run sensitivity analyses, it is unsatisfyingly slow.

I seem to have solved most of my R problems with loops, and I regularly come across more efficient scripts, so it would be very interesting to see some more effective alternatives to my code.

The code below just generates some dummy data.

library(dplyr)
library(lubridate)

#function for calculating the % oxygen saturation
do.pct.sat <- function(x,y,z){
  t <- x#water temperature (deg C)
  do <- y#dissolved O2 (mg/L)
  p <- z#barometric pressure (hPa)
  atm <- (p*100)/101325#convert hPa to atm
  do.sat <- atm*exp(-139.34411+157570.1/(t+273.15)-66423080/(t+273.15)^2+12438000000/(t+273.15)^3-862194900000/(t+273.15)^4)
  do.pct.sat <- (do/do.sat)*100
  return(do.pct.sat)
}
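
#quick sanity check with illustrative values (not from the original data):
#15 deg C water, 10 mg/L dissolved O2, 980 hPa pressure; should land roughly near 100%
do.pct.sat(15, 10, 980)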

#here's some dummy data resembling real data
date.initial <- as.POSIXct("2022-06-01")#deployment date
date.end <- as.POSIXct("2022-10-01")#date of retrieval
id <- c("a","b","c")#lake id
lake <- list()#make dataset list for each lake
s <- list()#list of dataframes for the samples from the lake logger timelines

#loop below generates dummy data. this is not part of the real script that I want to improve.
for(i in 1:3){
  datetime <- seq(from = date.initial,to = date.end,by=10*60)#10 minute intervals from deploy to retrieve
  l <- length(datetime)#vector length of datetime
  #set dummy data
  do <- rnorm(l,mean = 10,sd=3)#o2 conc.
  pressure <- rnorm(l,mean = 980,sd=50)#baro pressure
  temp <- rnorm(l,mean=15,sd=5)#water temp
  k.z <- rnorm(l,mean=0.35,sd=0.1)#gas exchange coefficient / mixed layer depth
  dosat.pct <- do.pct.sat(temp,do,pressure)#oxygen sat in %
  
  iso <- data.frame(datetime,do,dosat.pct,temp,pressure,k.z)#assemble a dummy data frame resembling the real data (data.frame(), unlike cbind(), keeps datetime as POSIXct)
  
  lake[[i]] <- iso#save the data frame to the lake logger list
  samples <- as.POSIXct(sample((date.initial+5*24*60*60):date.end, 7, replace=FALSE),origin = "1970-01-01")#draw 7 random visit timepoints, no earlier than 5 days after deployment
  s[[i]] <- as.data.frame(samples)#save it in empty data frame
  s[[i]]$lake <- id[i]
}
names(lake) <- id
samples <- bind_rows(s) 

samples$samples <- round_date(samples$samples,unit="10 minutes")#round the random sample times to the nearest 10 minutes so they match the logger timestamps
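
A quick look at the generated objects:

str(samples)#21 rows: 7 visit timepoints for each of the 3 lakes
head(lake$a)#the 10-minute logger records for lake "a"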

Below is the function that I want to make more efficient (same libraries). I think it runs slowly because I handle one date at a time before moving on to the next:

sample.lakes <- function(average=3){
  dts <- list()#empty list to collect one dataframe per lake
  for(i in 1:length(lake)){
    print(id[i])
    data = lake[[i]]
    y <- samples[grepl(id[i],samples$lake),]

    dates <- y$samples
    #empty vectors to fill with values sampled in the loop
    avg.kz <- vector()
    sd.kz <- vector()
    do.mgl <- vector()
    dosat.pct <- vector()
    temp.c <- vector()
    for (k in 1:length(dates)){
      print(k)
      #filter the logger data to the window from `average` days before the sampling timepoint up to the timepoint itself
      prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))
      #fill the vectors: mean and sd of k.z over the window, point samples of the other variables
      avg.kz[k] = mean(prior.days$k.z)
      sd.kz[k] = sd(prior.days$k.z)
      temp.c[k] <- data[grepl(dates[k],data$datetime),]$temp
      do.mgl[k] <- data[grepl(dates[k],data$datetime),]$do
      dosat.pct[k] <- data[grepl(dates[k],data$datetime),]$dosat.pct
    }
    sd.kz[is.na(sd.kz)] <- 0#a single-row window gives sd = NA

    #add them to data frame y
    y$dosat.pct <- dosat.pct
    y$do.mgl <- do.mgl
    y$temp.c <- temp.c
    y$avg.kz <- avg.kz
    y$sd.kz <- sd.kz

    dts[[i]] <- y#append this lake's sample dataframe to the list
  }
  iso <- bind_rows(dts)#combine into one complete dataframe of samples
  return(iso)
}

iso <- sample.lakes(average=4)#do not set average to > 5 in this example script
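
Since speed is the whole issue, a rough baseline timing of the loop version (base R; numbers will of course vary by machine):

system.time(iso <- sample.lakes(average = 4))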

I would appreciate any suggestions a lot!


Answer from 叶落知秋 (2025-01-21 22:32:37):


My guess is that this part using grepl:

data[grepl(dates[k],data$datetime),]

inside your inner for loop is slow.

Couldn't you instead try just seeing if the datetimes are the same with ==?

In addition, you only need to subset data once.

Try this as an alternative:

for (k in 1:length(dates)){
  print(k)

  prior.days = filter(data, datetime > as.POSIXct(dates[k])-(24*60*60)*average & datetime < as.POSIXct(dates[k]))

  avg.kz[k] = mean(prior.days$k.z)
  sd.kz[k] = sd(prior.days$k.z)

  #subset the logger data once per date, with an exact == comparison
  sub_data <- data[data$datetime == dates[k], ]

  temp.c[k] <- sub_data$temp
  do.mgl[k] <- sub_data$do
  dosat.pct[k] <- sub_data$dosat.pct
}
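
Going one step further, the inner loop can be dropped entirely by expressing both lookups as joins. Here is a minimal sketch of that idea, assuming dplyr >= 1.1 (for join_by() and its non-equi joins) and the lake/samples objects from the question; the intermediate names (logger, point, windows, stats, iso2) are just placeholders I chose:

library(dplyr)

#stack the per-lake logger dataframes into one long table with a lake column
logger <- bind_rows(lake, .id = "lake")

#point samples: exact match on lake id and (rounded) visit timestamp
point <- samples %>%
  left_join(logger, by = join_by(lake, samples == datetime)) %>%
  select(lake, samples, temp.c = temp, do.mgl = do, dosat.pct)

#window statistics: non-equi join over the `average`-day window before each visit
average <- 4
windows <- samples %>% mutate(start = samples - average*24*60*60)
stats <- windows %>%
  left_join(logger, by = join_by(lake, start < datetime, samples > datetime)) %>%
  group_by(lake, samples) %>%
  summarise(avg.kz = mean(k.z),
            sd.kz = coalesce(sd(k.z), 0),#a single-row window gives sd = NA
            .groups = "drop")

iso2 <- left_join(point, stats, by = c("lake", "samples"))

Each join is a single vectorized pass over the logger table, so the per-date work no longer happens in an R-level loop.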