运行随机森林循环，预测未采样的位置并保存输出

发布于 2025-01-19 11:13:53 字数 6995 浏览 1 评论 0原文

我想经营一个随机的森林100次，节省火车并测试AUC和OOB分数，预测为空间范围建模，保存平均值，标准误差和每个单元的变化系数，然后写入栅格，如果有人知道如何知道如何添加TSS分数。抱歉，有代码的前半部分缺少代码，零散的代码和错误，但是从随机的森林模型开始有问题，任何帮助都将不胜感激。

Packages:
require(raster);require(dismo);require(sp); 
require(pROC);require(FinCal);require(ggplot2);require(randomForest);require (ROCR) 
require(tidyverse); require(dplyr)

n.boot <- 100
length.map <- nrow(predictor_variables)
boot_mat <- array(0, c(length.map,n.boot)) # matrix where rows will store predictions for each 
bootstrap (columns)
influence_mat <- array(0,c(21,n.boot)) # for saving information on predictors 
auc_mat <- array(0,c(n.boot, 4)) #to save the RF training and test AUC and OOB. 

for (i in 1:n.boot) # bootstrap loop
{ # create training and evaluation presence/ relative absence dataframes
train_ind <- sample(seq_len(nrow(sampled)), size = floor(0.75 * nrow(sampled))) # index of 
rows for 75% of presence data
# creat training and evaluation presences 
sampled _train <- sampled [train_ind, ]
sampled _eval <- sampled [-train_ind, ]

#creation of absences
rnd <- sort(sample(seq(1,length.abs),nrow(sampled _train)*2)) 
rnd.ev <- sort(sample(seq(1,length.abs),nrow(sampled _eval)*2)) 
absence.rnd <- absence[rnd,] 
absence.rnd.ev <- absence[rnd.ev,]

# creation of presence and absence
sampled.PresAbs.B <- rbind(sampled _train,absence.rnd) # final presence/absence data
sampled_eval <- rbind(sampled_eval,absence.rnd.ev)

training<- randomForest(response~., sampled.PresAbs.B [,c(5:25)], ntree=1000, mtry = 2, 
importance = TRUE) # column number of predictor variables to be used in RF
# need to save influence/importance of predictors on response for each run.
# need to save training (training) and test AUC for each run.
# need to save OOB error rate too for training and test runs if possible.

# predict spatially to map 
pred_map <-  predict(predictor_variables, training, type = "prob", index = 2) 
boot_mat[,i] <- round(pred.map, digits = 2) 
}

save(boot_mat, file=" sampled_boot_mat.Rdata")
save(auc_mat, file=" sampled_dev_mat.Rdata")
save(influence_mat, file=" sampled_inf_mat.Rdata")

# need to calculate mean and standard error of influence of predictors on response, then write 
to csv.

# calculate mean suitability and CV spatially
sampled.boot.mean<-apply(boot_mat,1,mean)
sampled.boot.sd<-apply(boot_mat,1,sd)
sampled.boot.cv<- sampled.boot.sd/sampled.boot.mean # calculation of Coefficient of variation if needed

# export the maps
sampled.map.mean <- cbind(sna_all_preds[, c("X", "Y")], sampled.boot.mean) # mean prediciton
sampled.map.UC <- cbind(sna_all_preds[, c("X", "Y")], sampled.boot.sd) # uncertainty 
sampled.map.CV <-cbind(sna_all_preds[,c("X", "Y")], sampled.boot.cv) # CV
#can then export as rasters easily enough.

例如数据：

dput(head(sna_all_boffffs_and_preds, 10))
structure(list(site = 1:10, X = c(708396.365970067, 708646.365970067, 
708896.365970067, 709146.365970067, 708396.365970067, 708646.365970067, 
708896.365970067, 709146.365970067, 708396.365970067, 708646.365970067
), Y = c(1243944.65711016, 1243944.65711016, 1243944.65711016, 
1243944.65711016, 1243694.65711016, 1243694.65711016, 1243694.65711016, 
1243694.65711016, 1243444.65711016, 1243444.65711016), sna_boffffs_clip = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), adet_2010_2019_250_clip = c(-14.4839498108682, 
-14.4839498108682, -14.4839498108682, -14.4839498108682, -14.4839498108682, 
-14.4839498108682, -14.4839498108682, -14.4839498108682, -14.4839498108682, 
-14.4839498108682), bathy_250_slope_clip = c(3.62498998641968, 
1.90744984149933, 4.96259355545044, 8.03348731994629, 2.80171227455139, 
2.33035254478455, 5.86305713653564, 8.246413230896, 5.99569511413574, 
6.80374336242676), bathy_250_vrm_007_clip = c(0.00215762853622437, 
0.00210392475128174, 0.00183302164077759, 0.00150519609451294, 
0.00216823816299438, 0.00220978260040283, 0.0020909309387207, 
0.00188064575195312, 0.00211155414581299, 0.00220668315887451
), btr_100km_clip = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), btr_10km_clip = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), btr_50km_clip = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0), carbonate_250_clip = c(30.7859439849854, 30.7859439849854, 
30.7859439849854, 30.7859439849854, 30.7859439849854, 30.7859439849854, 
30.7859439849854, 30.7859439849854, 30.7859439849854, 30.7859439849854
), Catch_kg_per_km2_AllMethods_250_clip = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), current_speed_250_clip = c(0.17399999499321, 0.17399999499321, 
0.17399999499321, 0.17399999499321, 0.17399999499321, 0.17399999499321, 
0.17399999499321, 0.17399999499321, 0.17399999499321, 0.17399999499321
), dist_shore_250_clip = c(67926.890625, 67982.53125, 68039.0546875, 
68096.4375, 67683.0859375, 67738.9296875, 67795.6484375, 67853.2421875, 
67439.328125, 67495.3671875), ebed_20_av_250_clip = c(0.538848269429723, 
0.538848269429723, 0.538848269429723, 0.538848269429723, 0.538848269429723, 
0.538848269429723, 0.538848269429723, 0.538848269429723, 0.538848269429723, 
0.538848269429723), gravel_250_clip = c(19.7878284454346, 19.7878284454346, 
19.7878284454346, 19.7878284454346, 19.7878284454346, 19.7878284454346, 
19.7878284454346, 19.7878284454346, 19.7878284454346, 19.7878284454346
), hvis_20_av_250_clip = c(21.4873270945488, 21.4873270945488, 
21.4873270945488, 21.4873270945488, 21.4873270945488, 21.4873270945488, 
21.4873270945488, 21.4873270945488, 21.4873270945488, 21.4873270945488
), orb_vel_250_clip = c(150L, 150L, 150L, 150L, 150L, 150L, 150L, 
150L, 150L, 150L), rocky_reef_clip = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L), sal_depth_250_clip = c(35.231746673584, 35.231746673584, 
35.231746673584, 35.231746673584, 35.231746673584, 35.231746673584, 
35.231746673584, 35.231746673584, 35.231746673584, 35.231746673584
), sand_250_clip = c(19.3533153533936, 19.3533153533936, 19.3533153533936, 
19.3533153533936, 19.3533153533936, 19.3533153533936, 19.3533153533936, 
19.3533153533936, 19.3533153533936, 19.3533153533936), sst_20_av_250_clip = c(16.6609603188989, 
16.6609603188989, 16.6609603188989, 16.6609603188989, 16.6609603188989, 
16.6609603188989, 16.6609603188989, 16.6609603188989, 16.6609603188989, 
16.6609603188989), sst_annual_amp_250_clip = c(2.67400002479553, 
2.67400002479553, 2.67400002479553, 2.67400002479553, 2.67400002479553, 
2.67400002479553, 2.67400002479553, 2.67400002479553, 2.67400002479553, 
2.67400002479553), sst_grad_250_clip = c(0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459
), temp_res_250_clip = c(4.63328790664673, 4.63328790664673, 
4.63328790664673, 4.63328790664673, 4.63328790664673, 4.63328790664673, 
4.63328790664673, 4.63328790664673, 4.63328790664673, 4.63328790664673
)), row.names = c(NA, 10L), class = "data.frame")

原文

I would like to run a random forest 100 times, save train and test AUC and OOB scores, predict to model spatial extent, save mean, standard error and coefficient of variation per cell, then write to raster, bonus points if anyone knows how to add TSS score. Apologies there is missing code, fragmented code and errors, the first half of the code works, but from the random forest model onwards there are problems, any help would be greatly appreciated.

Packages:
require(raster);require(dismo);require(sp); 
require(pROC);require(FinCal);require(ggplot2);require(randomForest);require (ROCR) 
require(tidyverse); require(dplyr)

n.boot <- 100
length.map <- nrow(predictor_variables)
boot_mat <- array(0, c(length.map,n.boot)) # matrix where rows will store predictions for each 
bootstrap (columns)
influence_mat <- array(0,c(21,n.boot)) # for saving information on predictors 
auc_mat <- array(0,c(n.boot, 4)) #to save the RF training and test AUC and OOB. 

for (i in 1:n.boot) # bootstrap loop
{ # create training and evaluation presence/ relative absence dataframes
train_ind <- sample(seq_len(nrow(sampled)), size = floor(0.75 * nrow(sampled))) # index of 
rows for 75% of presence data
# creat training and evaluation presences 
sampled _train <- sampled [train_ind, ]
sampled _eval <- sampled [-train_ind, ]

#creation of absences
rnd <- sort(sample(seq(1,length.abs),nrow(sampled _train)*2)) 
rnd.ev <- sort(sample(seq(1,length.abs),nrow(sampled _eval)*2)) 
absence.rnd <- absence[rnd,] 
absence.rnd.ev <- absence[rnd.ev,]

# creation of presence and absence
sampled.PresAbs.B <- rbind(sampled _train,absence.rnd) # final presence/absence data
sampled_eval <- rbind(sampled_eval,absence.rnd.ev)

training<- randomForest(response~., sampled.PresAbs.B [,c(5:25)], ntree=1000, mtry = 2, 
importance = TRUE) # column number of predictor variables to be used in RF
# need to save influence/importance of predictors on response for each run.
# need to save training (training) and test AUC for each run.
# need to save OOB error rate too for training and test runs if possible.

# predict spatially to map 
pred_map <-  predict(predictor_variables, training, type = "prob", index = 2) 
boot_mat[,i] <- round(pred.map, digits = 2) 
}

save(boot_mat, file=" sampled_boot_mat.Rdata")
save(auc_mat, file=" sampled_dev_mat.Rdata")
save(influence_mat, file=" sampled_inf_mat.Rdata")

# need to calculate mean and standard error of influence of predictors on response, then write 
to csv.

# calculate mean suitability and CV spatially
sampled.boot.mean<-apply(boot_mat,1,mean)
sampled.boot.sd<-apply(boot_mat,1,sd)
sampled.boot.cv<- sampled.boot.sd/sampled.boot.mean # calculation of Coefficient of variation if needed

# export the maps
sampled.map.mean <- cbind(sna_all_preds[, c("X", "Y")], sampled.boot.mean) # mean prediciton
sampled.map.UC <- cbind(sna_all_preds[, c("X", "Y")], sampled.boot.sd) # uncertainty 
sampled.map.CV <-cbind(sna_all_preds[,c("X", "Y")], sampled.boot.cv) # CV
#can then export as rasters easily enough.

e.g of data:

dput(head(sna_all_boffffs_and_preds, 10))
structure(list(site = 1:10, X = c(708396.365970067, 708646.365970067, 
708896.365970067, 709146.365970067, 708396.365970067, 708646.365970067, 
708896.365970067, 709146.365970067, 708396.365970067, 708646.365970067
), Y = c(1243944.65711016, 1243944.65711016, 1243944.65711016, 
1243944.65711016, 1243694.65711016, 1243694.65711016, 1243694.65711016, 
1243694.65711016, 1243444.65711016, 1243444.65711016), sna_boffffs_clip = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_), adet_2010_2019_250_clip = c(-14.4839498108682, 
-14.4839498108682, -14.4839498108682, -14.4839498108682, -14.4839498108682, 
-14.4839498108682, -14.4839498108682, -14.4839498108682, -14.4839498108682, 
-14.4839498108682), bathy_250_slope_clip = c(3.62498998641968, 
1.90744984149933, 4.96259355545044, 8.03348731994629, 2.80171227455139, 
2.33035254478455, 5.86305713653564, 8.246413230896, 5.99569511413574, 
6.80374336242676), bathy_250_vrm_007_clip = c(0.00215762853622437, 
0.00210392475128174, 0.00183302164077759, 0.00150519609451294, 
0.00216823816299438, 0.00220978260040283, 0.0020909309387207, 
0.00188064575195312, 0.00211155414581299, 0.00220668315887451
), btr_100km_clip = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), btr_10km_clip = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), btr_50km_clip = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0), carbonate_250_clip = c(30.7859439849854, 30.7859439849854, 
30.7859439849854, 30.7859439849854, 30.7859439849854, 30.7859439849854, 
30.7859439849854, 30.7859439849854, 30.7859439849854, 30.7859439849854
), Catch_kg_per_km2_AllMethods_250_clip = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), current_speed_250_clip = c(0.17399999499321, 0.17399999499321, 
0.17399999499321, 0.17399999499321, 0.17399999499321, 0.17399999499321, 
0.17399999499321, 0.17399999499321, 0.17399999499321, 0.17399999499321
), dist_shore_250_clip = c(67926.890625, 67982.53125, 68039.0546875, 
68096.4375, 67683.0859375, 67738.9296875, 67795.6484375, 67853.2421875, 
67439.328125, 67495.3671875), ebed_20_av_250_clip = c(0.538848269429723, 
0.538848269429723, 0.538848269429723, 0.538848269429723, 0.538848269429723, 
0.538848269429723, 0.538848269429723, 0.538848269429723, 0.538848269429723, 
0.538848269429723), gravel_250_clip = c(19.7878284454346, 19.7878284454346, 
19.7878284454346, 19.7878284454346, 19.7878284454346, 19.7878284454346, 
19.7878284454346, 19.7878284454346, 19.7878284454346, 19.7878284454346
), hvis_20_av_250_clip = c(21.4873270945488, 21.4873270945488, 
21.4873270945488, 21.4873270945488, 21.4873270945488, 21.4873270945488, 
21.4873270945488, 21.4873270945488, 21.4873270945488, 21.4873270945488
), orb_vel_250_clip = c(150L, 150L, 150L, 150L, 150L, 150L, 150L, 
150L, 150L, 150L), rocky_reef_clip = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L), sal_depth_250_clip = c(35.231746673584, 35.231746673584, 
35.231746673584, 35.231746673584, 35.231746673584, 35.231746673584, 
35.231746673584, 35.231746673584, 35.231746673584, 35.231746673584
), sand_250_clip = c(19.3533153533936, 19.3533153533936, 19.3533153533936, 
19.3533153533936, 19.3533153533936, 19.3533153533936, 19.3533153533936, 
19.3533153533936, 19.3533153533936, 19.3533153533936), sst_20_av_250_clip = c(16.6609603188989, 
16.6609603188989, 16.6609603188989, 16.6609603188989, 16.6609603188989, 
16.6609603188989, 16.6609603188989, 16.6609603188989, 16.6609603188989, 
16.6609603188989), sst_annual_amp_250_clip = c(2.67400002479553, 
2.67400002479553, 2.67400002479553, 2.67400002479553, 2.67400002479553, 
2.67400002479553, 2.67400002479553, 2.67400002479553, 2.67400002479553, 
2.67400002479553), sst_grad_250_clip = c(0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459, 
0.00596274901181459, 0.00596274901181459, 0.00596274901181459
), temp_res_250_clip = c(4.63328790664673, 4.63328790664673, 
4.63328790664673, 4.63328790664673, 4.63328790664673, 4.63328790664673, 
4.63328790664673, 4.63328790664673, 4.63328790664673, 4.63328790664673
)), row.names = c(NA, 10L), class = "data.frame")

分享到QQ

分享到微博