Plotting the decision boundary of a multi-class Random Forest model

I am cross-posting from Cross Validated and Data Science Stack Exchange, since I was told my question is code-heavy. I will delete this if the rules disallow it - I don't know what the policy is.

I am using the MNIST dataset with 10 classes (the digits 0 to 9). I am using a compressed version with 49 predictor variables (x1, x2, ..., x49). I have trained a Random Forest model and have created a Test data set, which is a grid, on which I have used the trained model to generate predictions as class probabilities as well as classes. I am trying to generalise the code here, which generates a decision boundary when there are only two outcome classes:
Variation on "How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?"
and here:
https://stats.stackexchange.com/questions/21572/how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-from-elements-o

and here:
Decision boundary plots in ggplot2

I have tried to visualise the boundary using the first 2 predictors (x1 and x2), though predictions have been made with all 49.
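For context, the two-class solutions I link above all build a dense grid over the two plotted predictors and predict on it. With 49 predictors, the analogous recipe (as I understand it) is to vary only x1 and x2 over a grid and hold the other 47 fixed, e.g. at their training medians. A minimal sketch of that recipe for my data - here rf and grid_res are placeholder names, not objects from my code below:

library(randomForest)
grid_res <- 100
## Vary x1 and x2 over their observed ranges
gx <- seq(min(traindat$x1), max(traindat$x1), length.out = grid_res)
gy <- seq(min(traindat$x2), max(traindat$x2), length.out = grid_res)
grid <- expand.grid(x1 = gx, x2 = gy)        # one row per (x1, x2) cell
## Hold the remaining 47 predictors fixed at their training medians
for (v in paste0("x", 3:49)) grid[[v]] <- median(traindat[[v]])
grid$pred <- predict(rf, newdata = grid, type = "response")  # predicted digit per cell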

Here is my code:

## Create a grid of data which is the Test data...
## traindat is the dataset that the model was trained on
data <- traindat
resolution <- 50   # the grid will have 50 rows
## Extract the 49 predictor variables and drop the outcome variable
data <- data[, 2:50]
head(data)
## Get the predictor names in a list
ln <- as.list(names(data))

## For each predictor, lay out `resolution` evenly spaced values between
## its observed min and max
r <- sapply(data, range, na.rm = TRUE)   # 2 x 49 matrix: row 1 = min, row 2 = max
data_mat <- matrix(0, resolution, 49)
for (i in 1:49) {
  data_mat[, i] <- seq(r[1, i], r[2, i], length.out = resolution)
}
data_mat
m <- as.data.frame(data_mat)   # data_mat is already a matrix, so no as.matrix() needed
## Create test data grid

library(dplyr)   # for %>%, relocate(), and the mutate()/bind_rows() calls below
fn <- function(x) seq(min(x) + 1, max(x) + 1, length.out = 50)   # note: shifts each range up by 1
test2 <- as.data.frame(apply(m, 2, fn))
colnames(test2) <- unlist(ln)
## label is a column that should contain the Predicted class labels
test2$label <- "-1"
test2 <- test2 %>%
  relocate(label, .before = x1)
## finalModel is the model obtained from training the Random Forest on traindat
prob <- predict(rf_gridsearch$finalModel, test2, type = "prob")       # class probabilities
cls  <- predict(rf_gridsearch$finalModel, test2, type = "response")   # predicted classes (not used below)
prob2 <- as.data.frame(prob)
head(prob2)

## Create predicted classes 0 to 9 and probabilities for the Test data
outCls <- apply(prob2, 1, which.max) - 1   # columns are digits 0-9, so shift the index down by 1
outCls
outProb <- apply(prob2, 1, max)            # probability of the winning class
outProb
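## (Quick sanity check, my addition: how the grid rows spread over the predicted digits)
table(outCls)
summary(outProb)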
## Data structure for plotting: one copy of the grid per class 0 to 9,
## with prob_cls = 1 on rows where that class is the predicted one

dataf2 <- bind_rows(lapply(0:9, function(k)
  mutate(test2,
         prob = outProb,
         cls = k,
         prob_cls = ifelse(outCls == cls, 1, 0))))
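## (Check, added for clarity: dataf2 stacks one copy of the grid per class,
## so it should have 10 * nrow(test2) rows)
nrow(dataf2) == 10 * nrow(test2)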

## Solution from Stackexchange based on only two outcome classes
library(ggplot2)
library(ggthemes)   # for theme_few()
ggplot() +
  geom_raster(data = dataf2, aes(x = x1, y = x2, fill = cls), interpolate = TRUE) +
  ## note: prob lies in [0, 1], so a break at 1.5 can never produce a contour line
  geom_contour(data = dataf2, aes(x = x1, y = x2, z = prob),
               breaks = c(1.5), colour = "black", size = 1) +
  theme_few() +
  scale_colour_manual(values = cols) +   # cols is defined elsewhere in my script
  labs(colour = "", fill = "") +
  ## note: fill maps cls (0-9) but the limits come from prob
  scale_fill_gradient2(low = "#338cea", mid = "white", high = "#dd7e7e",
                       midpoint = 0.5, limits = range(dataf2$prob)) +
  theme(legend.position = "none")

My output doesn't look right - what does it mean? Also, why does the contour plot have to be based on the predicted probability? What is the idea behind the code to generate a decision boundary for any classifier? What am I doing wrong?
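My guess is that with ten classes the raster fill should be the predicted class as a factor rather than a diverging probability gradient. A minimal sketch of what I mean, reusing test2 and outCls from above - but I don't know whether this is the right generalisation:

library(ggplot2)
plot_df <- data.frame(x1 = test2$x1, x2 = test2$x2, cls = factor(outCls))
ggplot(plot_df, aes(x = x1, y = x2, fill = cls)) +
  geom_raster() +
  labs(fill = "Predicted digit")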
