Using K-fold cross-validation with KNN
When using KNN to predict price, how do you use K-fold cross-validation?
My current prediction code is:
library("tidyverse")
library("FNN")
library("forecast")
library("caret")
library("stats")
houses = read_csv("data.csv")
# keep listings priced between $100,000 and $1,000,000
houses = subset(houses, price < 1000001)
houses = subset(houses, price > 99999)
houses = subset(houses, price != 0)
houses = houses %>%
  select(-street, -city, -statezip, -country)
# extract the listing year from the date and compute the age when listed
houses = houses %>%
  mutate(date = as.Date(houses$date)) %>%
  mutate(date = format(date, format = "%Y"))
houses = houses %>%
  mutate(date = as.numeric(houses$date)) %>%
  mutate(yr_built = as.numeric(houses$yr_built)) %>%
  mutate(age_when_listed = date - yr_built)
houses = houses %>%
  mutate(age_when_listed = (houses$date - houses$yr_built))
houses = houses %>%
  mutate(renovated = ifelse(yr_renovated > 0, 1, 0))
# means and standard deviations used to z-score (standardize) each predictor
b1 = mean(houses$bedrooms)
b2 = sd(houses$bedrooms)
c1 = mean(houses$bathrooms)
c2 = sd(houses$bathrooms)
e1 = mean(houses$sqft_lot)
e2 = sd(houses$sqft_lot)
f1 = mean(houses$floors)
f2 = sd(houses$floors)
g1 = mean(houses$view)
g2 = sd(houses$view)
h1 = mean(houses$waterfront)
h2 = sd(houses$waterfront)
i1 = mean(houses$condition)
i2 = sd(houses$condition)
j1 = mean(houses$sqft_above)
j2 = sd(houses$sqft_above)
k1 = mean(houses$sqft_basement)
k2 = sd(houses$sqft_basement)
l1 = mean(houses$age_when_listed)
l2 = sd(houses$age_when_listed)
m1 = mean(houses$yr_renovated)
m2 = sd(houses$yr_renovated)
houses = houses %>%
  mutate(bedrooms_norm = (bedrooms - b1) / b2, bathrooms_norm = (bathrooms - c1) / c2,
         sqft_lot_norm = (sqft_lot - e1) / e2, floors_norm = (floors - f1) / f2,
         view_norm = (view - g1) / g2, condition_norm = (condition - i1) / i2,
         sqft_above_norm = (sqft_above - j1) / j2, sqft_basement_norm = (sqft_basement - k1) / k2,
         age_when_listed_norm = (age_when_listed - l1) / l2, waterfront_norm = (waterfront - h1) / h2,
         yr_renovated_norm = (yr_renovated - m1) / m2)
houses_input_norm = houses %>%
  select(bedrooms_norm, bathrooms_norm,
         sqft_lot_norm, floors_norm, view_norm, condition_norm, sqft_above_norm,
         sqft_basement_norm, age_when_listed_norm, waterfront_norm, yr_renovated_norm)
# New sample observation
newdata = as_tibble(list(bedrooms = 4, bathrooms = 3, sqft_lot = 2000, floors = 2, waterfront = 0,
                         view = 2, condition = 3, sqft_above = 3000, sqft_basement = 0,
                         age_when_listed = 20, yr_renovated = 0))
newdata = newdata %>%
  mutate(bedrooms_norm = (bedrooms - b1) / b2, bathrooms_norm = (bathrooms - c1) / c2,
         sqft_lot_norm = (sqft_lot - e1) / e2, floors_norm = (floors - f1) / f2,
         view_norm = (view - g1) / g2, condition_norm = (condition - i1) / i2,
         sqft_above_norm = (sqft_above - j1) / j2, sqft_basement_norm = (sqft_basement - k1) / k2,
         age_when_listed_norm = (age_when_listed - l1) / l2, waterfront_norm = (waterfront - h1) / h2,
         yr_renovated_norm = (yr_renovated - m1) / m2)
newdata_input_norm = newdata %>%
  select(bedrooms_norm, bathrooms_norm,
         sqft_lot_norm, floors_norm, view_norm,
         condition_norm, sqft_above_norm, sqft_basement_norm,
         age_when_listed_norm, waterfront_norm, yr_renovated_norm)
houses_output = houses$price
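(The prediction call itself is not shown above; with FNN it would presumably be something along these lines, where k = 5 is only an arbitrary placeholder:)
# predict the price of the new observation with KNN; k = 5 is a placeholder value
knn_pred = knn.reg(train = as.matrix(houses_input_norm),
                   test = as.matrix(newdata_input_norm),
                   y = houses_output,
                   k = 5)
knn_pred$pred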
Then to cross-validate I used this code (based on a textbook example). Does this only work for cross-validating linear regression, or is it accurate for K-NN as well?
set.seed(30)
houses = houses %>%
  tibble::rowid_to_column("ID")
temp = tibble()
# assign each of the 4202 remaining rows a random fold from 1 to 10
houses = houses %>%
  mutate(fold = sample(1:10, 4202, replace = TRUE))
K = 10
for (obs_num in 1:K)
{
  train = houses %>%
    filter(ID != obs_num)
  validation = houses %>%
    filter(ID == obs_num)
  train.mlr = lm(price ~ bedrooms + bathrooms + sqft_lot + floors +
                   view + condition + sqft_above + sqft_basement + age_when_listed +
                   yr_renovated + waterfront, train)
  validation = validation %>%
    mutate(price_prediction = predict(train.mlr, validation))
  am = accuracy(validation$price_prediction, validation$price)
  temp = temp %>%
    bind_rows(as_tibble(list(run = obs_num, RMSE = am[2], MAPE = am[5])))
  print(paste("iteration", obs_num, "completed"))
}
temp %>%
  summarise(mean_MAPE = mean(MAPE), sd_MAPE = sd(MAPE))
temp %>%
  summarise(mean_RMSE = mean(RMSE), sd_RMSE = sd(RMSE))
Is this code a correct K-fold cross-validation, or do I need to change it? It currently outputs error measures, but I am not sure whether they are right.
Answer:
It doesn't look quite right to me at the moment. Within your cross-validation loop you are assigning to your train and validation tables based on the ID variable. I think if you change this to be based on your fold variable instead, then it should work OK, e.g. with the change sketched below.
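A minimal sketch (it reuses the fold column and K you already create; the loop index is renamed to fold_num and the split is now on fold rather than ID, but the model fit and accuracy lines are unchanged from your code):

temp = tibble()
for (fold_num in 1:K)
{
  # train on every row NOT in this fold, validate on the held-out fold
  train = houses %>%
    filter(fold != fold_num)
  validation = houses %>%
    filter(fold == fold_num)
  train.mlr = lm(price ~ bedrooms + bathrooms + sqft_lot + floors +
                   view + condition + sqft_above + sqft_basement + age_when_listed +
                   yr_renovated + waterfront, train)
  validation = validation %>%
    mutate(price_prediction = predict(train.mlr, validation))
  am = accuracy(validation$price_prediction, validation$price)
  temp = temp %>%
    bind_rows(tibble(run = fold_num, RMSE = am[2], MAPE = am[5]))
}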
It's generally fairly easy to adapt a cross-validation loop like this for different model types. The thing to check is that the predict() function still works the same way. predict is a wrapper for predicting using most model types - sometimes it requires additional input arguments, and sometimes the outputs are in a different format.

Your KNN model is slightly more complicated, as the FNN package doesn't have a simple predict function built into it. It would have to look something like this:
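(Sketch only: knn.reg() comes from FNN, the normalized columns are the ones built in your code, and k = 5 is just a placeholder to tune.)

norm_cols = c("bedrooms_norm", "bathrooms_norm", "sqft_lot_norm", "floors_norm",
              "view_norm", "condition_norm", "sqft_above_norm", "sqft_basement_norm",
              "age_when_listed_norm", "waterfront_norm", "yr_renovated_norm")
temp_knn = tibble()
for (fold_num in 1:K)
{
  train = houses %>%
    filter(fold != fold_num)
  validation = houses %>%
    filter(fold == fold_num)
  # knn.reg() wants plain numeric matrices rather than tibbles
  train.matrix = as.matrix(train[, norm_cols])
  validation.matrix = as.matrix(validation[, norm_cols])
  knn_fit = knn.reg(train = train.matrix,
                    test = validation.matrix,
                    y = train$price,
                    k = 5)
  validation = validation %>%
    mutate(price_prediction = knn_fit$pred)
  am = accuracy(validation$price_prediction, validation$price)
  temp_knn = temp_knn %>%
    bind_rows(tibble(run = fold_num, RMSE = am[2], MAPE = am[5]))
}
temp_knn %>%
  summarise(mean_RMSE = mean(RMSE), sd_RMSE = sd(RMSE),
            mean_MAPE = mean(MAPE), sd_MAPE = sd(MAPE))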
Of course, it's hard to tell if this works without seeing your data. Can you provide a sample of your data, or a reproducible example using a dataset available in R? With the above code you will have to make sure that the formats of train.matrix and validation.matrix match exactly. Also, note that FNN works with numeric data only - if you have any categorical predictors they will have to be one-hot-encoded first.
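For example, if the data did contain a categorical column (the zone column below is invented purely for illustration), model.matrix() can expand it into 0/1 indicator columns before the matrices are built:

# 'zone' is a made-up factor column, used only to illustrate one-hot encoding
example = tibble(zone = factor(c("A", "B", "A")),
                 sqft_lot = c(2000, 5000, 3000))
# removing the intercept gives one 0/1 column per factor level
one_hot = model.matrix(~ zone + sqft_lot - 1, data = example)
one_hot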