新数据中的预测变量类型与训练数据的类型不符

发布于 01-21 22:20 字数 1484 浏览 2 评论 0原文

我想在 R 中使用随机森林预测自杀率(log_suicides_per_100k)。我遇到的问题是,当我尝试为某个变量指定一个水平(level)时,会得到如下错误:

Type of predictors in new data do not match that of the training data. 

模型是:

# Fit the random forest used in the question. The call is laid out over
# several lines so that the trailing comments no longer swallow the
# remaining arguments and the closing parenthesis.
# `train` is the asker's training data frame; it is not defined in this snippet.
rf3 <- randomForest(
  log_suicides_per_100k ~
    age + sex + log_gdp_per_capita + log_population + year, # formula
  data = train, # data
  ntree = 500 # number of trees to grow
)

sex 有两个水平:male 和 female。age 有六个水平:"15-24 years"、"25-34 years"、"35-54 years"、"5-14 years"、"55-74 years"、"75+ years"。

# Six sample rows of the data (the layout looks like dput() output;
# presumably taken from the `train` data frame used to fit the model).
# Columns: year (integer), sex (factor with levels "female", "male"),
# age (factor with six age-band levels), and the numeric columns
# log_population, log_suicides_per_100k and log_gdp_per_capita.
# The original integer row names are kept in `row.names`.
structure(list(year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L
), sex = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("female", 
"male"), class = "factor"), age = structure(c(1L, 6L, 3L, 6L, 
2L, 3L), .Label = c("15-24 years", "25-34 years", "35-54 years", 
"5-14 years", "55-74 years", "75+ years"), class = "factor"), 
log_population = c(14.0462476055718, 10.0651811415341, 
13.5550389013841, 
10.2665669441479, 15.5047227728237, 13.4021140795298), 
log_suicides_per_100k = c(2.42657107277504, 
4.03069453514564, 2.38508631450579, 4.15261347034608, 
2.88480071284671, 
0.647103242058539), log_gdp_per_capita = c(7.67786350067821, 
9.13701670755734, 11.1338150021447, 9.65117262392164, 
7.95472333449791, 
8.14177220465645)), row.names = c(7888L, 8465L, 7593L, 8535L, 
25159L, 9656L), class = "data.frame")

我想预测2025年75岁以上男性的自杀率。

# Score one new observation: males aged 75+ in 2025.
# age and sex are passed as plain strings here; this is the call that the
# question reports as failing with the "Type of predictors" error.
prediction <- predict(
  rf3,
  newdata = data.frame(
    age = "75+ years",
    sex = "male",
    log_gdp_per_capita = 13.082,
    log_population = 9.393,
    year = 2025
  )
)

I want to make prediction of suicide rate (log_suicides_per_100k) in R using random forest, the problem I have is that when I try to pick one level of a variable, I get the error:

Type of predictors in new data do not match that of the training data. 

The model is:

# Fit the random forest used in the question. The call is laid out over
# several lines so that the trailing comments no longer swallow the
# remaining arguments and the closing parenthesis.
# `train` is the asker's training data frame; it is not defined in this snippet.
rf3 <- randomForest(
  log_suicides_per_100k ~
    age + sex + log_gdp_per_capita + log_population + year, # formula
  data = train, # data
  ntree = 500 # number of trees to grow
)

sex has two levels: male and female
age has six levels; "15-24 years", "25-34 years", "35-54 years",
"5-14 years", "55-74 years", "75+ years"

# Six sample rows of the data (the layout looks like dput() output;
# presumably taken from the `train` data frame used to fit the model).
# Columns: year (integer), sex (factor with levels "female", "male"),
# age (factor with six age-band levels), and the numeric columns
# log_population, log_suicides_per_100k and log_gdp_per_capita.
# The original integer row names are kept in `row.names`.
structure(list(year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L
), sex = structure(c(2L, 2L, 1L, 2L, 2L, 1L), .Label = c("female", 
"male"), class = "factor"), age = structure(c(1L, 6L, 3L, 6L, 
2L, 3L), .Label = c("15-24 years", "25-34 years", "35-54 years", 
"5-14 years", "55-74 years", "75+ years"), class = "factor"), 
log_population = c(14.0462476055718, 10.0651811415341, 
13.5550389013841, 
10.2665669441479, 15.5047227728237, 13.4021140795298), 
log_suicides_per_100k = c(2.42657107277504, 
4.03069453514564, 2.38508631450579, 4.15261347034608, 
2.88480071284671, 
0.647103242058539), log_gdp_per_capita = c(7.67786350067821, 
9.13701670755734, 11.1338150021447, 9.65117262392164, 
7.95472333449791, 
8.14177220465645)), row.names = c(7888L, 8465L, 7593L, 8535L, 
25159L, 9656L), class = "data.frame")

I want to predict the suicide rate for males in the group age 75+ for the year 2025.

# Score one new observation: males aged 75+ in 2025.
# age and sex are passed as plain strings here; this is the call that the
# question reports as failing with the "Type of predictors" error.
prediction <- predict(
  rf3,
  newdata = data.frame(
    age = "75+ years",
    sex = "male",
    log_gdp_per_capita = 13.082,
    log_population = 9.393,
    year = 2025
  )
)

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

任谁2025-01-28 22:20:18

下面是一段可以运行的代码。由于您没有给出全部代码,它有可能不适用于您的情况。因子(factor)及其水平(levels)在训练数据和测试数据之间必须一致,这是关键所在。这里把训练数据中各因子的水平复制过来,并应用到测试数据的对应列上,使两者一致。

library(randomForest)

# Training data: six observations with the original integer row names.
# sex and age are factors; their full level sets are spelled out explicitly
# (including age levels that do not occur in these six rows).
traindf <- data.frame(
  year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L),
  sex = factor(
    c("male", "male", "female", "male", "male", "female"),
    levels = c("female", "male")
  ),
  age = factor(
    c(
      "15-24 years", "75+ years", "35-54 years",
      "75+ years", "25-34 years", "35-54 years"
    ),
    levels = c(
      "15-24 years", "25-34 years", "35-54 years",
      "5-14 years", "55-74 years", "75+ years"
    )
  ),
  log_population = c(
    14.0462476055718, 10.0651811415341, 13.5550389013841,
    10.2665669441479, 15.5047227728237, 13.4021140795298
  ),
  log_suicides_per_100k = c(
    2.42657107277504, 4.03069453514564, 2.38508631450579,
    4.15261347034608, 2.88480071284671, 0.647103242058539
  ),
  log_gdp_per_capita = c(
    7.67786350067821, 9.13701670755734, 11.1338150021447,
    9.65117262392164, 7.95472333449791, 8.14177220465645
  ),
  row.names = c(7888L, 8465L, 7593L, 8535L, 25159L, 9656L)
)

# Fit the random forest on the training frame; every argument other than
# the formula and the data is left at the package default.
rf3 <- randomForest(
  log_suicides_per_100k ~
    age + sex + log_gdp_per_capita + log_population + year,
  data = traindf
)

# One new observation to score: males aged 75+ in 2025.
# The factor columns are created with the level sets of the training data,
# so their types and levels match what the model was fitted on.
testdf <- data.frame(
  age = factor("75+ years", levels = levels(traindf$age)),
  sex = factor("male", levels = levels(traindf$sex)),
  log_gdp_per_capita = 13.082,
  log_population = 9.393,
  year = 2025
)

prediction <- predict(rf3, newdata = testdf)
prediction

# Output reported in the original answer: 3.200609

Here's some code that works. Because you haven't included all your code, there is a risk that it will not work for you. The factors and their levels need to match between the training data and the test data, so this is the key thing to get right. Here the factor levels from the training data are copied onto the corresponding columns of the test data so that the two match.

library(randomForest)

# Training data: six observations with the original integer row names.
# sex and age are factors; their full level sets are spelled out explicitly
# (including age levels that do not occur in these six rows).
traindf <- data.frame(
  year = c(2001L, 2004L, 2008L, 2010L, 2004L, 2011L),
  sex = factor(
    c("male", "male", "female", "male", "male", "female"),
    levels = c("female", "male")
  ),
  age = factor(
    c(
      "15-24 years", "75+ years", "35-54 years",
      "75+ years", "25-34 years", "35-54 years"
    ),
    levels = c(
      "15-24 years", "25-34 years", "35-54 years",
      "5-14 years", "55-74 years", "75+ years"
    )
  ),
  log_population = c(
    14.0462476055718, 10.0651811415341, 13.5550389013841,
    10.2665669441479, 15.5047227728237, 13.4021140795298
  ),
  log_suicides_per_100k = c(
    2.42657107277504, 4.03069453514564, 2.38508631450579,
    4.15261347034608, 2.88480071284671, 0.647103242058539
  ),
  log_gdp_per_capita = c(
    7.67786350067821, 9.13701670755734, 11.1338150021447,
    9.65117262392164, 7.95472333449791, 8.14177220465645
  ),
  row.names = c(7888L, 8465L, 7593L, 8535L, 25159L, 9656L)
)

# Fit the random forest on the training frame; every argument other than
# the formula and the data is left at the package default.
rf3 <- randomForest(
  log_suicides_per_100k ~
    age + sex + log_gdp_per_capita + log_population + year,
  data = traindf
)

# One new observation to score: males aged 75+ in 2025.
# The factor columns are created with the level sets of the training data,
# so their types and levels match what the model was fitted on.
testdf <- data.frame(
  age = factor("75+ years", levels = levels(traindf$age)),
  sex = factor("male", levels = levels(traindf$sex)),
  log_gdp_per_capita = 13.082,
  log_population = 9.393,
  year = 2025
)

prediction <- predict(rf3, newdata = testdf)
prediction

# Output reported in the original answer: 3.200609
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文