R：分层数据的贝叶斯逻辑回归

发布于 2025-01-08 06:39:12 字数 1525 浏览 0 评论 0原文

这是来自 stats.stackexchange 的转发，我在其中没有得到满意的答复。我有两个数据集，第一个关于学校，第二个列出了每所学校在标准化考试中未通过的学生（强调是故意的）。假数据集可以通过以下方式生成（感谢 Tharen）：

#random school data for 30 schools
schools.num = 30
schools.data = data.frame(school_id=seq(1,schools.num)
                         ,tot_white=sample(100:300,schools.num,TRUE)
                         ,tot_black=sample(100:300,schools.num,TRUE)
                         ,tot_asian=sample(100:300,schools.num,TRUE)
                         ,school_rev=sample(4e6:6e6,schools.num,TRUE)
                         )

#total students in each school
schools.data$tot_students = schools.data$tot_white + schools.data$tot_black + schools.data$tot_asian
#sum of all students all schools
tot_students = sum(schools.data$tot_white, schools.data$tot_black, schools.data$tot_asian)
#generate some random failing students
fail.num = as.integer(tot_students * 0.05)

students = data.frame(student_id=sample(seq(1:tot_students), fail.num, FALSE)
                      ,school_id=sample(1:schools.num, fail.num, TRUE)
                      ,race=sample(c('white', 'black', 'asian'), fail.num, TRUE)
                      )

我正在尝试估计 P(Fail=1 | Student种族、学校收入）。如果我在学生数据集上运行多项式离散选择模型，我显然会估计 P(Race | Fail=1)。我显然必须估计这个的倒数。由于所有信息都可以在两个数据集中使用（P（失败）、P（比赛）、收入），所以我认为没有理由不能做到这一点。但我对如何在 R 中实现感到困惑。任何指针将不胜感激。谢谢。

原文

This is a repost from stats.stackexchange where I did not get a satisfactory response. I have two datasets, the first on schools, and the second lists students in each school who have failed in a standardized test (emphasis intentional). Fake datasets can be generated by (thanks to Tharen):

#random school data for 30 schools
schools.num = 30
schools.data = data.frame(school_id=seq(1,schools.num)
                         ,tot_white=sample(100:300,schools.num,TRUE)
                         ,tot_black=sample(100:300,schools.num,TRUE)
                         ,tot_asian=sample(100:300,schools.num,TRUE)
                         ,school_rev=sample(4e6:6e6,schools.num,TRUE)
                         )

#total students in each school
schools.data$tot_students = schools.data$tot_white + schools.data$tot_black + schools.data$tot_asian
#sum of all students all schools
tot_students = sum(schools.data$tot_white, schools.data$tot_black, schools.data$tot_asian)
#generate some random failing students
fail.num = as.integer(tot_students * 0.05)

students = data.frame(student_id=sample(seq(1:tot_students), fail.num, FALSE)
                      ,school_id=sample(1:schools.num, fail.num, TRUE)
                      ,race=sample(c('white', 'black', 'asian'), fail.num, TRUE)
                      )

I am trying to estimate P(Fail=1 | Student Race, School Revenue). If I run a multinomial discrete choice model on the student dataset, I shall clearly be estimating P(Race | Fail=1). I obviously have to estimate the inverse of this. Since all the pieces of information are available in the two datasets (P(Fail), P(Race), Revenue), I see no reason why this can't be done. But I am stumped as to actually how to implement in R. Any pointer would be much appreciated. Thanks.

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

千笙结 2025-01-15 06:39:12

如果你有一个 data.frame，那就更容易了。

library(reshape2)
library(plyr)
d1 <- ddply(
  students, 
  c("school_id", "race"), 
  summarize,
  fail=length(student_id)
) 
d2 <- with( schools.data, data.frame( 
  school_id = school_id, 
  white = tot_white, 
  black = tot_black, 
  asian = tot_asian, 
  school_rev = school_rev 
) )
d2 <- melt(d2, 
  id.vars=c("school_id", "school_rev"), 
  variable.name="race", 
  value.name="total"
)
d <- merge( d1, d2, by=c("school_id", "race") )
d$pass <- d$total - d$fail

然后您可以查看数据

library(lattice)
xyplot( d$fail / d$total ~ school_rev | race, data=d )

或计算您想要的任何内容。

r <- glm(
  cbind(fail,pass) ~ race + school_rev, 
  data=d, 
  family=binomial() # Logistic regression (not bayesian)
)
summary(r)

（编辑）如果您有关于失败学生的更多信息，
但仅汇总已通过数据的数据，
您可以按如下方式重新创建完整的数据集。

# Unique student_id for the passed students
d3 <- ddply( d, 
  c("school_id", "race"), 
  summarize, student_id=1:pass 
)
d3$student_id <- - seq_len(nrow(d3))
# All students
d3$result <- "pass"
students$result <- "fail"
d3 <- merge( # rather than rbind, in case there are more columns
  d3, students, 
  by=c("student_id", "school_id", "race", "result"), 
  all=TRUE 
)
# Students and schools in a single data.frame
d3 <- merge( d3, schools.data, by="school_id", all=TRUE )
# Check that the results did not change
r <- glm(
  (result=="fail") ~ race + school_rev, 
  data=d3, 
  family=binomial()
)
summary(r)

It will be easier if you have a single data.frame.

library(reshape2)
library(plyr)
d1 <- ddply(
  students, 
  c("school_id", "race"), 
  summarize,
  fail=length(student_id)
) 
d2 <- with( schools.data, data.frame( 
  school_id = school_id, 
  white = tot_white, 
  black = tot_black, 
  asian = tot_asian, 
  school_rev = school_rev 
) )
d2 <- melt(d2, 
  id.vars=c("school_id", "school_rev"), 
  variable.name="race", 
  value.name="total"
)
d <- merge( d1, d2, by=c("school_id", "race") )
d$pass <- d$total - d$fail

You can then look at the data

library(lattice)
xyplot( d$fail / d$total ~ school_rev | race, data=d )

Or compute anything you want.

r <- glm(
  cbind(fail,pass) ~ race + school_rev, 
  data=d, 
  family=binomial() # Logistic regression (not bayesian)
)
summary(r)

(EDIT) If you have more information about the failed students,
but only aggregated data for the passed ones,
you can recreate a complete dataset as follows.

# Unique student_id for the passed students
d3 <- ddply( d, 
  c("school_id", "race"), 
  summarize, student_id=1:pass 
)
d3$student_id <- - seq_len(nrow(d3))
# All students
d3$result <- "pass"
students$result <- "fail"
d3 <- merge( # rather than rbind, in case there are more columns
  d3, students, 
  by=c("student_id", "school_id", "race", "result"), 
  all=TRUE 
)
# Students and schools in a single data.frame
d3 <- merge( d3, schools.data, by="school_id", all=TRUE )
# Check that the results did not change
r <- glm(
  (result=="fail") ~ race + school_rev, 
  data=d3, 
  family=binomial()
)
summary(r)

回复收藏 0 原文

一抹微笑 2025-01-15 06:39:12

您将需要一个包含所有学生信息的数据集。既失败又通过。

schools.num = 30
schools.data = data.frame(school_id=seq(1,schools.num)
                          ,tot_white=sample(100:300,schools.num,TRUE)
                          ,tot_black=sample(100:300,schools.num,TRUE)
                          ,tot_asian=sample(100:300,schools.num,TRUE)
                          ,school_rev=sample(4e6:6e6,schools.num,TRUE)
                          )

library(plyr)
fail_ratio <- 0.05
dataset <- ddply(schools.data, .(school_id, school_rev), function(x){
  data.frame(Fail = rbinom(sum(x$tot_white, x$tot_asian, x$tot_black), size = 1, prob = fail_ratio), Race = c(rep("white", x$tot_white), rep("asian", x$tot_asian), rep("black", x$tot_black)))
})
dataset$Race <- factor(dataset$Race)

然后，您可以使用 lme4 包的 glmer() 来实现频率主义方法。

library(lme4)
glmer(Fail ~ school_rev + Race + (1|school_id), data = dataset, family = binomial)

如果您需要贝叶斯估计，请查看 MCMCglmm 包。

You'll need a dataset with information on all students. Both failed and passed.

schools.num = 30
schools.data = data.frame(school_id=seq(1,schools.num)
                          ,tot_white=sample(100:300,schools.num,TRUE)
                          ,tot_black=sample(100:300,schools.num,TRUE)
                          ,tot_asian=sample(100:300,schools.num,TRUE)
                          ,school_rev=sample(4e6:6e6,schools.num,TRUE)
                          )

library(plyr)
fail_ratio <- 0.05
dataset <- ddply(schools.data, .(school_id, school_rev), function(x){
  data.frame(Fail = rbinom(sum(x$tot_white, x$tot_asian, x$tot_black), size = 1, prob = fail_ratio), Race = c(rep("white", x$tot_white), rep("asian", x$tot_asian), rep("black", x$tot_black)))
})
dataset$Race <- factor(dataset$Race)

Then you can use glmer() for the lme4 package for a frequentist approach.

library(lme4)
glmer(Fail ~ school_rev + Race + (1|school_id), data = dataset, family = binomial)

Have a look at the MCMCglmm package if you need Bayesian estimates.

回复收藏 0 原文

~没有更多了~

关于作者

长不大的小祸害

暂无简介

文章

26 人气

关注发私信

友情链接

文江博客

R：分层数据的贝叶斯逻辑回归

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（2）

关于作者

相关话题

热门标签

推荐作者

佚名

羁客

天天爱笑的徐老师

星

夏日落

隐诗

友情链接

R：分层数据的贝叶斯逻辑回归

如果你对这篇内容有疑问，欢迎到本站社区发帖提问 参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

评论（2）

关于作者

相关话题

热门标签

推荐作者

佚名

羁客

天天爱笑的徐老师

星

夏日落

隐诗

友情链接

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。