如何在 R 中将某一列的值转换为“是”和“否”,以用于多元回归
我正在使用以下可重现的数据集进行多元线性回归(这是我的数据的一小部分样本):
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498,
79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897),
death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male",
"female", "female", "female", "female", "male", "male", "male",
"male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0,
0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029,
4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer",
"Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis",
"Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer",
"COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF",
"Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer",
"Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11,
12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k",
NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26,
26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884,
30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
), race = c("other", "white", "white", "white", "white",
"white", "white", "white", "black", "hispanic"), sps = c(33.8984375,
52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875,
21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19,
30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275,
0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336,
0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852,
0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4,
1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1,
0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic",
"no", "no", "metastatic", "no", "no", "no", "no", "metastatic",
"metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619,
0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0,
0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999,
NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr",
"no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"),
dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97,
43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562,
8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094,
9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101,
120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26,
20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844,
37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98,
231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125,
NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA,
NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA,
NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117,
5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2,
1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132,
139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766,
7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7,
1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up",
"<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)",
"<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7,
1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
), class = "data.frame")
我在这里有回归公式。
# Keep only the rows where both dnrday and sps are non-missing.
SB_xlsx13 <- SB_xlsx13[!is.na(SB_xlsx13$dnrday) & !is.na(SB_xlsx13$sps), ]

# Fit the linear model of hospdead on disease class, age, sex, num.co and sps.
# NOTE(review): dzclass_f is not among the columns of the sample data shown
# above; presumably it is a factor version of dzclass -- confirm.
MLR_2 <- lm(
  SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age +
    SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps
)
summary(MLR_2)
##
## Call:
## lm(formula = SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age +
## SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26132 -0.25758 -0.08914 0.15412 1.14048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3519553 0.0224017 -15.711 < 2e-16
## SB_xlsx13$dzclass_fCancer -0.0870012 0.0123327 -7.055 1.86e-12
## SB_xlsx13$dzclass_fComa 0.2907825 0.0164644 17.661 < 2e-16
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis -0.1378731 0.0104787 -13.157 < 2e-16
## SB_xlsx13$age 0.0027082 0.0002555 10.598 < 2e-16
## SB_xlsx13$sexmale 0.0022789 0.0079126 0.288 0.773
## SB_xlsx13$num.co 0.0028155 0.0032577 0.864 0.387
## SB_xlsx13$sps 0.0184986 0.0004393 42.105 < 2e-16
##
## (Intercept) ***
## SB_xlsx13$dzclass_fCancer ***
## SB_xlsx13$dzclass_fComa ***
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis ***
## SB_xlsx13$age ***
## SB_xlsx13$sexmale
## SB_xlsx13$num.co
## SB_xlsx13$sps ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3724 on 9067 degrees of freedom
## Multiple R-squared: 0.2772, Adjusted R-squared: 0.2767
## F-statistic: 496.8 on 7 and 9067 DF, p-value: < 2.2e-16
回归结果很好;但是,我想再添加一个变量,即第三天的 dnr 状态。如果该值小于或等于 3,则存在 DNR;如果该值大于 3,则不存在 DNR。我之前使用此代码对先前任务的这些值进行了子集化:
# Split the data by whether dnrday is at most 3 or greater than 3.
# subset() has no na.rm argument (the one previously passed was silently
# ignored); rows whose condition evaluates to NA are dropped by subset() itself.
YesDNR <- subset(SB_xlsx12, dnrday <= 3)
NoDNR <- subset(SB_xlsx12, dnrday > 3)
这样做没有问题,但我无法在回归模型中直接使用这些子集。我猜想,要让模型正常工作,需要把“dnrday”列中每个小于或等于 3 (<=) 的值转换为“yes”,把每个大于 3 (>) 的值转换为“no”。我的想法对吗?如果对,该如何修改这些值?
I am doing a multiple linear regression with the following reproducible dataset (this is a small sample of my data):
structure(list(age = c(62.84998, 60.33899, 52.74698, 42.38498,
79.88495, 93.01599, 62.37097, 86.83899, 85.65594, 42.25897),
death = c(0, 1, 1, 1, 0, 1, 1, 1, 1, 1), sex = c("male",
"female", "female", "female", "female", "male", "male", "male",
"male", "female"), hospdead = c(0, 1, 0, 0, 0, 1, 0, 0, 0,
0), slos = c(5, 4, 17, 3, 16, 4, 9, 7, 12, 8), d.time = c(2029,
4, 47, 133, 2029, 4, 659, 142, 63, 370), dzgroup = c("Lung Cancer",
"Cirrhosis", "Cirrhosis", "Lung Cancer", "ARF/MOSF w/Sepsis",
"Coma", "CHF", "CHF", "Lung Cancer", "Colon Cancer"), dzclass = c("Cancer",
"COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer", "ARF/MOSF",
"Coma", "COPD/CHF/Cirrhosis", "COPD/CHF/Cirrhosis", "Cancer",
"Cancer"), num.co = c(0, 2, 2, 2, 1, 1, 1, 3, 2, 0), edu = c(11,
12, 12, 11, NA, 14, 14, NA, 12, 11), income = c("$11-$25k",
"$11-$25k", "under $11k", "under $11k", NA, NA, "$25-$50k",
NA, NA, "$25-$50k"), scoma = c(0, 44, 0, 0, 26, 55, 0, 26,
26, 0), charges = c(9715, 34496, 41094, 3075, 50127, 6884,
30460, 30460, NA, 9914), totcst = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), totmcst = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_
), avtisst = c(7, 29, 13, 7, 18.666656, 5, 8, 6.5, 8.5, 8
), race = c("other", "white", "white", "white", "white",
"white", "white", "white", "black", "hispanic"), sps = c(33.8984375,
52.6953125, 20.5, 20.0976562, 23.5, 19.3984375, 17.296875,
21.5976562, 15.8984375, 2.2998047), aps = c(20, 74, 45, 19,
30, 27, 46, 53, 17, 9), surv2m = c(0.262939453, 0.0009999275,
0.790893555, 0.698974609, 0.634887695, 0.284973145, 0.892944336,
0.670898438, 0.570922852, 0.952880859), surv6m = c(0.0369949341,
0, 0.664916992, 0.411987305, 0.532958984, 0.214996338, 0.820922852,
0.498962402, 0.24899292, 0.887939453), hday = c(1, 3, 4,
1, 3, 1, 1, 1, 1, 1), diabetes = c(0, 0, 0, 0, 0, 0, 0, 1,
0, 0), dementia = c(0, 0, 0, 0, 0, 0, 0, 0, 1, 0), ca = c("metastatic",
"no", "no", "metastatic", "no", "no", "no", "no", "metastatic",
"metastatic"), prg2m = c(0.5, 0, 0.75, 0.899999619, 0.899999619,
0, NA, 0.799999714, 0.049999982, NA), prg6m = c(0.25, 0,
0.5, 0.5, 0.8999996, 0, 0.6999998, 0.3999999, 0.0001249999,
NA), dnr = c("no dnr", NA, "no dnr", "no dnr", "no dnr",
"no dnr", "no dnr", "no dnr", "dnr after sadm", "no dnr"),
dnrday = c(5, NA, 17, 3, 16, 4, 9, 7, 2, 8), meanbp = c(97,
43, 70, 75, 59, 110, 78, 72, 97, 84), wblc = c(6, 17.0976562,
8.5, 9.09960938, 13.5, 10.3984375, 11.6992188, 13.5996094,
9.69921875, 11.2988281), hrt = c(69, 112, 88, 88, 112, 101,
120, 100, 56, 94), resp = c(22, 34, 28, 32, 20, 44, 28, 26,
20, 20), temp = c(36, 34.59375, 37.39844, 35, 37.89844, 38.39844,
37.39844, 37.59375, 36.59375, 38.19531), pafi = c(388, 98,
231.65625, NA, 173.3125, 266.625, 309.5, 404.75, 357.125,
NA), alb = c(1.7998047, NA, NA, NA, NA, NA, 4.7998047, NA,
NA, 4.6992188), bili = c(0.19998169, NA, 2.19970703, NA,
NA, NA, 0.39996338, NA, 0.39996338, 0.19998169), crea = c(1.19995117,
5.5, 2, 0.79992676, 0.79992676, 0.69995117, 1.59985352, 2,
1, 0.79992676), sod = c(141, 132, 134, 139, 143, 140, 132,
139, 143, 139), ph = c(7.459961, 7.25, 7.459961, NA, 7.509766,
7.65918, 7.479492, 7.509766, 7.449219, NA), glucose = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), bun = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), urine = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), adlp = c(7, NA, 1, 0, NA, NA, 0, NA, NA, 0), adls = c(7,
1, 0, 0, 2, 1, 1, 0, 7, NA), sfdm2 = c(NA, "<2 mo. follow-up",
"<2 mo. follow-up", "no(M2 and SIP pres)", "no(M2 and SIP pres)",
"<2 mo. follow-up", "no(M2 and SIP pres)", NA, NA, NA), adlsc = c(7,
1, 0, 0, 2, 1, 1, 0, 7, 0.4947999)), row.names = c(NA, 10L
), class = "data.frame")
I have my formula for the regression here.
# Keep only the rows where both dnrday and sps are non-missing.
SB_xlsx13 <- SB_xlsx13[!is.na(SB_xlsx13$dnrday) & !is.na(SB_xlsx13$sps), ]

# Fit the linear model of hospdead on disease class, age, sex, num.co and sps.
# NOTE(review): dzclass_f is not among the columns of the sample data shown
# above; presumably it is a factor version of dzclass -- confirm.
MLR_2 <- lm(
  SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age +
    SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps
)
summary(MLR_2)
##
## Call:
## lm(formula = SB_xlsx13$hospdead ~ SB_xlsx13$dzclass_f + SB_xlsx13$age +
## SB_xlsx13$sex + SB_xlsx13$num.co + SB_xlsx13$sps)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26132 -0.25758 -0.08914 0.15412 1.14048
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3519553 0.0224017 -15.711 < 2e-16
## SB_xlsx13$dzclass_fCancer -0.0870012 0.0123327 -7.055 1.86e-12
## SB_xlsx13$dzclass_fComa 0.2907825 0.0164644 17.661 < 2e-16
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis -0.1378731 0.0104787 -13.157 < 2e-16
## SB_xlsx13$age 0.0027082 0.0002555 10.598 < 2e-16
## SB_xlsx13$sexmale 0.0022789 0.0079126 0.288 0.773
## SB_xlsx13$num.co 0.0028155 0.0032577 0.864 0.387
## SB_xlsx13$sps 0.0184986 0.0004393 42.105 < 2e-16
##
## (Intercept) ***
## SB_xlsx13$dzclass_fCancer ***
## SB_xlsx13$dzclass_fComa ***
## SB_xlsx13$dzclass_fCOPD/CHF/Cirrhosis ***
## SB_xlsx13$age ***
## SB_xlsx13$sexmale
## SB_xlsx13$num.co
## SB_xlsx13$sps ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3724 on 9067 degrees of freedom
## Multiple R-squared: 0.2772, Adjusted R-squared: 0.2767
## F-statistic: 496.8 on 7 and 9067 DF, p-value: < 2.2e-16
The regression comes out just fine; however, I want to add one more variable, which is DNR status at day three. If the value of dnrday is 3 or less, there is a DNR, and if the value is over 3, there is not a DNR. I previously subsetted these values for a previous task using this code:
# Split the data by whether dnrday is at most 3 or greater than 3.
# subset() has no na.rm argument (the one previously passed was silently
# ignored); rows whose condition evaluates to NA are dropped by subset() itself.
YesDNR <- subset(SB_xlsx12, dnrday <= 3)
NoDNR <- subset(SB_xlsx12, dnrday > 3)
This worked fine, but I can't really use these subsets in my regression model. I assume that to make the model work, I would need to translate every value of 3 or less (<=) in the "dnrday" column to "yes" and every value over 3 (>) to "no". Am I correct in this thinking, and if so, how would I go about changing those values?
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
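我会新建一列——请参阅下面的两个选项。
(注意:在 `lm()` 中,只要通过 `data =` 参数指定一次数据框,就不必在每次添加协变量时都写 `SB_xlsx13$`!这样输出也更易于阅读。)
Tidyverse 方法,使用 `mutate` 和 `case_when`(此页面缺失了原代码块;以下为示意,列名 `dnr_day3` 仅作举例):
SB_xlsx13 <- dplyr::mutate(SB_xlsx13, dnr_day3 = dplyr::case_when(dnrday <= 3 ~ "yes", dnrday > 3 ~ "no"))
Base R 方法(同样为示意):
SB_xlsx13$dnr_day3 <- ifelse(SB_xlsx13$dnrday <= 3, "yes", "no")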
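I would create a new column - see two options below.
(NB: in `lm()` you don't have to specify `SB_xlsx13$` each time you add a covariate if you pass the data frame once via the `data =` argument! This will make your output easier to read.)
Tidyverse approach, using `mutate` and `case_when` (the original code block is missing from this copy; the following is a sketch, with `dnr_day3` as an illustrative column name):
SB_xlsx13 <- dplyr::mutate(SB_xlsx13, dnr_day3 = dplyr::case_when(dnrday <= 3 ~ "yes", dnrday > 3 ~ "no"))
Base R approach (likewise a sketch):
SB_xlsx13$dnr_day3 <- ifelse(SB_xlsx13$dnrday <= 3, "yes", "no")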
您只需在公式中使用 AsIs 函数 `I()` 即可。还可以使用 `factor` 轻松地将变量转换为因子。
这看起来可能不太整洁,但很适合用来探索数据。一旦对结果满意,就可以使用 `transform` 轻松地把它写入数据。例如,只需使用 `dnrday <= 3` 即可创建布尔型的 DNR 变量。
You may simply use the AsIs function
`I()` in the formula. Also use `factor` to easily factorize variables.
This might look a bit unclean, but it's nice for playing around with the data. Once you are happy with something, you may easily change it in the data using `transform`. For instance, the DNR variable can be created as a boolean just by using `dnrday <= 3`.