使用突变 - dplyr分裂多列
我正在尝试将多列拆分,但努力进行有效的操作。
我有一个DF:
> dput(df)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=123.00;FIRST=117.00",
"FIRST=92.00", "FIRST=111.00", "BASELINE=125.00;FIRST=113.00",
"FIRST=124.00", "NULL=NULL", "NULL=NULL", "FIRST=77.00", "FIRST=132.00",
"FIRST=123.00", "BASELINE=119.00;FIRST=93.00", "BASELINE=163.00;FIRST=141.00",
"FIRST=137.00", "NULL=NULL", "NULL=NULL", "FIRST=137.00", "FIRST=127.00",
"BASELINE=NULL;FIRST=101.00;SECOND=105.00", "FIRST=120.00", "FIRST=145.00",
"FIRST=126.00", "BASELINE=93.00;FIRST=82.00", "BASELINE=114.00;FIRST=95.00",
"BASELINE=137.00;FIRST=125.00", "BASELINE=157.00;FIRST=112.00",
"NULL=NULL", "FIRST=127.00", "FIRST=147.00", "BASELINE=80.00;FIRST=92.00",
"BASELINE=129.00;FIRST=131.00", "FIRST=108.00", "BASELINE=92.00;FIRST=80.00",
"BASELINE=118.00;FIRST=107.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=89.00",
"NULL=NULL", "BASELINE=NULL;FIRST=101.00", "FIRST=151.00", "FIRST=106.00",
"BASELINE=81.00;FIRST=87.00", "FIRST=94.00;SECOND=85.00", "FIRST=85.00",
"FIRST=118.00;SECOND=116.00", "FIRST=101.00;SECOND=103.00", "FIRST=114.00",
"BASELINE=NULL;FIRST=129.00", "BASELINE=123.00;FIRST=105.00",
"NULL=NULL", "NULL=NULL"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=2.90;FIRST=2.40",
"FIRST=14.80", "FIRST=11.00", "BASELINE=6.90;FIRST=7.30", "FIRST=4.50",
"NULL=NULL", "NULL=NULL", "FIRST=6.10", "FIRST=7.70", "FIRST=16.20",
"BASELINE=7.40;FIRST=5.30", "BASELINE=12.80;FIRST=10.30", "FIRST=14.50",
"NULL=NULL", "NULL=NULL", "FIRST=12.80", "FIRST=3.70", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
"FIRST=16.60", "FIRST=9.30", "FIRST=15.90", "BASELINE=31.00;FIRST=31.70",
"BASELINE=3.70;FIRST=7.00", "BASELINE=14.10;FIRST=7.20", "BASELINE=21.80;FIRST=31.20",
"NULL=NULL", "FIRST=9.20", "FIRST=5.90", "BASELINE=2.20;FIRST=0.50",
"BASELINE=10.50;FIRST=10.60", "FIRST=12.00", "BASELINE=14.70;FIRST=11.80",
"BASELINE=13.40;FIRST=13.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=18.60",
"NULL=NULL", "BASELINE=4.30;FIRST=5.90", "FIRST=7.60", "FIRST=3.10",
"BASELINE=10.10;FIRST=11.40", "FIRST=12.00;SECOND=21.30", "FIRST=9.80",
"FIRST=29.50;SECOND=4.80", "FIRST=4.60;SECOND=9.90", "FIRST=18.90",
"BASELINE=15.00;FIRST=13.00", "BASELINE=12.80;FIRST=10.60", "NULL=NULL",
"NULL=NULL")), row.names = c(NA, -50L), class = c("tbl_df", "tbl",
"data.frame"))
我想要这个输出:
dput(output)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), Haemoglobin_BASELINE = c("123.00", NA, NA, "125.00", NA, NA,
NA, NA, NA, NA, "119.00", "163.00", NA, NA, NA, NA, NA, "NULL",
NA, NA, NA, "93.00", "114.00", "137.00", "157.00", NA, NA, NA,
"80.00", "129.00", NA, "92.00", "118.00", NA, NA, "NULL", NA,
"NULL", NA, NA, "81.00", NA, NA, NA, NA, NA, "NULL", "123.00",
NA, NA), Haemoglobin_FIRST = c("117.00", "92.00", "111.00", "113.00",
"124.00", NA, NA, "77.00", "132.00", "123.00", "93.00", "141.00",
"137.00", NA, NA, "137.00", "127.00", "101.00", "120.00", "145.00",
"126.00", "82.00", "95.00", "125.00", "112.00", NA, "127.00",
"147.00", "92.00", "131.00", "108.00", "80.00", "107.00", NA,
NA, "89.00", NA, "101.00", "151.00", "106.00", "87.00", "94.00",
"85.00", "118.00", "101.00", "114.00", "129.00", "105.00", NA,
NA), Haemoglobin_SECOND = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "105.00", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "85.00", NA, "116.00", "103.00", NA, NA, NA, NA, NA),
WBC_BASELINE = c("2.90", NA, NA, "6.90", NA, NA, NA, NA,
NA, NA, "7.40", "12.80", NA, NA, NA, NA, NA, "4.20", NA,
NA, NA, "31.00", "3.70", "14.10", "21.80", NA, NA, NA, "2.20",
"10.50", NA, "14.70", "13.40", NA, NA, "NULL", NA, "4.30",
NA, NA, "10.10", NA, NA, NA, NA, NA, "15.00", "12.80", NA,
NA), WBC_FIRST = c("2.40", "14.80", "11.00", "7.30", "4.50",
NA, NA, "6.10", "7.70", "16.20", "5.30", "10.30", "14.50",
NA, NA, "12.80", "3.70", "4.70", "16.60", "9.30", "15.90",
"31.70", "7.00", "7.20", "31.20", NA, "9.20", "5.90", "0.50",
"10.60", "12.00", "11.80", "13.00", NA, NA, "18.60", NA,
"5.90", "7.60", "3.10", "11.40", "12.00", "9.80", "29.50",
"4.60", "18.90", "13.00", "10.60", NA, NA), WBC_SECOND = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "6.50", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "21.30", NA,
"4.80", "9.90", NA, NA, NA, NA, NA)), row.names = c(NA, -50L
), class = "data.frame")
我可以实现这一目标,但是我的代码效率低下(我在每个血列上这样做):
#haemoglobin
haem_ID <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_1=HAEMOGLOBIN`)
haem_sep <- haem_ID %>%
rename(text = `TIME_POINT_1=HAEMOGLOBIN`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- haem_sep %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("Haemoglobin", colnames(tmp), sep = "_")
tmp2 <- haem_ID %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-Haemoglobin_NULL)
haem <- tmp3
#white cells
a <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_2=WHITE_CELLS`)
b <- a %>%
rename(text = `TIME_POINT_2=WHITE_CELLS`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- b %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("WBC", colnames(tmp), sep = "_")
tmp2 <- a %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-WBC_NULL)
WBC <- tmp3
output <- merge(haem, WBC)
请有人帮助我实现相同但更有效地实现吗?实际上,我进行了20次血液检查,而不仅仅是2次,因此进行硬编码不是理想的。
更新
在这种响应时,我尝试运行建议的代码,但是我遇到了一个错误,但我不知道为什么。
输入数据
structure(list(UNIQUE_PATIENT_ID = c("DIS-1104-1021-E1", "DIS-1107-1010-E1",
"DIS-1124-1108-E1"), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"), `TIME_POINT_3=PLATELETS` = c("BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
"BASELINE=186.00;FIRST=NULL"), `TIME_POINT_4=NEUTROPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"), `TIME_POINT_5=LYMPHOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"), `TIME_POINT_7=EOSINOPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
"BASELINE=6.70;FIRST=NULL"), `TIME_POINT_9=PT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
"BASELINE=13.40;FIRST=NULL"), `TIME_POINT_10=UREA` = c("BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"), `TIME_POINT_11=CREATININE` = c("BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"), `TIME_POINT_12=ALT` = c("BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"), `TIME_POINT_13=BILIRUBIN` = c("BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c("BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"), `TIME_POINT_15=VITAMIN_D` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c("BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"), `TIME_POINT_17=PCT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
代码:
output <- input |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
pivot_wider(names_from = c("name", "timepoint")) |>
select(-ends_with("NULL"))
错误消息:
Error in `fn()`:
! In row 2, can't recycle input of size 5 to size 6.
Run `rlang::last_error()` to see where the error occurred.
谢谢!
I'm trying to split multiple columns, but struggling to do this efficiently.
I have a df:
> dput(df)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=123.00;FIRST=117.00",
"FIRST=92.00", "FIRST=111.00", "BASELINE=125.00;FIRST=113.00",
"FIRST=124.00", "NULL=NULL", "NULL=NULL", "FIRST=77.00", "FIRST=132.00",
"FIRST=123.00", "BASELINE=119.00;FIRST=93.00", "BASELINE=163.00;FIRST=141.00",
"FIRST=137.00", "NULL=NULL", "NULL=NULL", "FIRST=137.00", "FIRST=127.00",
"BASELINE=NULL;FIRST=101.00;SECOND=105.00", "FIRST=120.00", "FIRST=145.00",
"FIRST=126.00", "BASELINE=93.00;FIRST=82.00", "BASELINE=114.00;FIRST=95.00",
"BASELINE=137.00;FIRST=125.00", "BASELINE=157.00;FIRST=112.00",
"NULL=NULL", "FIRST=127.00", "FIRST=147.00", "BASELINE=80.00;FIRST=92.00",
"BASELINE=129.00;FIRST=131.00", "FIRST=108.00", "BASELINE=92.00;FIRST=80.00",
"BASELINE=118.00;FIRST=107.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=89.00",
"NULL=NULL", "BASELINE=NULL;FIRST=101.00", "FIRST=151.00", "FIRST=106.00",
"BASELINE=81.00;FIRST=87.00", "FIRST=94.00;SECOND=85.00", "FIRST=85.00",
"FIRST=118.00;SECOND=116.00", "FIRST=101.00;SECOND=103.00", "FIRST=114.00",
"BASELINE=NULL;FIRST=129.00", "BASELINE=123.00;FIRST=105.00",
"NULL=NULL", "NULL=NULL"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=2.90;FIRST=2.40",
"FIRST=14.80", "FIRST=11.00", "BASELINE=6.90;FIRST=7.30", "FIRST=4.50",
"NULL=NULL", "NULL=NULL", "FIRST=6.10", "FIRST=7.70", "FIRST=16.20",
"BASELINE=7.40;FIRST=5.30", "BASELINE=12.80;FIRST=10.30", "FIRST=14.50",
"NULL=NULL", "NULL=NULL", "FIRST=12.80", "FIRST=3.70", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
"FIRST=16.60", "FIRST=9.30", "FIRST=15.90", "BASELINE=31.00;FIRST=31.70",
"BASELINE=3.70;FIRST=7.00", "BASELINE=14.10;FIRST=7.20", "BASELINE=21.80;FIRST=31.20",
"NULL=NULL", "FIRST=9.20", "FIRST=5.90", "BASELINE=2.20;FIRST=0.50",
"BASELINE=10.50;FIRST=10.60", "FIRST=12.00", "BASELINE=14.70;FIRST=11.80",
"BASELINE=13.40;FIRST=13.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=18.60",
"NULL=NULL", "BASELINE=4.30;FIRST=5.90", "FIRST=7.60", "FIRST=3.10",
"BASELINE=10.10;FIRST=11.40", "FIRST=12.00;SECOND=21.30", "FIRST=9.80",
"FIRST=29.50;SECOND=4.80", "FIRST=4.60;SECOND=9.90", "FIRST=18.90",
"BASELINE=15.00;FIRST=13.00", "BASELINE=12.80;FIRST=10.60", "NULL=NULL",
"NULL=NULL")), row.names = c(NA, -50L), class = c("tbl_df", "tbl",
"data.frame"))
I want this output:
dput(output)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1",
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1",
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1",
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1",
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1",
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1",
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1",
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1",
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1",
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1",
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1",
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1",
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), Haemoglobin_BASELINE = c("123.00", NA, NA, "125.00", NA, NA,
NA, NA, NA, NA, "119.00", "163.00", NA, NA, NA, NA, NA, "NULL",
NA, NA, NA, "93.00", "114.00", "137.00", "157.00", NA, NA, NA,
"80.00", "129.00", NA, "92.00", "118.00", NA, NA, "NULL", NA,
"NULL", NA, NA, "81.00", NA, NA, NA, NA, NA, "NULL", "123.00",
NA, NA), Haemoglobin_FIRST = c("117.00", "92.00", "111.00", "113.00",
"124.00", NA, NA, "77.00", "132.00", "123.00", "93.00", "141.00",
"137.00", NA, NA, "137.00", "127.00", "101.00", "120.00", "145.00",
"126.00", "82.00", "95.00", "125.00", "112.00", NA, "127.00",
"147.00", "92.00", "131.00", "108.00", "80.00", "107.00", NA,
NA, "89.00", NA, "101.00", "151.00", "106.00", "87.00", "94.00",
"85.00", "118.00", "101.00", "114.00", "129.00", "105.00", NA,
NA), Haemoglobin_SECOND = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, "105.00", NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, "85.00", NA, "116.00", "103.00", NA, NA, NA, NA, NA),
WBC_BASELINE = c("2.90", NA, NA, "6.90", NA, NA, NA, NA,
NA, NA, "7.40", "12.80", NA, NA, NA, NA, NA, "4.20", NA,
NA, NA, "31.00", "3.70", "14.10", "21.80", NA, NA, NA, "2.20",
"10.50", NA, "14.70", "13.40", NA, NA, "NULL", NA, "4.30",
NA, NA, "10.10", NA, NA, NA, NA, NA, "15.00", "12.80", NA,
NA), WBC_FIRST = c("2.40", "14.80", "11.00", "7.30", "4.50",
NA, NA, "6.10", "7.70", "16.20", "5.30", "10.30", "14.50",
NA, NA, "12.80", "3.70", "4.70", "16.60", "9.30", "15.90",
"31.70", "7.00", "7.20", "31.20", NA, "9.20", "5.90", "0.50",
"10.60", "12.00", "11.80", "13.00", NA, NA, "18.60", NA,
"5.90", "7.60", "3.10", "11.40", "12.00", "9.80", "29.50",
"4.60", "18.90", "13.00", "10.60", NA, NA), WBC_SECOND = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, "6.50", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "21.30", NA,
"4.80", "9.90", NA, NA, NA, NA, NA)), row.names = c(NA, -50L
), class = "data.frame")
I can achieve this, but my code is inefficient (as I am doing this on each blood column):
#haemoglobin
haem_ID <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_1=HAEMOGLOBIN`)
haem_sep <- haem_ID %>%
rename(text = `TIME_POINT_1=HAEMOGLOBIN`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- haem_sep %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("Haemoglobin", colnames(tmp), sep = "_")
tmp2 <- haem_ID %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-Haemoglobin_NULL)
haem <- tmp3
#white cells
a <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_2=WHITE_CELLS`)
b <- a %>%
rename(text = `TIME_POINT_2=WHITE_CELLS`) %>%
separate_rows(text, sep = ';') %>%
separate(text, c('text', 'value'), sep = '=') %>%
pivot_wider(names_from = text, values_from = value)
tmp <- b %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("WBC", colnames(tmp), sep = "_")
tmp2 <- a %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-WBC_NULL)
WBC <- tmp3
output <- merge(haem, WBC)
Pls can someone help me achieve the same but more effectively please? In reality I have 20 blood tests and not just 2, so hard-coding it is not ideal.
UPDATE
Upon the kind responses, I've tried running the suggested code but I'm getting an error and I can't figure out why.
input data
structure(list(UNIQUE_PATIENT_ID = c("DIS-1104-1021-E1", "DIS-1107-1010-E1",
"DIS-1124-1108-E1"), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"), `TIME_POINT_3=PLATELETS` = c("BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
"BASELINE=186.00;FIRST=NULL"), `TIME_POINT_4=NEUTROPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"), `TIME_POINT_5=LYMPHOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"), `TIME_POINT_7=EOSINOPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00",
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
"BASELINE=6.70;FIRST=NULL"), `TIME_POINT_9=PT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
"BASELINE=13.40;FIRST=NULL"), `TIME_POINT_10=UREA` = c("BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"), `TIME_POINT_11=CREATININE` = c("BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"), `TIME_POINT_12=ALT` = c("BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"), `TIME_POINT_13=BILIRUBIN` = c("BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c("BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"), `TIME_POINT_15=VITAMIN_D` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c("BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"), `TIME_POINT_17=PCT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL",
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))
Code:
output <- input |>
separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |>
pivot_longer(-UNIQUE_PATIENT_ID) |>
separate(value, into = c("timepoint", "value"), sep = "=") |>
mutate(name = str_remove(name, ".*=")) |>
pivot_wider(names_from = c("name", "timepoint")) |>
select(-ends_with("NULL"))
Error message:
Error in `fn()`:
! In row 2, can't recycle input of size 5 to size 6.
Run `rlang::last_error()` to see where the error occurred.
Thank you!
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
在上面的评论和 @carl的实现的基础上,您可以使用
values_fn
直接利用嵌套列表结构,从而可以将您从某些代码中保存。我正在使用first()
从dplyr
获取第一个元素,但是您还可以根据您喜欢的内容来使用其他功能,〜means(as。数字(。))
要采取平均值等。当我们运行数据时,使用pivot_longer
,当我们pivot_longer
之前,没有不同的分离器。Building on my comments above and @Carl's implementation, you you could utilize the nested list structure directly using
values_fn
, which might save you from some lines of code. I am usingfirst()
fromdplyr
to take the first element, but you could as well use other functions depending on what you prefer,~ mean(as.numeric(.))
to take the mean etc. There is no varying number of separators when wepivot_longer
before usingseparate_rows
, when I run the data.最后一个
选择
行是可选的,具体取决于您要订购的列以及是否希望保留零列:使用带有语法列名称的示例数据,即使用“”。可以在干净的r会话中运行以下reprex:
在2022-07-04上由 reprex软件包(v2.0.1)
**更新的示例2数据集更新
以处理变化跨列的分隔符数量,我建议通过列> 迭代的函数。这克服了
axtim_rows
在较宽数据集中的错误,这不是示例数据集中的问题。使用模式匹配,我还仅提取引用“基线”,“第一个”或“第二”的字符串,其中它们与值相关联。
正如@harre所指出的那样,由于某些列包含多个,例如,例如,我随后使用
slice_head
在使用pivot_wider
之前使用了第一个。取而代之的是,您可以使用slice_tail
进行最后一个,也可以根据需求进行均值等。您需要在扩展数据集中进行进一步测试,但希望这会有所帮助!
由
The last
select
line is optional depending on how you want the columns ordered and whether or not you wish to keep the NULL columns:Using the example data with syntactic columns names, i.e. using a "." instead of "="), the following reprex may be run in a clean R session:
Created on 2022-07-04 by the reprex package (v2.0.1)
** UPDATE FOR YOUR WIDER SAMPLE2 DATASET **
To handle the varying number of separators across columns I've suggested a function to
separate_rows
iteratively by column. This overcomes theseparate_rows
error in the wider dataset that wasn't an issue in the sample dataset.Using pattern matching, I've also extracted only strings referencing "BASELINE", "FIRST" or "SECOND" where these are associated with a value.
And as @harre notes, because some columns contain multiple, e.g. FIRSTs, I've subsequently taken the first of these using
slice_head
before usingpivot_wider
. You could instead take the last usingslice_tail
or take the mean etc. depending on the requirement.You'll need to test further on your extended dataset, but hope this helps!
Created on 2022-07-04 by the reprex package (v2.0.1)