按标题作为分隔符分割文本并保存为 R 中的数据框列

发布于 2025-01-09 08:14:45 字数 3310 浏览 0 评论 0原文

我在带有多个标题的 text 列中拥有药物 (df) 及其相关信息(其中两个作为示例提供)。我需要分割文本并将相应的文本放在单独的列中(如 required 数据框中提供的)

# Section headings that may appear inside each drug monograph.
heads <- c("Indications", "Administration")

# Example input: one row per drug, with the whole monograph in `text`.
df <- data.frame(
  drugs = c("acetaminophen", "prednisolone"),
  text = c(
    "Indications1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\nAdministration\nUsually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.",
    "Indications \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\nAdministration\nGeneralDosage depends on the condition of indications and the patient response."
  )
)

# Desired output: one column per heading, holding that section's text.
required <- data.frame(
  drugs = c("acetaminophen", "prednisolone"),
  Indications = c(
    "Pain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.",
    "Treatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases."
  ),
  Administration = c(
    "Usually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.",
    "GeneralDosage depends on the condition of indications and the patient response."
  )
)

我尝试过的

使用 strsplit

这给了我一个列表,但是我没有标题,而且由于并非所有药物都有所有标题,因此这是行不通的。 另外,我不知道如何将其合并到现有的 df 中

library(rebus)

# One pattern per heading: the heading text, a few optional trailing
# characters, one or two newlines, then optional spaces.
head.rx <- sapply(heads, function(x) as.regex(x) %R% free_spacing(any_char(0,3)) %R% newline(1,2)) %R% optional(space(0,3))
# Split the first monograph wherever any heading pattern matches.
# (The original line had an unbalanced extra ")" and used T for TRUE.)
split <- strsplit(df$text[1], or1(head.rx), perl = TRUE)

每个标题的开始和结束

提取之间的文本(抱歉,如果它非常初步......我不太擅长自定义函数)

# Attempted helper: for each heading except the last, extract the text that
# lies between that heading and the next one. Reads `heads` from the
# enclosing environment and returns a named list.
extract_heading <- function(text){
  
  # The "- 1" was meant to skip the last heading, which has no successor.
  # NOTE(review): `1:length(heads)-1` parses as `(1:length(heads)) - 1`,
  # i.e. 0:1 for two headings, not 1:(length(heads) - 1);
  # `seq_len(length(heads) - 1)` is probably what was intended.
  extract.list <- vector(mode = "list", length = length(heads)-1)
  names(extract.list) <- heads[1:length(heads)-1]
 
  for (i in 1:length(heads)-1) {
    
    # Start and end regexes, built so that only the heading lines match.
    start <- as.regex(heads[i]) %R% free_spacing(any_char(0,3)) %R% newline(1,2)
    end <- as.regex(heads[i+1]) %R% free_spacing(any_char(0,3)) %R% newline(1,2)
    
    # The text to extract: from one heading up to (not including) the next.
    # NOTE(review): any_char() presumably compiles to `.`, which does not
    # match "\n" by default, so a section spanning several lines cannot
    # match -- a likely cause of the NA results; confirm in the rebus docs.
    rx <- start %R% free_spacing(any_char(3,5000)) %R% lookahead(end)
    
    # Extract the first match for this heading.
    # NOTE(review): stri_extract_first_regex() belongs to stringi, which is
    # not attached by the library(rebus) call shown with this snippet.
    extract.list[i] <- stri_extract_first_regex(text, rx)
  }
  extract.list
}
  
## Tried to see if it works (it gives me all NAs).
extract_heading(df$text[1])

使用 map 函数

但不知道该怎么做。

# Build one "heading ... next heading" pattern per heading.
# NOTE(review): for the last heading, `heads[which(heads == x) + 1]` indexes
# past the end of `heads` and yields NA.
head.extract <- sapply(heads, function(x) x %R% free_spacing(any_char(3,9000)) %R% heads[which(heads ==x) +1])
# NOTE(review): the third argument of map2() is expected to be a function;
# here it receives the result of an already evaluated
# stri_extract_first_regex() call.
purrr:: map2(df$text[1], head.extract, stri_extract_first_regex(df$text[1], head.extract)) 

我提前感谢您的帮助。

I have a data frame of drugs (df) and their associated information in a text column with a number of headings (two of which are provided as examples). I need to split the text and put each heading's text in a separate column (as shown in the required data frame).

# Section headings that may appear inside each drug monograph.
heads <- c("Indications", "Administration")

# Example input: one row per drug, with the whole monograph in `text`.
df <- data.frame(
  drugs = c("acetaminophen", "prednisolone"),
  text = c(
    "Indications1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\nAdministration\nUsually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.",
    "Indications \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\nAdministration\nGeneralDosage depends on the condition of indications and the patient response."
  )
)

# Desired output: one column per heading, holding that section's text.
required <- data.frame(
  drugs = c("acetaminophen", "prednisolone"),
  Indications = c(
    "Pain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.",
    "Treatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases."
  ),
  Administration = c(
    "Usually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.",
    "GeneralDosage depends on the condition of indications and the patient response."
  )
)

What I've tried

Using strsplit

This gives me a list, but I lose the headings, and because not all drugs have all of the headings this doesn't work.
Also, I don't know how to incorporate it into the existing df.

library(rebus)

# One pattern per heading: the heading text, a few optional trailing
# characters, one or two newlines, then optional spaces.
head.rx <- sapply(heads, function(x) as.regex(x) %R% free_spacing(any_char(0,3)) %R% newline(1,2)) %R% optional(space(0,3))
# Split the first monograph wherever any heading pattern matches.
# (The original line had an unbalanced extra ")" and used T for TRUE.)
split <- strsplit(df$text[1], or1(head.rx), perl = TRUE)

Getting start and end for each heading

To extract the text in between (sorry if it's very preliminary ... I'm not so good at custom functions)

# Attempted helper: for each heading except the last, extract the text that
# lies between that heading and the next one. Reads `heads` from the
# enclosing environment and returns a named list.
extract_heading <- function(text){
  
  # The "- 1" was meant to skip the last heading, which has no successor.
  # NOTE(review): `1:length(heads)-1` parses as `(1:length(heads)) - 1`,
  # i.e. 0:1 for two headings, not 1:(length(heads) - 1);
  # `seq_len(length(heads) - 1)` is probably what was intended.
  extract.list <- vector(mode = "list", length = length(heads)-1)
  names(extract.list) <- heads[1:length(heads)-1]
 
  for (i in 1:length(heads)-1) {
    
    # Start and end regexes, built so that only the heading lines match.
    start <- as.regex(heads[i]) %R% free_spacing(any_char(0,3)) %R% newline(1,2)
    end <- as.regex(heads[i+1]) %R% free_spacing(any_char(0,3)) %R% newline(1,2)
    
    # The text to extract: from one heading up to (not including) the next.
    # NOTE(review): any_char() presumably compiles to `.`, which does not
    # match "\n" by default, so a section spanning several lines cannot
    # match -- a likely cause of the NA results; confirm in the rebus docs.
    rx <- start %R% free_spacing(any_char(3,5000)) %R% lookahead(end)
    
    # Extract the first match for this heading.
    # NOTE(review): stri_extract_first_regex() belongs to stringi, which is
    # not attached by the library(rebus) call shown with this snippet.
    extract.list[i] <- stri_extract_first_regex(text, rx)
  }
  extract.list
}
  
## Tried to see if it works (it gives me all NAs).
extract_heading(df$text[1])

Use the map function

But can't figure out how to do it.

# Build one "heading ... next heading" pattern per heading.
# NOTE(review): for the last heading, `heads[which(heads == x) + 1]` indexes
# past the end of `heads` and yields NA.
head.extract <- sapply(heads, function(x) x %R% free_spacing(any_char(3,9000)) %R% heads[which(heads ==x) +1])
# NOTE(review): the third argument of map2() is expected to be a function;
# here it receives the result of an already evaluated
# stri_extract_first_regex() call.
purrr:: map2(df$text[1], head.extract, stri_extract_first_regex(df$text[1], head.extract)) 

I appreciate your help in advance.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

梦太阳 2025-01-16 08:14:45

那么让我们从主函数和正则表达式开始。我
会使用 stringi 的 stri_extract_all_regex 来实现此目的,不过如果你觉得
stringr::str_extract_all() 更容易,它同样可行。或者,如果你
决定只使用 base R,也可以将 regmatches 与 regexpr 或 gregexpr
一起使用(示例请参阅此处)。

我对正则表达式的建议如下所示:

library(stringi)

# Lazy `+?` stops at the first "Administration" after "Indications"; a greedy
# `+` would run on to the last occurrence in the text.
stri_extract_all_regex(
  df$text,
  "(?<=Indications)[\\s\\S]+?(?=Administration)"
)
## [[1]]
## [1] "1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n"
## 
## [[2]]
## [1] " \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n"

各个部分:

  • (?<=Indications) 是后向查找(lookbehind),意味着它匹配紧跟在
    “Indications” 之后的位置
  • [\\s\\S] 匹配任何字符(. 匹配除 \n 以外的任何字符,
    而这里必须能匹配 \n)
  • +? 表示我们需要至少 1 个匹配 [\\s\\S] 的字符,同时
    使用惰性匹配来获得尽可能短的字符串。
  • (?=Administration) 是前向查找(lookahead),意味着它匹配
    后面紧跟 “Administration” 的位置

这意味着我们提取 “Indications” 和 “Administration” 之间、
且不包括这两个词本身的字符串。

接下来,我们想将其包装在一个函数中以使其更加灵活:

# Extract the text between two delimiters from each element of `str`.
#
# str       character vector to search.
# string_1  regex for the opening delimiter; it is placed inside a
#           lookbehind, so it is not part of the result.
# string_2  regex for the closing delimiter; it is placed inside a
#           lookahead, so it is not part of the result either.
#
# Returns all matches flattened into one character vector. The lazy `+?`
# makes each match stop at the first occurrence of `string_2`.
extract_between <- function(str, string_1, string_2) {
  unlist(stri_extract_all_regex(
    str,  # search the argument rather than the global `df$text`
    paste0("(?<=", string_1, ")[\\s\\S]+?(?=", string_2, ")")
  ))
}

该函数提取 string_1 和 string_2 之间(不包括这两者本身)的所有字符。
如果你愿意,可以试一试。

最后,我们想为每个标题创建一个新列。我用一个
简单的 for 循环来实现。你也可以使用 lapply,也许会更
高效,但我没有测试这是否真的有改进,而且它会
使代码的可读性降低。

# "$" (end of string) acts as the closing delimiter of the last heading.
heads_new <- append(heads, "$")

# Column i holds the text between heading i and the delimiter after it.
for (i in seq_along(heads)) {
  df[[heads_new[i]]] <- extract_between(
    str = df$text,
    string_1 = heads_new[i],
    string_2 = heads_new[i + 1]
  )
}

# for nicer printing
tibble::as_tibble(df)
## # A tibble: 2 × 4
##   drugs         text              Indications           Administration          
##   <chr>         <chr>             <chr>                 <chr>                   
## 1 acetaminophen "Indications1\nP… "1\nPain\nSymptomati… "\nUsually administered…
## 2 prednisolone  "Indications \nT… " \nTreatment of a w… "\nGeneralDosage depend…

这假设标题的顺序正确,并且你事先知道这个
顺序。你也可以改变这种行为:同时把所有标题都用作 string_2,
这样一旦 R 遇到另一个标题,匹配就会停止
(这就是我在上面使用惰性模式 +? 的原因)。不过我认为
这通常会带来更多问题,因为你的标题可能
也出现在文本的其他地方,所以如果可能的话,
我更倾向于第一种方法:

# Close the match at whichever delimiter comes first: any heading or the
# end of the string.
extract_between(
  str = df$text,
  string_1 = heads_new[1],
  string_2 = sprintf("(%s)", paste(heads_new, collapse = "|"))
)
## [1] "1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n"                                                                                                              
## [2] " \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n"

So let’s start with the main function and the regular expression. I
would use stringi’s stri_extract_all_regex for this but
stringr::str_extract_all() would also work if you find that easier. Or
you can use regmatches with regexpr or gregexpr if you are
determined to stay in base R (see here, for
example
).

My suggestion for the regular expression is shown below:

library(stringi)

# Lazy `+?` stops at the first "Administration" after "Indications"; a greedy
# `+` would run on to the last occurrence in the text.
stri_extract_all_regex(
  df$text,
  "(?<=Indications)[\\s\\S]+?(?=Administration)"
)
## [[1]]
## [1] "1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n"
## 
## [[2]]
## [1] " \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n"

The individual parts:

  • (?<=Indications) is a lookbehind meaning it matches the position
    following ‘Indications’
  • [\\s\\S] matches any character (. matches any character except
    \n, which is essential here)
  • +? indicates we want at least 1 character matching [\\s\\S], we
    also use lazy matching to get shorter strings.
  • (?=Administration) is a lookahead, meaning it matches the position
    that is followed by ‘Administration’

This means we extract the string between and not including ‘Indications’
and ‘Administration’.

Next, we want to wrap this in a function to make it more flexible:

# Extract the text between two delimiters from each element of `str`.
#
# str       character vector to search.
# string_1  regex for the opening delimiter; it is placed inside a
#           lookbehind, so it is not part of the result.
# string_2  regex for the closing delimiter; it is placed inside a
#           lookahead, so it is not part of the result either.
#
# Returns all matches flattened into one character vector. The lazy `+?`
# makes each match stop at the first occurrence of `string_2`.
extract_between <- function(str, string_1, string_2) {
  unlist(stri_extract_all_regex(
    str,  # search the argument rather than the global `df$text`
    paste0("(?<=", string_1, ")[\\s\\S]+?(?=", string_2, ")")
  ))
}

The function extracts all characters between but not including
string_1 and string_2. Try it out if you like.

Finally, we want to create a new column for each headline. I use a
simple for loop for this. You could use lapply to maybe make it more
efficient, but I did not test if that would improve anything and it
makes the code less readable.

# The last heading has no successor, so "$" (end of string) closes its match.
heads_new <- c(heads, "$")

# One new column per heading, holding the text up to the next delimiter.
for (i in seq_len(length(heads_new) - 1)) {
  df[[heads_new[i]]] <- extract_between(
    df$text,
    string_1 = heads_new[i],
    string_2 = heads_new[i + 1]
  )
}

# for nicer printing
tibble::as_tibble(df)
## # A tibble: 2 × 4
##   drugs         text              Indications           Administration          
##   <chr>         <chr>             <chr>                 <chr>                   
## 1 acetaminophen "Indications1\nP… "1\nPain\nSymptomati… "\nUsually administered…
## 2 prednisolone  "Indications \nT… " \nTreatment of a w… "\nGeneralDosage depend…

This assumes the headlines are in the correct order and that you know
that order. You can change the behaviour by using all headlines as
string_2 at the same time, so matching stops as soon as R encounters
another headline (this is the reason I use lazy mode, i.e. +?, above).
I would say that will generally produce more issues, as your headlines
might occur elsewhere in the text, so I would prefer the first approach
if possible:

# Close the match at whichever delimiter comes first: any heading or the
# end of the string.
extract_between(
  str = df$text,
  string_1 = heads_new[1],
  string_2 = sprintf("(%s)", paste(heads_new, collapse = "|"))
)
## [1] "1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n"                                                                                                              
## [2] " \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n"
如果没有你 2025-01-16 08:14:45

使用 strsplit 的基本 R 选项

# Split each text at its heading lines and bind the pieces to df as columns.
# Assumes every row contains every heading in `heads`, in that order.
local({
  # Matches a heading plus the rest of its line, e.g. "Indications1\n".
  heading_rx <- sprintf("(%s).*?\\n", paste0(heads, collapse = "|"))
  pieces <- do.call(rbind, strsplit(df$text, split = heading_rx, perl = TRUE))
  # The first piece is the empty string in front of the first heading.
  # drop = FALSE keeps a matrix even when df has a single row.
  sections <- as.data.frame(pieces[, -1, drop = FALSE])
  cbind(df, setNames(sections, heads))
})

给出

          drugs
1 acetaminophen
2  prednisolone

                                                                                                                                                                         text
1                                         Indications1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\nAdministration\nUsually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.
2 Indications \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on 
blood and lymphatic systems in the palliative treatment of various diseases.\nAdministration\nGeneralDosage depends on the condition of indications and the patient response.

                                                     Indications
1                                                                                                               Pain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n
2 Treatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n
                                                                                                                                         Administration
1 Usually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.
2                                                                       GeneralDosage depends on the condition of indications and the patient response.

A base R option using strsplit

# Split each text at its heading lines and bind the pieces to df as columns.
# Assumes every row contains every heading in `heads`, in that order.
local({
  # Matches a heading plus the rest of its line, e.g. "Indications1\n".
  heading_rx <- sprintf("(%s).*?\\n", paste0(heads, collapse = "|"))
  pieces <- do.call(rbind, strsplit(df$text, split = heading_rx, perl = TRUE))
  # The first piece is the empty string in front of the first heading.
  # drop = FALSE keeps a matrix even when df has a single row.
  sections <- as.data.frame(pieces[, -1, drop = FALSE])
  cbind(df, setNames(sections, heads))
})

gives

          drugs
1 acetaminophen
2  prednisolone

                                                                                                                                                                         text
1                                         Indications1\nPain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\nAdministration\nUsually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.
2 Indications \nTreatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on 
blood and lymphatic systems in the palliative treatment of various diseases.\nAdministration\nGeneralDosage depends on the condition of indications and the patient response.

                                                     Indications
1                                                                                                               Pain\nSymptomatic relief of mild to moderate pain.Fever\nReduction of fever.Self-medication to reduce fever in infants, children, and adults.\n
2 Treatment of a wide variety of diseases and conditions; used principally for glucocorticoid effects as an anti-inflammatory and immunosuppressant agent and for its effects on blood and lymphatic systems in the palliative treatment of various diseases.\n
                                                                                                                                         Administration
1 Usually administered orally; may be administered rectally as suppositories in patients who cannot tolerate oral therapy. Also may be administered IV.
2                                                                       GeneralDosage depends on the condition of indications and the patient response.
彼岸花似海 2025-01-16 08:14:45

这里有一个 for 循环选项(猜想不太友好)。

但首先:

需要假设标题名称及其顺序,并且它们不会在内容上重复。

如果是这样:

# Headings, in the order in which they appear in each text.
n <- c("Indications", "Administration")

# Result frame: the drug names plus one (initially NA) column per heading.
df1 <- df["drugs"]
df1[, n] <- NA_character_

# Flatten newlines once so that `.` can span a whole section.
flat <- gsub("\n", " ", df$text)

for (i in seq_along(n)) {
  # A heading may be followed by a short marker (e.g. the footnote digit in
  # "Indications1") before the whitespace that ends its line.
  opening <- paste0(n[i], "\\S{0,3}\\s+")
  # Every section except the last one ends where the next heading begins.
  closing <- if (i < length(n)) paste0("\\s*", n[i + 1], ".*") else ""
  rx <- paste0("^.*?", opening, "(.+?)", closing, "$")

  # Fill only the rows that contain this section; the others stay NA
  # instead of being misaligned by subsetting.
  hit <- grepl(rx, flat, perl = TRUE)
  if (any(hit)) {
    df1[hit, n[i]] <- sub(rx, "\\1", flat[hit], perl = TRUE)
  }
}

A for loop option here (guess not so friendly).

but first:

This assumes that the heading names and their order are known, and that the headings do not reappear in the content.

if so:

# Headings, in the order in which they appear in each text.
n <- c("Indications", "Administration")

# Result frame: the drug names plus one (initially NA) column per heading.
df1 <- df["drugs"]
df1[, n] <- NA_character_

# Flatten newlines once so that `.` can span a whole section.
flat <- gsub("\n", " ", df$text)

for (i in seq_along(n)) {
  # A heading may be followed by a short marker (e.g. the footnote digit in
  # "Indications1") before the whitespace that ends its line.
  opening <- paste0(n[i], "\\S{0,3}\\s+")
  # Every section except the last one ends where the next heading begins.
  closing <- if (i < length(n)) paste0("\\s*", n[i + 1], ".*") else ""
  rx <- paste0("^.*?", opening, "(.+?)", closing, "$")

  # Fill only the rows that contain this section; the others stay NA
  # instead of being misaligned by subsetting.
  hit <- grepl(rx, flat, perl = TRUE)
  if (any(hit)) {
    df1[hit, n[i]] <- sub(rx, "\\1", flat[hit], perl = TRUE)
  }
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文