r dplyr根据重叠的日期时间间隔加入表

发布于 2025-01-21 01:57:05 字数 3266 浏览 0 评论 0原文

我有两个表,label 其中包含 label 时间间隔(开始和结束),以及 items 每个表都有唯一的时间戳项目

我想根据时间戳来映射items。如果项目时间戳落在标签的时间间隔内,则该项目属于该标签

例如,这两个 a123b682位于与label X547<对应的时间间隔内/code>,所以a123b682属于X547

# Label, start - end time
have_label_start_end <- data.frame(label = c('X547', 'X285', 'X290')
                   , time = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 58L, sec = 51, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 4L, sec = 54, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 34, tz = "UTC"))) %>% 
  dplyr::mutate(start_time = time, stop_time = lead(time)) %>% dplyr::select(-time) 

> have_label_start_end
  label          start_time           stop_time
1  X547 2022-04-11 09:58:51 2022-04-11 10:04:54
2  X285 2022-04-11 10:04:54 2022-04-11 11:08:34
3  X290 2022-04-11 11:08:34                <NA>


# Item
have_item_time <- data.frame(item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
           , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))

> have_item_time
   item           timestamp
1  a123 2022-04-11 09:59:34
2  b682 2022-04-11 10:03:13
3 c3324 2022-04-11 10:05:17
4 d4343 2022-04-11 11:08:35
5 e5343 2022-04-11 11:10:09


# Map Items to Label
want <- data.frame(label = c('X547', 'X547','X285', 'X290', 'X290'),
                   item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
                   , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))


> want
  label  item           timestamp
1  X547  a123 2022-04-11 09:59:34
2  X547  b682 2022-04-11 10:03:13
3  X285 c3324 2022-04-11 10:05:17
4  X290 d4343 2022-04-11 11:08:35
5  X290 e5343 2022-04-11 11:10:09

I have two tables, the label which contains the label time interval (start and end), and the items which have a unique timestamp for each item.

I want to map the items based on their timestamps. If the item timestamp falls within the label's time interval, therefore the item belongs to that label.

For example these two items a123, b682 are in the time interval which corresponds to label X547, so a123, b682 belongs to X547

# Label, start - end time
have_label_start_end <- data.frame(label = c('X547', 'X285', 'X290')
                   , time = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 58L, sec = 51, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 4L, sec = 54, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 34, tz = "UTC"))) %>% 
  dplyr::mutate(start_time = time, stop_time = lead(time)) %>% dplyr::select(-time) 

> have_label_start_end
  label          start_time           stop_time
1  X547 2022-04-11 09:58:51 2022-04-11 10:04:54
2  X285 2022-04-11 10:04:54 2022-04-11 11:08:34
3  X290 2022-04-11 11:08:34                <NA>


# Item
have_item_time <- data.frame(item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
           , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))

> have_item_time
   item           timestamp
1  a123 2022-04-11 09:59:34
2  b682 2022-04-11 10:03:13
3 c3324 2022-04-11 10:05:17
4 d4343 2022-04-11 11:08:35
5 e5343 2022-04-11 11:10:09


# Map Items to Label
want <- data.frame(label = c('X547', 'X547','X285', 'X290', 'X290'),
                   item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
                   , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))


> want
  label  item           timestamp
1  X547  a123 2022-04-11 09:59:34
2  X547  b682 2022-04-11 10:03:13
3  X285 c3324 2022-04-11 10:05:17
4  X290 d4343 2022-04-11 11:08:35
5  X290 e5343 2022-04-11 11:10:09

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

反目相谮 2025-01-28 01:57:05

以下方法将间隔转换为开始行和结束行,添加 item_time 表,然后填充这些间隔中的标签以填充 item_time 表。

library(tidyverse)
have_label_start_end %>%
  pivot_longer(-label, values_to = "timestamp") %>%
  bind_rows(have_item_time) %>%
  arrange(timestamp) %>%
  fill(label, .direction = "down") %>%
  filter(!is.na(item)) %>%
  select(label, item, timestamp)

这假设每一项都在一个且仅有一个区间内。如果某个项目可能超出任何间隔或超过一个间隔,我们就需要对此进行修改。


另一种方法可能是使用 fuzzyjoin 包,它可以容纳像这样的“非等价”连接(就像 data.tablesqldf 一样) 。一项调整是我们需要用有效的时间戳替换任何 NA stop_times。在这种情况下,我们可以使用 have_item_time 中的最大值。

在这里,我们获取 have_item_time 中的特定时间戳位于(包含)have_label_start_end 中的任何间隔之间的所有匹配项。

library(fuzzyjoin)
have_item_time %>%
  fuzzy_left_join(have_label_start_end %>%
                    replace_na(list(stop_time = max(have_item_time$timestamp))), 
                  by = c("timestamp" = "start_time",
                         "timestamp" = "stop_time"),
                  match_fun = list(`>=`, `<=`))

Here's an approach that converts the intervals to a start and end row, adds the item_time table, and then fills the labels from those intervals to populate the item_time table.

library(tidyverse)
have_label_start_end %>%
  pivot_longer(-label, values_to = "timestamp") %>%
  bind_rows(have_item_time) %>%
  arrange(timestamp) %>%
  fill(label, .direction = "down") %>%
  filter(!is.na(item)) %>%
  select(label, item, timestamp)

This assumes every item is within one and only one interval. If it's possible that an item could be outside of any intervals, or in more than one, we'd need to revise this.


Another approach could be to use the fuzzyjoin package, which accommodates "non-equi" joins like this (as do data.table and sqldf). One tweak is that we need to replace any NA stop_times with a valid timestamp. In this case we can use the max from have_item_time.

Here, we get all the matches where the specific timestamp in have_item_time is between (inclusive) any of the intervals in have_label_start_end.

library(fuzzyjoin)
have_item_time %>%
  fuzzy_left_join(have_label_start_end %>%
                    replace_na(list(stop_time = max(have_item_time$timestamp))), 
                  by = c("timestamp" = "start_time",
                         "timestamp" = "stop_time"),
                  match_fun = list(`>=`, `<=`))
小瓶盖 2025-01-28 01:57:05

我们可以使用{powerjoin}:

library(powerjoin)
power_left_join(
  have_item_time, have_label_start_end, 
  by = ~.x$timestamp > .y$start_time & 
    (.x$timestamp < .y$stop_time | is.na(.y$stop_time)),
  keep = "left")
#>    item           timestamp label
#> 1  a123 2022-04-11 09:59:34  X547
#> 2  b682 2022-04-11 10:03:13  X547
#> 3 c3324 2022-04-11 10:05:17  X285
#> 4 d4343 2022-04-11 11:08:35  X290
#> 5 e5343 2022-04-11 11:10:09  X290

We can use {powerjoin} :

library(powerjoin)
power_left_join(
  have_item_time, have_label_start_end, 
  by = ~.x$timestamp > .y$start_time & 
    (.x$timestamp < .y$stop_time | is.na(.y$stop_time)),
  keep = "left")
#>    item           timestamp label
#> 1  a123 2022-04-11 09:59:34  X547
#> 2  b682 2022-04-11 10:03:13  X547
#> 3 c3324 2022-04-11 10:05:17  X285
#> 4 d4343 2022-04-11 11:08:35  X290
#> 5 e5343 2022-04-11 11:10:09  X290
罪歌 2025-01-28 01:57:05

现在,只有DPLYR才能实现这一目标,使用join_by()函数代替参数,

获得所需的结果:

want <- have_item_time |> 
        inner_join(
            have_label_start_end |> 
                mutate(stop_time = if_else(is.na(stop_time), Sys.Date(), stop_time)), 
            by = join_by(between(x=timestamp, y_lower=start_time, y_upper=stop_time, bounds = "[]"))
            )

It's now possible with only dplyr to achieve that, using the join_by() function in place of the by argument

To get the desired result :

want <- have_item_time |> 
        inner_join(
            have_label_start_end |> 
                mutate(stop_time = if_else(is.na(stop_time), Sys.Date(), stop_time)), 
            by = join_by(between(x=timestamp, y_lower=start_time, y_upper=stop_time, bounds = "[]"))
            )
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文