从不连续的有效范围构造连续区间

发布于 2025-01-18 10:41:01 字数 3170 浏览 1 评论 0原文

鉴于

data have;
    infile datalines missover delimiter="|" dsd;
    input id :$20. (start_date end_date) (:date9.) (attribute_1 attribute_2 attribute_3 attribute_4) ($);
    format start_date end_date date9.;
datalines;
ID1|01MAR2014|31DEC9999|BIG|YES|| 
ID2|01SEP2015|30NOV2020|||TWO|
ID2|01SEP2015|31DEC9999|SMALL|||
ID2|01AUG2021|31DEC9999|||TWO|
ID3|01DEC2014|31MAY2016||YES||
ID3|01DEC2014|29JUN2017||||OK
ID3|01DEC2014|31DEC9999|MEDIUM|||
ID3|31MAR2015|29SEP2017|||ONE|
ID3|30JUN2017|31DEC9999||YES||TBD
ID3|30SEP2017|31DEC9999|||ONE, TWO|
;

我想获得每个ID的连续有效性范围，每个ID都有正确的属性。
所需的输出就是这样：

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID1 | 01MAR2014  | 31DEC9999 | BIG         | YES         |             |             |
| ID2 | 01SEP2015  | 30NOV2020 | SMALL       |             | TWO         |             |
| ID2 | 01DEC2020  | 31JUL2021 | SMALL       |             |             |             |
| ID2 | 01AUG2021  | 31DEC9999 | SMALL       |             | TWO         |             |
| ID3 | 01DEC2014  | 30MAR2015 | MEDIUM      | YES         |             | OK          |
| ID3 | 31MAR2015  | 31MAY2016 | MEDIUM      | YES         | ONE         | OK          |
| ID3 | 01JUN2016  | 29JUN2016 | MEDIUM      |             | ONE         | OK          |
| ID3 | 30JUN2016  | 29SEP2017 | MEDIUM      | YES         | ONE         | TBD         |
| ID3 | 30SEP2017  | 31DEC9999 | MEDIUM      | YES         | ONE, TWO    | TBD         |
+-----+------------+-----------+-------------+-------------+-------------+-------------+

我找到了一种使用连接的方法，但想知道是否存在更好的方法来执行以下操作：

data all_intervals;
    set have(keep= id start_date end_date);
    _start = start_date; output;
    _end = start_date-1; output;
    _end = end_date; output;
    if end_date < '31DEC9999'd then do;
        _start = end_date+1; output;
    end;
run;

proc sql;
    create table all_intervals as
    select distinct t1.id, t1._start, t2._end
    from all_intervals t1, all_intervals t2
    where t1.id = t2.id and t2._end > t1._start
;
quit;

data all_intervals;
set all_intervals;
by id _start;
if first.id or first._start;
run;

proc sql noprint;
    select 'max(t1.'||NAME||') as '||NAME into :attributes separated by ','
    from sashelp.vcolumn
    where libname = "WORK" and memname = "HAVE" and upcase(name) not in ('ID', "_START", "_END")
;
quit;

proc sql;
    create table merge as
    select t2.id, t2._start as start_date, t2._end as end_date, &attributes.
    from have t1 right join all_intervals t2
    on t1.id = t2.id
    and ((t2._start <= t1.start_date <= t2._end)
        or (t2._start <= t1.end_date <=  t2._end)
        or (t2._start >= t1.start_date and t2._end <= t1.end_date))
    group by t2.id, t2._start
    order by t2.id, t2._start
;
quit;

proc sort data=merge out=want nodupkey; by id start_date end_date; run;

上述产生预期的输出。

原文

Given

data have;
    infile datalines missover delimiter="|" dsd;
    input id :$20. (start_date end_date) (:date9.) (attribute_1 attribute_2 attribute_3 attribute_4) ($);
    format start_date end_date date9.;
datalines;
ID1|01MAR2014|31DEC9999|BIG|YES|| 
ID2|01SEP2015|30NOV2020|||TWO|
ID2|01SEP2015|31DEC9999|SMALL|||
ID2|01AUG2021|31DEC9999|||TWO|
ID3|01DEC2014|31MAY2016||YES||
ID3|01DEC2014|29JUN2017||||OK
ID3|01DEC2014|31DEC9999|MEDIUM|||
ID3|31MAR2015|29SEP2017|||ONE|
ID3|30JUN2017|31DEC9999||YES||TBD
ID3|30SEP2017|31DEC9999|||ONE, TWO|
;

I would like to get continuous validity ranges for each id with the correct attributes at each of them.
The desired output would be like this:

+-----+------------+-----------+-------------+-------------+-------------+-------------+
| id  | start_date | end_date  | attribute_1 | attribute_2 | attribute_3 | attribute_4 |
+-----+------------+-----------+-------------+-------------+-------------+-------------+
| ID1 | 01MAR2014  | 31DEC9999 | BIG         | YES         |             |             |
| ID2 | 01SEP2015  | 30NOV2020 | SMALL       |             | TWO         |             |
| ID2 | 01DEC2020  | 31JUL2021 | SMALL       |             |             |             |
| ID2 | 01AUG2021  | 31DEC9999 | SMALL       |             | TWO         |             |
| ID3 | 01DEC2014  | 30MAR2015 | MEDIUM      | YES         |             | OK          |
| ID3 | 31MAR2015  | 31MAY2016 | MEDIUM      | YES         | ONE         | OK          |
| ID3 | 01JUN2016  | 29JUN2016 | MEDIUM      |             | ONE         | OK          |
| ID3 | 30JUN2016  | 29SEP2017 | MEDIUM      | YES         | ONE         | TBD         |
| ID3 | 30SEP2017  | 31DEC9999 | MEDIUM      | YES         | ONE, TWO    | TBD         |
+-----+------------+-----------+-------------+-------------+-------------+-------------+

I found a way of doing it using joins, but would like to know if there exist a better way of doing the following:

data all_intervals;
    set have(keep= id start_date end_date);
    _start = start_date; output;
    _end = start_date-1; output;
    _end = end_date; output;
    if end_date < '31DEC9999'd then do;
        _start = end_date+1; output;
    end;
run;

proc sql;
    create table all_intervals as
    select distinct t1.id, t1._start, t2._end
    from all_intervals t1, all_intervals t2
    where t1.id = t2.id and t2._end > t1._start
;
quit;

data all_intervals;
set all_intervals;
by id _start;
if first.id or first._start;
run;

proc sql noprint;
    select 'max(t1.'||NAME||') as '||NAME into :attributes separated by ','
    from sashelp.vcolumn
    where libname = "WORK" and memname = "HAVE" and upcase(name) not in ('ID', "_START", "_END")
;
quit;

proc sql;
    create table merge as
    select t2.id, t2._start as start_date, t2._end as end_date, &attributes.
    from have t1 right join all_intervals t2
    on t1.id = t2.id
    and ((t2._start <= t1.start_date <= t2._end)
        or (t2._start <= t1.end_date <=  t2._end)
        or (t2._start >= t1.start_date and t2._end <= t1.end_date))
    group by t2.id, t2._start
    order by t2.id, t2._start
;
quit;

proc sort data=merge out=want nodupkey; by id start_date end_date; run;

The above produce the expected output.

分享到QQ

分享到微博