SAS 中的 Jaro-Winkler 字符串比较函数

发布于 2024-11-27 00:17:25 字数 223 浏览 1 评论 0原文

SAS 中是否有 Jaro-Winkler 字符串比较的实现?

看起来 Link King 提供了 Jaro-Winkler,但我更希望能够灵活地自己调用该函数。

谢谢!

Is there an implementation of the Jaro-Winkler string comparison in SAS?

It looks like Link King has Jaro-Winkler, but I'd prefer the flexibility of calling the function myself.

Thanks!

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3)

躲猫猫 2024-12-04 00:17:25

据我所知,SAS 没有内置的 Jaro-Winkler 距离函数。@Itzy 已经提到了我所知道的仅有的几个实现。不过,如果您愿意,可以使用 proc fcmp 编写自己的函数。我甚至可以用下面的代码为您开个头。我只是尝试按照维基百科上的文章来实现。它绝对算不上是 Bill Winkler 的 strcmp.c 文件的完美再现,并且很可能有不少错误。

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Builds, in matchChars, the list of characters of string1 that have a
       match in string2 (callers take its length to get the match count).

       A character at position i of string1 matches an equal, not yet used
       character at position p of string2 when p is no farther than
       floor(max(|s1|, |s2|)/2)-1 positions from i.

       string1, string2 : strings to compare
       matchChars       : output - matched characters, in string1 order

       Each character of string2 may take part in only one match, so a used
       position is overwritten with '~'.  string2 is not listed in OUTARGS,
       so the overwrite is expected to stay inside this subroutine.
       Limitation: a literal '~' in string1 can pair with a used position. */

    str1_len = length(strip(string1));
    str2_len = length(strip(string2));

    /* clamp at 0: for one-character strings the formula yields -1, which
       would make even identical characters unmatchable */
    allowedDist = max(0, floor(max(str1_len, str2_len)/2) - 1);

    matchChars="";

    /* walk through string 1 and match characters to string2 */
    do i = 1 to str1_len;
      x = substr(string1,i,1);
      /* first unused occurrence of x at or after the left edge of the window */
      position = findc(string2, x, max(1,i-allowedDist));
      if position > 0 then do;
        /* reject hits beyond the right edge of the window */
        if position - i <= allowedDist then do;
          /* build list of matched characters */
          matchChars = cats(matchChars, x);
          /* consume the position so it cannot be matched a second time */
          substr(string2, position, 1) = '~';
        end;
      end;
    end;
    matchChars = strip(matchChars);
  endsub;


  function jarotrans (string1 $ , string2 $ );
    /* Returns half the number of positions at which the two strings hold
       different characters, comparing only up to the shorter of the two
       stripped lengths. */
    n_compare = min(length(strip(string1)), length(strip(string2)));
    mismatches = 0;
    do pos = 1 to n_compare;
      if substr(string1,pos,1) ne substr(string2,pos,1)
      then mismatches = mismatches + 1;
    end;
    half_swaps = mismatches/2;
    return(half_swaps);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* Returns the number of leading characters that string1 and string2
        have in common, capped at maxprelen.

        string1, string2 : strings to compare
        maxprelen        : largest prefix length to report

        Returns 0 when the first characters differ or either string is
        blank, and the cap itself when every compared character matches. */

     /* lengthn (unlike length) reports 0 for a blank string */
     n = max(0, min(maxprelen, lengthn(string1), lengthn(string2)));
     do i = 1 to n;
       /* first mismatch at i means exactly i-1 characters matched */
       if substr(string1,i,1) ne substr(string2,i,1)
       then return(i-1);
     end;
     /* no mismatch within the first n characters */
     return(n);
  endsub;

  function jarodist(string1 $, string2 $);
    /* Returns the Jaro score of string1 and string2:
         ( m1/|s1| + m2/|s2| + (m1 - t)/m1 ) / 3
       where m1 and m2 are the lengths of the matched-character lists
       produced by jaromatch in each direction and t is the value returned
       by jarotrans for those two lists.  Returns 0 when nothing matches. */

    /* get number of matched characters; lengthn is required here because
       length() reports 1 for a blank string, which would make the
       "no match" guards below unreachable */
    call jaromatch(string1, string2, m1);
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);

    /* both inputs are non-blank at this point (each produced a match),
       so none of the divisors can be 0 */
    j_dist =  (m1_len/length(string1)
             + m2_len/length(string2)
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    /* Returns the jarodist score of the two strings, raised by
       prelen * prefixscale * (1 - score) when getPrefixlen (capped at 4)
       reports a non-zero common prefix length. */
    score = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    if shared ne 0 then score = score + shared * prefixscale * (1-score);
    return(score);
  endsub;

run;quit;

/* tell SAS where to find the functions we just wrote */
option cmplib=work.jaro;

/* Now let's try it out! */
/* Scores one sample pair and writes both results to the log:
   x = jarodist score, y = jarowink score with a prefix scale of 0.1 */
data _null_;
string1='DIXON';
string2='DICKSONX';
x=jarodist(string1, string2);
y=jarowink(string1, string2, 0.1);
put x= y=;
run;

There is no built-in function for Jaro-Winkler distance that I am aware of. @Itzy already referenced the only ones that I know of. You can roll your own functions with proc fcmp, though, if you feel up to it. I'll even give you a head start with the code below. I just tried to follow the Wikipedia article on it. It certainly isn't close to being a perfect representation of Bill Winkler's strcmp.c file by any means and likely has lots of bugs.

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Builds, in matchChars, the list of characters of string1 that have a
       match in string2 (callers take its length to get the match count).

       A character at position i of string1 matches an equal, not yet used
       character at position p of string2 when p is no farther than
       floor(max(|s1|, |s2|)/2)-1 positions from i.

       string1, string2 : strings to compare
       matchChars       : output - matched characters, in string1 order

       Each character of string2 may take part in only one match, so a used
       position is overwritten with '~'.  string2 is not listed in OUTARGS,
       so the overwrite is expected to stay inside this subroutine.
       Limitation: a literal '~' in string1 can pair with a used position. */

    str1_len = length(strip(string1));
    str2_len = length(strip(string2));

    /* clamp at 0: for one-character strings the formula yields -1, which
       would make even identical characters unmatchable */
    allowedDist = max(0, floor(max(str1_len, str2_len)/2) - 1);

    matchChars="";

    /* walk through string 1 and match characters to string2 */
    do i = 1 to str1_len;
      x = substr(string1,i,1);
      /* first unused occurrence of x at or after the left edge of the window */
      position = findc(string2, x, max(1,i-allowedDist));
      if position > 0 then do;
        /* reject hits beyond the right edge of the window */
        if position - i <= allowedDist then do;
          /* build list of matched characters */
          matchChars = cats(matchChars, x);
          /* consume the position so it cannot be matched a second time */
          substr(string2, position, 1) = '~';
        end;
      end;
    end;
    matchChars = strip(matchChars);
  endsub;


  function jarotrans (string1 $ , string2 $ );
    /* Returns half the number of positions at which the two strings hold
       different characters, comparing only up to the shorter of the two
       stripped lengths. */
    n_compare = min(length(strip(string1)), length(strip(string2)));
    mismatches = 0;
    do pos = 1 to n_compare;
      if substr(string1,pos,1) ne substr(string2,pos,1)
      then mismatches = mismatches + 1;
    end;
    half_swaps = mismatches/2;
    return(half_swaps);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* Returns the number of leading characters that string1 and string2
        have in common, capped at maxprelen.

        string1, string2 : strings to compare
        maxprelen        : largest prefix length to report

        Returns 0 when the first characters differ or either string is
        blank, and the cap itself when every compared character matches. */

     /* lengthn (unlike length) reports 0 for a blank string */
     n = max(0, min(maxprelen, lengthn(string1), lengthn(string2)));
     do i = 1 to n;
       /* first mismatch at i means exactly i-1 characters matched */
       if substr(string1,i,1) ne substr(string2,i,1)
       then return(i-1);
     end;
     /* no mismatch within the first n characters */
     return(n);
  endsub;

  function jarodist(string1 $, string2 $);
    /* Returns the Jaro score of string1 and string2:
         ( m1/|s1| + m2/|s2| + (m1 - t)/m1 ) / 3
       where m1 and m2 are the lengths of the matched-character lists
       produced by jaromatch in each direction and t is the value returned
       by jarotrans for those two lists.  Returns 0 when nothing matches. */

    /* get number of matched characters; lengthn is required here because
       length() reports 1 for a blank string, which would make the
       "no match" guards below unreachable */
    call jaromatch(string1, string2, m1);
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);

    /* both inputs are non-blank at this point (each produced a match),
       so none of the divisors can be 0 */
    j_dist =  (m1_len/length(string1)
             + m2_len/length(string2)
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    /* Returns the jarodist score of the two strings, raised by
       prelen * prefixscale * (1 - score) when getPrefixlen (capped at 4)
       reports a non-zero common prefix length. */
    score = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    if shared ne 0 then score = score + shared * prefixscale * (1-score);
    return(score);
  endsub;

run;quit;

/* tell SAS where to find the functions we just wrote */
option cmplib=work.jaro;

/* Now let's try it out! */
/* Scores one sample pair and writes both results to the log:
   x = jarodist score, y = jarowink score with a prefix scale of 0.1 */
data _null_;
string1='DIXON';
string2='DICKSONX';
x=jarodist(string1, string2);
y=jarowink(string1, string2, 0.1);
put x= y=;
run;
过度放纵 2024-12-04 00:17:25

我不这么认为。它可以执行 Levenshtein 距离(complev 函数)或广义编辑距离(compged),但我还没有看到任何其他编辑距离函数。

如果您执意要在 SAS 中执行此操作,您可以在 PROC IML 中编写一个程序。

I don't think so. It can do the Levenshtein distance (the complev function) or a generalized edit distance (compged), but I haven't seen any other edit distance functions.

If you're dead set on doing this in SAS you could write a program in PROC IML.

止于盛夏 2024-12-04 00:17:25

我修改并纠正了 cmjohns 的代码。感谢他/她帮我开了个头。Winkler 在他的论文
Winkler, W. E. (2006), "Overview of Record Linkage and Current Research Directions"
(Research Report Series, RRS)中发表了一些示例(参见表 6)。我使用这些示例来测试我的代码。

proc fcmp outlib=work.jaro.chars;

  /* Returns matched characters between 2 strings. Two chars from string1 and string2
     are considered matching if they are no farther apart than 
     floor(max(|s1|, |s2|)/2)-1                                                      */
  /* Returns the characters of string1 that have a match in string2, in
     string1 order (at most 40 characters, per the declared return length).

     A character at position i of string1 matches an equal, not yet used
     character of string2 located no farther than
     floor(max(|s1|, |s2|)/2)-1 positions from i.

     Used positions of string2 are overwritten with '~' so they cannot be
     matched twice; a literal '~' in string1 can therefore pair with a used
     position.                                                             */
  function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40;

    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    /* clamp at 0: for one-character strings the formula yields -1, which
       would leave an empty search window even for identical characters */
    allowedDist = max(0, floor(max(str1_len, str2_len) / 2) - 1);

    *** walk through string1 and match characters to string2 ***;
    matchChars = "";
    do i = 1 to str1_len;
      *** part of string2 to search, clamped to the bounds of string2 ***;
      allowed_start = max(1, i - allowedDist);
      allowed_end   = min(str2_len, i + allowedDist);

      /* skip positions whose window lies entirely past the end of string2,
         so SUBSTR is never called with an out-of-range start or length */
      if allowed_start <= allowed_end then do;
        *** find i char from string1 in string2 within the allowedDist ***;
        position = findc(substr(string2, allowed_start, allowed_end - allowed_start + 1),
                         substr(string1, i, 1));
        if position > 0 then do;
          matchChars = cats(matchChars, substr(string1, i, 1));
          *** Once a char is assigned, it can not be assigned again. So, chg char in string2. ***;
          substr(string2, allowed_start + position - 1, 1) = '~';
        end;
      end;
    end;
    return(strip(matchChars));
  endsub;

  /* count the number of "half" transpositions */
  /* Half the number of positions (up to the shorter stripped length) at
     which string1 and string2 hold different characters. */
  function jarotrans(string1 $, string2 $);
    last_pos = min(lengthn(strip(string1)), lengthn(strip(string2)));
    mismatches = 0;
    do pos = 1 to last_pos;
      if substr(string1, pos, 1) ne substr(string2, pos, 1) then do;
        mismatches = mismatches + 1;
      end;
    end;

    half_swaps = mismatches / 2;
    return(half_swaps);
  endsub;

  /* get the length of the matching characters at the beginning */
  /* Number of leading characters shared by string1 and string2, never
     more than maxprelen. */
  function getPrefixlen(string1 $, string2 $, maxprelen);
    limit = min(maxprelen, lengthn(string1), lengthn(string2));

    *** the loop body never runs when limit is 0, so that case needs no separate branch ***;
    do pos = 1 to limit;
      if substr(string1, pos, 1) ne substr(string2, pos, 1)
      then return(pos - 1);
    end;

    *** every compared character matched ***;
    return(limit);
  endsub;

  /* calc the jaro distance */
  /* Jaro score: the mean of the matched share of string1, the matched
     share of string2, and the share of matches not counted by jarotrans.
     Returns 0 as soon as either direction yields no matched characters. */
  function jarodist(string1 $, string2 $);
    *** characters of string1 matched against string2 ***;
    matched1 = jaromatch(string1, string2);
    n1 = lengthn(matched1);
    if n1 = 0 then return(0);

    *** characters of string2 matched against string1 ***;
    matched2 = jaromatch(string2, string1);
    n2 = lengthn(matched2);
    if n2 = 0 then return(0);

    *** half-transposition count between the two matched lists ***;
    half_swaps = jarotrans(matched1, matched2);

    *** the three components of the score ***;
    share1  = n1 / lengthn(string1);
    share2  = n2 / lengthn(string2);
    ordered = (n1 - half_swaps) / n1;

    return((share1 + share2 + ordered) / 3);
  endsub;

  /* calc the jaro-winkler distance */
  /* Jaro-Winkler score of the stripped, upper-cased inputs: the jarodist
     score raised by prelen * prefixscale * (1 - score), where prelen is
     the common prefix length capped at 4.  Equal inputs score 1. */
  function jarowink(string1 $, string2 $, prefixscale);
    *** normalise both inputs before comparing ***;
    string1 = upcase(strip(string1));
    string2 = upcase(strip(string2));

    *** trivial case: nothing to compute for equal strings ***;
    if string1 = string2 then return(1.0);

    score = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    return(score + shared * prefixscale * (1 - score));
  endsub;

run;

*** tell SAS where to find the functions we just wrote ***;
option cmplib=work.jaro;

    /* test code */
    /* For each name pair: write a literal line holding the two names and a
       reference score, then the jarowink result (prefix scale 0.1) as jw=,
       then a blank line, so the two numbers can be compared in the log.
       The reference scores are presumably the published values the
       accompanying text refers to (Winkler 2006, table 6) - confirm.     */
data _null_;
  put 'SHACKLEFORD SHACKELFORD 0.982';
  jw = jarowink('SHACKLEFORD', 'SHACKELFORD', 0.1);
  put jw=;
  put;
  put 'DUNNINGHAM  CUNNIGHAM   0.896';
  jw = jarowink('DUNNINGHAM', 'CUNNIGHAM', 0.1);
  put jw=;
  put;
  put 'NICHLESON   NICHULSON   0.956';
  jw = jarowink('NICHLESON', 'NICHULSON', 0.1);
  put jw=;
  put;
  put 'JONES       JOHNSON     0.832';
  jw = jarowink('JONES', 'JOHNSON', 0.1);
  put jw=;
  put;
  put 'MASSEY      MASSIE      0.933';
  jw = jarowink('MASSEY', 'MASSIE', 0.1);
  put jw=;
  put;
  put 'ABROMS      ABRAMS      0.922';
  jw = jarowink('ABROMS', 'ABRAMS', 0.1);
  put jw=;
  put; 
  put 'JERALDINE   GERALDINE   0.926';
  jw = jarowink('JERALDINE', 'GERALDINE', 0.1);
  put jw=;
  put;
  put 'MARHTA      MARTHA      0.961';
  jw = jarowink('MARHTA', 'MARTHA', 0.1);
  put jw=;
  put;
  put 'MICHELLE    MICHAEL     0.921';
  jw = jarowink('MICHELLE', 'MICHAEL', 0.1);
  put jw=;
  put;
  put 'JULIES      JULIUS      0.933';
  jw = jarowink('JULIES', 'JULIUS', 0.1);
  put jw=;
  put;
  put 'TANYA       TONYA       0.880';
  jw = jarowink('TANYA', 'TONYA', 0.1);
  put jw=;
  put;
  put 'DWAYNE      DUANE       0.840';
  jw = jarowink('DWAYNE', 'DUANE', 0.1);
  put jw=;
  put;
  put 'SEAN        SUSAN       0.805';
  jw = jarowink('SEAN', 'SUSAN', 0.1);
  put jw=;
  put;
  put 'JON         JOHN        0.933';
  jw = jarowink('JON', 'JOHN', 0.1);
  put jw=;
  put;
run;

I modified and corrected cmjohns' code. Thanks to him/her for starting me off. Winkler published
some examples in his paper Winkler, W. E. (2006). "Overview of Record Linkage and Current
Research Directions". Research Report Series, RRS. (See table 6) I used the examples to test my code.

proc fcmp outlib=work.jaro.chars;

  /* Returns matched characters between 2 strings. Two chars from string1 and string2
     are considered matching if they are no farther apart than 
     floor(max(|s1|, |s2|)/2)-1                                                      */
  /* Returns the characters of string1 that have a match in string2, in
     string1 order (at most 40 characters, per the declared return length).

     A character at position i of string1 matches an equal, not yet used
     character of string2 located no farther than
     floor(max(|s1|, |s2|)/2)-1 positions from i.

     Used positions of string2 are overwritten with '~' so they cannot be
     matched twice; a literal '~' in string1 can therefore pair with a used
     position.                                                             */
  function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40;

    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    /* clamp at 0: for one-character strings the formula yields -1, which
       would leave an empty search window even for identical characters */
    allowedDist = max(0, floor(max(str1_len, str2_len) / 2) - 1);

    *** walk through string1 and match characters to string2 ***;
    matchChars = "";
    do i = 1 to str1_len;
      *** part of string2 to search, clamped to the bounds of string2 ***;
      allowed_start = max(1, i - allowedDist);
      allowed_end   = min(str2_len, i + allowedDist);

      /* skip positions whose window lies entirely past the end of string2,
         so SUBSTR is never called with an out-of-range start or length */
      if allowed_start <= allowed_end then do;
        *** find i char from string1 in string2 within the allowedDist ***;
        position = findc(substr(string2, allowed_start, allowed_end - allowed_start + 1),
                         substr(string1, i, 1));
        if position > 0 then do;
          matchChars = cats(matchChars, substr(string1, i, 1));
          *** Once a char is assigned, it can not be assigned again. So, chg char in string2. ***;
          substr(string2, allowed_start + position - 1, 1) = '~';
        end;
      end;
    end;
    return(strip(matchChars));
  endsub;

  /* count the number of "half" transpositions */
  /* Half the number of positions (up to the shorter stripped length) at
     which string1 and string2 hold different characters. */
  function jarotrans(string1 $, string2 $);
    last_pos = min(lengthn(strip(string1)), lengthn(strip(string2)));
    mismatches = 0;
    do pos = 1 to last_pos;
      if substr(string1, pos, 1) ne substr(string2, pos, 1) then do;
        mismatches = mismatches + 1;
      end;
    end;

    half_swaps = mismatches / 2;
    return(half_swaps);
  endsub;

  /* get the length of the matching characters at the beginning */
  /* Number of leading characters shared by string1 and string2, never
     more than maxprelen. */
  function getPrefixlen(string1 $, string2 $, maxprelen);
    limit = min(maxprelen, lengthn(string1), lengthn(string2));

    *** the loop body never runs when limit is 0, so that case needs no separate branch ***;
    do pos = 1 to limit;
      if substr(string1, pos, 1) ne substr(string2, pos, 1)
      then return(pos - 1);
    end;

    *** every compared character matched ***;
    return(limit);
  endsub;

  /* calc the jaro distance */
  /* Jaro score: the mean of the matched share of string1, the matched
     share of string2, and the share of matches not counted by jarotrans.
     Returns 0 as soon as either direction yields no matched characters. */
  function jarodist(string1 $, string2 $);
    *** characters of string1 matched against string2 ***;
    matched1 = jaromatch(string1, string2);
    n1 = lengthn(matched1);
    if n1 = 0 then return(0);

    *** characters of string2 matched against string1 ***;
    matched2 = jaromatch(string2, string1);
    n2 = lengthn(matched2);
    if n2 = 0 then return(0);

    *** half-transposition count between the two matched lists ***;
    half_swaps = jarotrans(matched1, matched2);

    *** the three components of the score ***;
    share1  = n1 / lengthn(string1);
    share2  = n2 / lengthn(string2);
    ordered = (n1 - half_swaps) / n1;

    return((share1 + share2 + ordered) / 3);
  endsub;

  /* calc the jaro-winkler distance */
  /* Jaro-Winkler score of the stripped, upper-cased inputs: the jarodist
     score raised by prelen * prefixscale * (1 - score), where prelen is
     the common prefix length capped at 4.  Equal inputs score 1. */
  function jarowink(string1 $, string2 $, prefixscale);
    *** normalise both inputs before comparing ***;
    string1 = upcase(strip(string1));
    string2 = upcase(strip(string2));

    *** trivial case: nothing to compute for equal strings ***;
    if string1 = string2 then return(1.0);

    score = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    return(score + shared * prefixscale * (1 - score));
  endsub;

run;

*** tell SAS where to find the functions we just wrote ***;
option cmplib=work.jaro;

    /* test code */
    /* For each name pair: write a literal line holding the two names and a
       reference score, then the jarowink result (prefix scale 0.1) as jw=,
       then a blank line, so the two numbers can be compared in the log.
       The reference scores are presumably the published values the
       accompanying text refers to (Winkler 2006, table 6) - confirm.     */
data _null_;
  put 'SHACKLEFORD SHACKELFORD 0.982';
  jw = jarowink('SHACKLEFORD', 'SHACKELFORD', 0.1);
  put jw=;
  put;
  put 'DUNNINGHAM  CUNNIGHAM   0.896';
  jw = jarowink('DUNNINGHAM', 'CUNNIGHAM', 0.1);
  put jw=;
  put;
  put 'NICHLESON   NICHULSON   0.956';
  jw = jarowink('NICHLESON', 'NICHULSON', 0.1);
  put jw=;
  put;
  put 'JONES       JOHNSON     0.832';
  jw = jarowink('JONES', 'JOHNSON', 0.1);
  put jw=;
  put;
  put 'MASSEY      MASSIE      0.933';
  jw = jarowink('MASSEY', 'MASSIE', 0.1);
  put jw=;
  put;
  put 'ABROMS      ABRAMS      0.922';
  jw = jarowink('ABROMS', 'ABRAMS', 0.1);
  put jw=;
  put; 
  put 'JERALDINE   GERALDINE   0.926';
  jw = jarowink('JERALDINE', 'GERALDINE', 0.1);
  put jw=;
  put;
  put 'MARHTA      MARTHA      0.961';
  jw = jarowink('MARHTA', 'MARTHA', 0.1);
  put jw=;
  put;
  put 'MICHELLE    MICHAEL     0.921';
  jw = jarowink('MICHELLE', 'MICHAEL', 0.1);
  put jw=;
  put;
  put 'JULIES      JULIUS      0.933';
  jw = jarowink('JULIES', 'JULIUS', 0.1);
  put jw=;
  put;
  put 'TANYA       TONYA       0.880';
  jw = jarowink('TANYA', 'TONYA', 0.1);
  put jw=;
  put;
  put 'DWAYNE      DUANE       0.840';
  jw = jarowink('DWAYNE', 'DUANE', 0.1);
  put jw=;
  put;
  put 'SEAN        SUSAN       0.805';
  jw = jarowink('SEAN', 'SUSAN', 0.1);
  put jw=;
  put;
  put 'JON         JOHN        0.933';
  jw = jarowink('JON', 'JOHN', 0.1);
  put jw=;
  put;
run;
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文