这个匹配 URL 的正则表达式有什么问题?

发布于 2024-08-10 19:50:15 字数 519 浏览 2 评论 0原文

我在使用 C 中的 regex.h 库设置正则表达式来匹配 URL 时遇到了一些困难。我有一个有效的 IP 正则表达式,我希望将其转换为匹配简单的字符串,例如 www.alphanumerictext12.com|edu|org。我的正则表达式定义本身的语法有问题。

下面是有效的 IPREGEX 代码和我尝试的 URL 正则表达式。

#define IPEXPR    "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})"
#define URLEXPR "(www)\\.((?:[a-z][a-z]*[0-9]+[a-z0-9]*))\\.(com|edu|org)"

regex_t regex;

if(regcomp(&regex, IPEXPR, REG_EXTENDED) != 0)
  return 0;

if(regexec(&regex, inputURL, 0, NULL, 0) != 0)
  return 0;

I'm having a bit of difficulty setting up a regex to match a URL using the regex.h library in C. I have a working IP regex that I was hoping to convert to match a simple string such as www.alphanumerictext12.com|edu|org. Something is wrong with my syntax in the regex definition itself.

Below is the working IPREGEX code and my attempted URL regex.

#define IPEXPR    "([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})\\.([0-9]{1,3})"
#define URLEXPR "(www)\\.((?:[a-z][a-z]*[0-9]+[a-z0-9]*))\\.(com|edu|org)"

regex_t regex;

if(regcomp(&regex, IPEXPR, REG_EXTENDED) != 0)
  return 0;

if(regexec(&regex, inputURL, 0, NULL, 0) != 0)
  return 0;

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(4)

暖阳 2024-08-17 19:50:15

尝试:

"www\\.[a-z]+[a-z0-9]*\\.(com|edu|org)"

我删除了 [0-9]+,并将 [a-z][a-z]* 替换为 [a-z]+。

Try:

"www\\.[a-z]+[a-z0-9]*\\.(com|edu|org)"

I removed the [0-9]+ and replaced [a-z][a-z]* with [a-z]+.

雾里花 2024-08-17 19:50:15

问题出在 (?: ) 上,您只需要 (www)\\.([a-z][a-z]*[0-9]+[a-z0-9]*)\\.(com|edu|org)

顺便说一句,你的内部表达式的含义是:“至少一个字母字符,然后至少一个数字字符,然后是任意个字母数字字符”。这是你的本意吗?如果是这样,您可以将其缩短一点:[a-z]+[0-9]+[a-z0-9]*

The problem is the (?: ) group: POSIX extended regular expressions have no non-capturing group syntax. You just need (www)\\.([a-z][a-z]*[0-9]+[a-z0-9]*)\\.(com|edu|org).

Btw, your inner expression says: "at least one alpha character, then at least one numeric character, then any alphanumeric characters". Is it what you mean? If so, you can make it a little bit shorter: [a-z]+[0-9]+[a-z0-9]*.

夢归不見 2024-08-17 19:50:15

您可能应该使用 inet_pton() 这是一个标准 POSIX 函数(替换 inet_aton())并处理 IPv4 和 IPv6 地址格式。

You probably should be using inet_pton() which is a standard POSIX function (replacing inet_aton()) and handles both IPv4 and IPv6 address formats.

表情可笑 2024-08-17 19:50:15

来自 Coding Horror:

有些人在遇到问题时会想:“我知道,我会用
正则表达式。”现在他们有了
两个问题。

我的意思是:您确定正则表达式是解决您的问题的最佳方法吗?也许你可以用一些更轻量级的方法来测试该字符串是否是一个URL?


编辑

在我的计算机上,以下程序(输出重定向到 /dev/null)向 stderr 打印:

rx time: 1.730000
lw time: 0.920000

程序清单:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <time.h>

/*
 * Regex-based validator: returns 1 when buf matches
 *   ^www\.[a-z][a-z0-9]*\.(com|edu|org)$
 * (POSIX extended syntax) and 0 otherwise.
 *
 * The pattern is compiled on the first call and kept in a function-local
 * static for later calls. If compilation fails, a message is printed and
 * the process exits with EXIT_FAILURE.
 */
int goodurl_rx(const char *buf) {
  static regex_t compiled;
  static int initialised = 0;

  if (initialised == 0) {
    int rc;

    /* Mark as initialised up front; a failed compile never returns. */
    initialised = 1;
    rc = regcomp(&compiled, "^www\\.[a-z][a-z0-9]*\\.(com|edu|org)$", REG_EXTENDED);
    if (rc != 0) {
      printf("Error %d compiling regular expression.\n", rc);
      exit(EXIT_FAILURE);
    }
  }

  /* regexec reports a match with 0; translate that into a boolean. */
  if (regexec(&compiled, buf, 0, NULL, 0) == 0) {
    return 1;
  }
  return 0;
}

/*
 * Hand-written ("lightweight") validator for host names of the form
 *   www.<label>.(com|edu|org)
 * where <label> is one lowercase letter followed by any number of lowercase
 * letters or digits, i.e. the same language as the pattern
 *   ^www\.[a-z][a-z0-9]*\.(com|edu|org)$
 *
 * buf must point to a NUL-terminated string.
 * Returns 1 when the whole string matches, 0 otherwise.
 *
 * Character classes are tested with explicit ranges rather than
 * isalpha()/isalnum(): those also accept uppercase letters and vary with the
 * locale, which would make this function accept strings the pattern rejects
 * (e.g. "www.Alpha.com"). The range tests assume a character set in which
 * 'a'..'z' are contiguous (true for ASCII).
 */
int goodurl_lw(const char *buf) {
  /* strncmp stops at the first mismatch or NUL, so short inputs are safe. */
  if (strncmp(buf, "www.", 4) != 0) {
    return 0;
  }
  buf += 4;

  /* The label must start with a lowercase letter ... */
  if (!(*buf >= 'a' && *buf <= 'z')) {
    return 0;
  }
  buf++;

  /* ... followed by zero or more lowercase letters or digits. */
  while ((*buf >= 'a' && *buf <= 'z') || (*buf >= '0' && *buf <= '9')) {
    buf++;
  }

  if (*buf++ != '.') {
    return 0;
  }

  /* strcmp compares through the terminator, so "comx" or "co" is rejected. */
  return strcmp(buf, "com") == 0 || strcmp(buf, "edu") == 0 || strcmp(buf, "org") == 0;
}

/*
 * Benchmark driver: runs each validator one million times over three sample
 * host names, printing every verdict to stdout, then reports the CPU time
 * spent in each phase (as measured by clock()) on stderr.
 */
int main(void) {
  const char *buf[] = {"www.alphanumerics.com", "ww2.alphanumerics.com", "www.alphanumerics.net"};
  clock_t rx_begin, lw_begin, lw_end;
  int remaining;
  int i;

  /* Phase 1: regex-based validator. */
  rx_begin = clock();
  for (remaining = 1000000; remaining > 0; remaining--) {
    for (i = 0; i < 3; i++) {
      printf("    %s: %s\n", buf[i], goodurl_rx(buf[i]) ? "pass" : "invalid");
    }
  }

  /* Phase 2: hand-written validator; its start time ends phase 1. */
  lw_begin = clock();
  for (remaining = 1000000; remaining > 0; remaining--) {
    for (i = 0; i < 3; i++) {
      printf("    %s: %s\n", buf[i], goodurl_lw(buf[i]) ? "pass" : "invalid");
    }
  }
  lw_end = clock();

  fprintf(stderr, "rx time: %f\n", (double)(lw_begin - rx_begin) / CLOCKS_PER_SEC);
  fprintf(stderr, "lw time: %f\n", (double)(lw_end - lw_begin) / CLOCKS_PER_SEC);
  return 0;
}

From Coding Horror:

Some people, when confronted with a problem, think "I know, I'll use
regular expressions." Now they have
two problems.

What I mean is: are you sure a regular expression is the best way to solve your problem? Maybe you can test whether the string is a URL with some more lightweight method?


Edit

The following program on my computer, with output redirected to /dev/null, prints (to stderr)

rx time: 1.730000
lw time: 0.920000

Program Listing:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <time.h>

/*
 * Regex-based validator: returns 1 when buf matches
 *   ^www\.[a-z][a-z0-9]*\.(com|edu|org)$
 * (POSIX extended syntax) and 0 otherwise.
 *
 * The pattern is compiled on the first call and kept in a function-local
 * static for later calls. If compilation fails, a message is printed and
 * the process exits with EXIT_FAILURE.
 */
int goodurl_rx(const char *buf) {
  static regex_t compiled;
  static int initialised = 0;

  if (initialised == 0) {
    int rc;

    /* Mark as initialised up front; a failed compile never returns. */
    initialised = 1;
    rc = regcomp(&compiled, "^www\\.[a-z][a-z0-9]*\\.(com|edu|org)$", REG_EXTENDED);
    if (rc != 0) {
      printf("Error %d compiling regular expression.\n", rc);
      exit(EXIT_FAILURE);
    }
  }

  /* regexec reports a match with 0; translate that into a boolean. */
  if (regexec(&compiled, buf, 0, NULL, 0) == 0) {
    return 1;
  }
  return 0;
}

/*
 * Hand-written ("lightweight") validator for host names of the form
 *   www.<label>.(com|edu|org)
 * where <label> is one lowercase letter followed by any number of lowercase
 * letters or digits, i.e. the same language as the pattern
 *   ^www\.[a-z][a-z0-9]*\.(com|edu|org)$
 *
 * buf must point to a NUL-terminated string.
 * Returns 1 when the whole string matches, 0 otherwise.
 *
 * Character classes are tested with explicit ranges rather than
 * isalpha()/isalnum(): those also accept uppercase letters and vary with the
 * locale, which would make this function accept strings the pattern rejects
 * (e.g. "www.Alpha.com"). The range tests assume a character set in which
 * 'a'..'z' are contiguous (true for ASCII).
 */
int goodurl_lw(const char *buf) {
  /* strncmp stops at the first mismatch or NUL, so short inputs are safe. */
  if (strncmp(buf, "www.", 4) != 0) {
    return 0;
  }
  buf += 4;

  /* The label must start with a lowercase letter ... */
  if (!(*buf >= 'a' && *buf <= 'z')) {
    return 0;
  }
  buf++;

  /* ... followed by zero or more lowercase letters or digits. */
  while ((*buf >= 'a' && *buf <= 'z') || (*buf >= '0' && *buf <= '9')) {
    buf++;
  }

  if (*buf++ != '.') {
    return 0;
  }

  /* strcmp compares through the terminator, so "comx" or "co" is rejected. */
  return strcmp(buf, "com") == 0 || strcmp(buf, "edu") == 0 || strcmp(buf, "org") == 0;
}

/*
 * Benchmark driver: runs each validator one million times over three sample
 * host names, printing every verdict to stdout, then reports the CPU time
 * spent in each phase (as measured by clock()) on stderr.
 */
int main(void) {
  const char *buf[] = {"www.alphanumerics.com", "ww2.alphanumerics.com", "www.alphanumerics.net"};
  clock_t rx_begin, lw_begin, lw_end;
  int remaining;
  int i;

  /* Phase 1: regex-based validator. */
  rx_begin = clock();
  for (remaining = 1000000; remaining > 0; remaining--) {
    for (i = 0; i < 3; i++) {
      printf("    %s: %s\n", buf[i], goodurl_rx(buf[i]) ? "pass" : "invalid");
    }
  }

  /* Phase 2: hand-written validator; its start time ends phase 1. */
  lw_begin = clock();
  for (remaining = 1000000; remaining > 0; remaining--) {
    for (i = 0; i < 3; i++) {
      printf("    %s: %s\n", buf[i], goodurl_lw(buf[i]) ? "pass" : "invalid");
    }
  }
  lw_end = clock();

  fprintf(stderr, "rx time: %f\n", (double)(lw_begin - rx_begin) / CLOCKS_PER_SEC);
  fprintf(stderr, "lw time: %f\n", (double)(lw_end - lw_begin) / CLOCKS_PER_SEC);
  return 0;
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文