如何使用 sscanf() 在 C 中解析 URL？

发布于 2024-08-18 20:53:43 字数 2463 浏览 15 评论 0原文

这是我的 C 代码，它从文件中读取 URL 列表，并尝试分隔 URL 的各个部分。这只是粗略的解析，我不关心特殊情况。我猜 sscanf() 语句有一些错误；当我运行这个时，我得到“分段错误”。此外，完整的 url 被分配给“proto”字符串。

#include<stdio.h>
#include<string.h>
#include<stdlib.h>

# define BAG_SIZE 14

char bag[117][30];

void initbag()
{
strcpy(bag[0],"account");
strcpy(bag[1],"audit");
strcpy(bag[2],"ad");
strcpy(bag[3],"advertising");
strcpy(bag[4],"marketing");
strcpy(bag[5],"application");
strcpy(bag[6],"banking");
strcpy(bag[7],"barter");
strcpy(bag[8],"business");
strcpy(bag[9],"econo");
strcpy(bag[10],"commerce");
strcpy(bag[11],"communication");
strcpy(bag[12],"computer");
strcpy(bag[13],"processing");
}
/*
 other bag[] values will be later copied
*/

void substr(char dest[10],char src[200],int start,int len)
{
int i,j;

for(i=start,j=0;i<start+len;i++,j++)
dest[j]=src[i];
dest[j]='\0';

}

int found(char* word)
{
   int i;
   for(i=0;i<BAG_SIZE;i++)
   if((!strcmp(word,bag[i]))||(strstr(bag[i],word)!=NULL)) return 1;
   return 0;
}

void main()
{
int i,j,k;

char buff[10],fullurl[100];
char proto[5],www[4],host[100],tokens[200],tld[4];
float feature[11];for(i=0;i<11;i++) feature[i]=0;
FILE *furl,*fop;
furl=fopen("bizurls.txt","r");
fop=fopen("urlsvm.txt","w");
initbag();
printf("\nbag initialised");fflush(stdout);

while(!feof(furl))
{
   fscanf(furl,"%s",fullurl);
   printf("%s",fullurl);
   sscanf(fullurl,"%s://%s.%s.%s/%s\n",proto,www,host,tld,tokens);// this line isnt working properly
   printf("2hi");fflush(stdout);
   printf("proto : %s\nwww:%s\nhost :%s\ntld:%s\ntokens:%s\n",proto,www,host,tld,tokens);fflush(stdout);


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(host)-i+1;j++)
           {
                substr(buff,host,j,i);
                if(found(buff)) feature[i-3]++;

           }
   }
  if((!strcmp(tld,"biz"))||(!strcmp(tld,"org"))||(!strcmp(tld,"com"))||(!strcmp(tld,"jobs")))   
        feature[0]=1;
  else if((!strcmp(tld,"info"))||(!strcmp(tld,"coop"))||(!strcmp(tld,"net")))
        feature[0]=0.5;
  else
    feature[0]=0;


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(tokens)-i+1;j++)
           {
                substr(buff,tokens,j,i);
                if(found(buff)) feature[i+2]++;

           }
   }

/*.biz · .com · .info · .name · .net · .org · .pro
.aero, .coop, .jobs, .travel */

for(i=0;i<11;i++) fprintf(fop," %d:%f",i,feature[i]);
fprintf(fop,"\n");


}
fflush(fop);
fclose(furl);
fclose(fop);
}

原文

This is my C code that reads a list of URLs from a file, and tries to separate the various parts of the URL. This is just rough parsing, I'm not bothered about special cases. I guess there is some fault with the sscanf() statement; when I run this, I get "segmentation FAULT". And moreover, the full url is being assigned to the "proto" string.

#include<stdio.h>
#include<string.h>
#include<stdlib.h>

# define BAG_SIZE 14

char bag[117][30];

void initbag()
{
strcpy(bag[0],"account");
strcpy(bag[1],"audit");
strcpy(bag[2],"ad");
strcpy(bag[3],"advertising");
strcpy(bag[4],"marketing");
strcpy(bag[5],"application");
strcpy(bag[6],"banking");
strcpy(bag[7],"barter");
strcpy(bag[8],"business");
strcpy(bag[9],"econo");
strcpy(bag[10],"commerce");
strcpy(bag[11],"communication");
strcpy(bag[12],"computer");
strcpy(bag[13],"processing");
}
/*
 other bag[] values will be later copied
*/

void substr(char dest[10],char src[200],int start,int len)
{
int i,j;

for(i=start,j=0;i<start+len;i++,j++)
dest[j]=src[i];
dest[j]='\0';

}

int found(char* word)
{
   int i;
   for(i=0;i<BAG_SIZE;i++)
   if((!strcmp(word,bag[i]))||(strstr(bag[i],word)!=NULL)) return 1;
   return 0;
}

void main()
{
int i,j,k;

char buff[10],fullurl[100];
char proto[5],www[4],host[100],tokens[200],tld[4];
float feature[11];for(i=0;i<11;i++) feature[i]=0;
FILE *furl,*fop;
furl=fopen("bizurls.txt","r");
fop=fopen("urlsvm.txt","w");
initbag();
printf("\nbag initialised");fflush(stdout);

while(!feof(furl))
{
   fscanf(furl,"%s",fullurl);
   printf("%s",fullurl);
   sscanf(fullurl,"%s://%s.%s.%s/%s\n",proto,www,host,tld,tokens);// this line isnt working properly
   printf("2hi");fflush(stdout);
   printf("proto : %s\nwww:%s\nhost :%s\ntld:%s\ntokens:%s\n",proto,www,host,tld,tokens);fflush(stdout);


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(host)-i+1;j++)
           {
                substr(buff,host,j,i);
                if(found(buff)) feature[i-3]++;

           }
   }
  if((!strcmp(tld,"biz"))||(!strcmp(tld,"org"))||(!strcmp(tld,"com"))||(!strcmp(tld,"jobs")))   
        feature[0]=1;
  else if((!strcmp(tld,"info"))||(!strcmp(tld,"coop"))||(!strcmp(tld,"net")))
        feature[0]=0.5;
  else
    feature[0]=0;


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(tokens)-i+1;j++)
           {
                substr(buff,tokens,j,i);
                if(found(buff)) feature[i+2]++;

           }
   }

/*.biz · .com · .info · .name · .net · .org · .pro
.aero, .coop, .jobs, .travel */

for(i=0;i<11;i++) fprintf(fop," %d:%f",i,feature[i]);
fprintf(fop,"\n");


}
fflush(fop);
fclose(furl);
fclose(fop);
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

一花一树开 2024-08-25 20:53:43

这里有很多答案：
使用 C 解析 URL 的最佳方法？

回复收藏 0 原文

倒数 2024-08-25 20:53:43

sscanf 中的 %s 仅当遇到第一个空白字符、字符串末尾或指定的最大长度时才会停止。由于 URL 没有空格，这就是 proto 变成 fullurl 的原因。

对于分段错误：由于 proto 只能容纳 5 个字节（包括尾随 null，因此只有 4 个字节的数据，不能覆盖 https 等），将完整的 URL 放入其中将导致缓冲区溢出/分段错误。 sscanf 在这方面是相当有问题的。文档要求每个接收 %s 的字符缓冲区应该足够大以容纳完整的字符串（加上 \0）。

回复收藏 0 原文