如何使用 sscanf() 在 C 中解析 URL?
这是我的 C 代码,它从文件中读取 URL 列表,并尝试分隔 URL 的各个部分。这只是粗略的解析,我不关心特殊情况。我猜 sscanf() 语句有一些错误;当我运行这个时,我得到“分段错误”。此外,完整的 url 被分配给“proto”字符串。
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
# define BAG_SIZE 14
char bag[117][30];
void initbag()
{
strcpy(bag[0],"account");
strcpy(bag[1],"audit");
strcpy(bag[2],"ad");
strcpy(bag[3],"advertising");
strcpy(bag[4],"marketing");
strcpy(bag[5],"application");
strcpy(bag[6],"banking");
strcpy(bag[7],"barter");
strcpy(bag[8],"business");
strcpy(bag[9],"econo");
strcpy(bag[10],"commerce");
strcpy(bag[11],"communication");
strcpy(bag[12],"computer");
strcpy(bag[13],"processing");
}
/*
other bag[] values will be later copied
*/
void substr(char dest[10],char src[200],int start,int len)
{
int i,j;
for(i=start,j=0;i<start+len;i++,j++)
dest[j]=src[i];
dest[j]='\0';
}
int found(char* word)
{
int i;
for(i=0;i<BAG_SIZE;i++)
if((!strcmp(word,bag[i]))||(strstr(bag[i],word)!=NULL)) return 1;
return 0;
}
void main()
{
int i,j,k;
char buff[10],fullurl[100];
char proto[5],www[4],host[100],tokens[200],tld[4];
float feature[11];for(i=0;i<11;i++) feature[i]=0;
FILE *furl,*fop;
furl=fopen("bizurls.txt","r");
fop=fopen("urlsvm.txt","w");
initbag();
printf("\nbag initialised");fflush(stdout);
while(!feof(furl))
{
fscanf(furl,"%s",fullurl);
printf("%s",fullurl);
sscanf(fullurl,"%s://%s.%s.%s/%s\n",proto,www,host,tld,tokens);// this line isnt working properly
printf("2hi");fflush(stdout);
printf("proto : %s\nwww:%s\nhost :%s\ntld:%s\ntokens:%s\n",proto,www,host,tld,tokens);fflush(stdout);
for( i=4;i<=8;i++)
{
for(j=0;j<strlen(host)-i+1;j++)
{
substr(buff,host,j,i);
if(found(buff)) feature[i-3]++;
}
}
if((!strcmp(tld,"biz"))||(!strcmp(tld,"org"))||(!strcmp(tld,"com"))||(!strcmp(tld,"jobs")))
feature[0]=1;
else if((!strcmp(tld,"info"))||(!strcmp(tld,"coop"))||(!strcmp(tld,"net")))
feature[0]=0.5;
else
feature[0]=0;
for( i=4;i<=8;i++)
{
for(j=0;j<strlen(tokens)-i+1;j++)
{
substr(buff,tokens,j,i);
if(found(buff)) feature[i+2]++;
}
}
/*.biz · .com · .info · .name · .net · .org · .pro
.aero, .coop, .jobs, .travel */
for(i=0;i<11;i++) fprintf(fop," %d:%f",i,feature[i]);
fprintf(fop,"\n");
}
fflush(fop);
fclose(furl);
fclose(fop);
}
This is my C code that reads a list of URLs from a file, and tries to separate the various parts of the URL. This is just rough parsing, I'm not bothered about special cases. I guess there is some fault with the sscanf() statement; when I run this, I get "segmentation FAULT". And moreover, the full url is being assigned to the "proto" string.
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
# define BAG_SIZE 14
char bag[117][30];
void initbag()
{
strcpy(bag[0],"account");
strcpy(bag[1],"audit");
strcpy(bag[2],"ad");
strcpy(bag[3],"advertising");
strcpy(bag[4],"marketing");
strcpy(bag[5],"application");
strcpy(bag[6],"banking");
strcpy(bag[7],"barter");
strcpy(bag[8],"business");
strcpy(bag[9],"econo");
strcpy(bag[10],"commerce");
strcpy(bag[11],"communication");
strcpy(bag[12],"computer");
strcpy(bag[13],"processing");
}
/*
other bag[] values will be later copied
*/
void substr(char dest[10],char src[200],int start,int len)
{
int i,j;
for(i=start,j=0;i<start+len;i++,j++)
dest[j]=src[i];
dest[j]='\0';
}
int found(char* word)
{
int i;
for(i=0;i<BAG_SIZE;i++)
if((!strcmp(word,bag[i]))||(strstr(bag[i],word)!=NULL)) return 1;
return 0;
}
void main()
{
int i,j,k;
char buff[10],fullurl[100];
char proto[5],www[4],host[100],tokens[200],tld[4];
float feature[11];for(i=0;i<11;i++) feature[i]=0;
FILE *furl,*fop;
furl=fopen("bizurls.txt","r");
fop=fopen("urlsvm.txt","w");
initbag();
printf("\nbag initialised");fflush(stdout);
while(!feof(furl))
{
fscanf(furl,"%s",fullurl);
printf("%s",fullurl);
sscanf(fullurl,"%s://%s.%s.%s/%s\n",proto,www,host,tld,tokens);// this line isnt working properly
printf("2hi");fflush(stdout);
printf("proto : %s\nwww:%s\nhost :%s\ntld:%s\ntokens:%s\n",proto,www,host,tld,tokens);fflush(stdout);
for( i=4;i<=8;i++)
{
for(j=0;j<strlen(host)-i+1;j++)
{
substr(buff,host,j,i);
if(found(buff)) feature[i-3]++;
}
}
if((!strcmp(tld,"biz"))||(!strcmp(tld,"org"))||(!strcmp(tld,"com"))||(!strcmp(tld,"jobs")))
feature[0]=1;
else if((!strcmp(tld,"info"))||(!strcmp(tld,"coop"))||(!strcmp(tld,"net")))
feature[0]=0.5;
else
feature[0]=0;
for( i=4;i<=8;i++)
{
for(j=0;j<strlen(tokens)-i+1;j++)
{
substr(buff,tokens,j,i);
if(found(buff)) feature[i+2]++;
}
}
/*.biz · .com · .info · .name · .net · .org · .pro
.aero, .coop, .jobs, .travel */
for(i=0;i<11;i++) fprintf(fop," %d:%f",i,feature[i]);
fprintf(fop,"\n");
}
fflush(fop);
fclose(furl);
fclose(fop);
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(3)
这里有很多答案:
使用 C 解析 URL 的最佳方法?
Many answers here:
Best ways of parsing a URL using C?
sscanf 中的 %s 仅当遇到第一个空白字符、字符串末尾或指定的最大长度时才会停止。由于 URL 没有空格,这就是 proto 变成 fullurl 的原因。
对于分段错误:由于 proto 只能容纳 5 个字节(包括尾随 null,因此只有 4 个字节的数据,不能覆盖 https 等),将完整的 URL 放入其中将导致缓冲区溢出/分段错误。 sscanf 在这方面是相当有问题的。文档要求每个接收 %s 的字符缓冲区应该足够大以容纳完整的字符串(加上 \0)。
%s in sscanf will only stop when it hits the first white-space character, the end of the string or the specified maximum length. Seeing as an URL has no whitespace, that's the reason why proto becomes fullurl.
For the segmentation fault: as proto can only hold 5 bytes (including the trailing null, so only 4 bytes of data which would not cover e.g. https), putting the full URL into it will cause a buffer overflow / segmentation fault. sscanf is rather problematic in this regards. Documentation requests that each char buffer receiving a %s should be big enough to hold the full string (plus \0).
它不会工作,因为
proto
将匹配整个fullurl
而其余部分将不匹配。为此,您应该使用正确的 URL 解析函数或正则表达式。It won't work because
proto
would match the wholefullurl
and the rest will be unmatched. You should use a proper URL parsing function or regex for this.