用 C 中的文件实现的 B 树(不是 B+/B*)中的问题
我是新来的,首先,如果我犯了错误,我想道歉。我的问题是:我想用 C 实现 B 树,使用文件来存储树...我的程序从文本文件中读取 10000 个字符串,每个字符串包含 10 个字符,并将该内容存储在 .DAT 二进制文件中,通过B树组织;然后用户可以搜索字符串。
我正在使用“Cormen等人 - 算法简介(3ed)”的算法,它似乎是正确的、清晰的和实用的。但是,我的程序只是出现运行时错误...例如分段错误和无限循环。我已经尝试调试了5天,但没有成功! B 树函数是递归的,我个人讨厌它......我认为正是递归使调试如此困难!
我的代码比较大,分为2个文件,一个源文件,一个标头。我将在这里发布 B 树的函数和变量声明。但是,如果有人想查看完整的代码,我会发布一个“iFile.it”链接(是否允许?如果不允许,抱歉!)...
非常感谢您的关注和帮助...对于这个大问题感到抱歉!
源链接[不是那么重要,只是main()]:http://ifile.it/n73drmc/ b-tree_file.c
标题链接[所有函数都在这里]:http://ifile.it/u1fa3kp/b-tree_file.h
带有字符串的文本文件:http://ifile.it/7hu95ot/arq_5.txt
关于代码的注释:
i) free() 的行为非常奇怪......所以我对它们进行了评论,因为如果我使用它们,我的程序错误更多!
ii)所有代码注释都是我尝试解决问题的想法,并且必须被认为是我已经尝试过。
iii) 我的母语是葡萄牙语,因此对于以英语为母语的人来说,函数和变量可能有奇怪的名称......对此深表歉意!
守则:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
//defs of pre-compiler
#ifdef __linux
#define PAUSA "read p"
#define CLRSCR "clear"
#else
#define PAUSA "Pause"
#define CLRSCR "cls"
#endif
#define T 5 // minimum degree of B-tree
#define Min (T-1)
#define Max ((2*T)-1)
//
//global vars
FILE *arqt, *arqb;
char VAL[11];
long int lt = 0;
typedef struct {
unsigned int num;
long int pos;
char chave[(Max+1)][11]; //chave in portuguese = key in english!
short int folha; //folha in portuguese = leaf in english!
long int c[(Max+2)];
} Nod;
Nod *raiz = NULL; //raiz in portuguese = root in english!
//
void b_split(Nod *x, unsigned int i, Nod *y) { //B-TREE-SPLIT-CHILD //that function split a B-tree node
Nod *z = NULL;
unsigned int j=1;
z = (Nod*)realloc(z,sizeof(Nod));
fseek(arqb,0,SEEK_END);
z->pos = ftell(arqb);
z->folha = y->folha;
z->num = Min;
for(j=1;j<=Min;j++) {strcpy(z->chave[j],y->chave[j+T]);}
if (y->folha == 0) {
for(j=1;j<=T;j++) {z->c[j] = y->c[j+T];}
}
y->num = Min;
for(j=(x->num + 1);j<=(i+1);j--) {x->c[(j+1)] = x->c[j];}
x->c[(i+1)] = z->pos;
for(j=x->num;j<=i;j--) { strcpy(x->chave[j+1],x->chave[j]); }
strcpy(x->chave[i],y->chave[T]);
x->num = x->num + 1;
fseek(arqb,x->pos,SEEK_SET);
fwrite(x,sizeof(Nod),1,arqb);
fseek(arqb,y->pos,SEEK_SET);
fwrite(y,sizeof(Nod),1,arqb);
fseek(arqb,z->pos,SEEK_SET);
fwrite(z,sizeof(Nod),1,arqb);
//free(z);
//free(y);
}
void b_ins(Nod *x, char *val) { //B-TREE-INSERT-NONFULL //insert a key in nonfull node
unsigned int i=0;
Nod *C = NULL;
i = x->num;
if (x->folha == 1) {
while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {
strcpy(x->chave[(i+1)],x->chave[i]);
i--;
}
strcpy(x->chave[(i+1)],val);
x->num = x->num + 1;
fseek(arqb,x->pos,SEEK_SET);
fwrite(x,sizeof(Nod),1,arqb);
}
else {
while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {i--;}
i++;
C = (Nod*)realloc(C,sizeof(Nod));
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
if (C->num == Max) {
b_split(x,i,C);
if ( strcmp(val,x->chave[i]) > 0 ) {i++;}
}
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
b_ins(C,val);
//free(C);
}
}
void insere_b(char *val) { //B-TREE-INSERT //i believe the problem is here!
Nod *S = NULL,*R = NULL;
R = (Nod*)realloc(R,sizeof(Nod));
R = raiz;
if (R->num == Max) {
S = (Nod*)realloc(S,sizeof(Nod));
/* fseek(arqb,0,SEEK_END);
S->pos = ftell(arqb);
*/
raiz = S;
S->folha = 0;
S->num = 0;
S->c[1] = R->pos;
/* fseek(arqb,S->pos,SEEK_SET);
fwrite(S,sizeof(Nod),1,arqb);
*/
b_split(S,1,R);
b_ins(S,val);
//free(S);
}
else {b_ins(R,val);}
//free(R);
}
void busca_b(Nod *x, char *val) { //B-TREE-SEARCH //self explanatory
unsigned int i=1;
Nod *C = NULL;
while( (i <= x->num) && ( strcmp(val, x->chave[i]) > 0 ) ) {i++;}
if ( (i <= x->num) && ( strcmp(val, x->chave[i]) == 0 ) ) {printf ("\nValor encontrado!\n");}
else {
if (x->folha == 1) {printf ("\nValor NAO encontrado!\n");}
else {
C = (Nod*)realloc(C,sizeof(Nod));
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
lt++;
busca_b(C,val);
//free(C);
}
}
}
void cria_b() { // cria arvore B //create the B-tree
long int i = 0;
char V[11];
raiz->folha = 1;
raiz->num = 0;
raiz->pos = 0;
for (i=1;i<=(Max+1);i++) {raiz->c[i] = -1;}
fseek(arqb,raiz->pos,SEEK_SET);
fwrite(raiz,sizeof(Nod),1,arqb);
rewind(arqt);
for (i=0;i<fSize(arqt);i++) {
fscanf(arqt,"%s\n",V);
insere_b(V);
}
}
i'm new here and first of all, i wanna apologize if i make errors in question. My problem is: i want to implement a B-tree in C, using a file to store the tree...my program reads 10000 strings of 10 characters each from a Text file, and store that content in a .DAT binary file, organized via B-tree; then the user can search for a string.
I'm using algorithms of "Cormen,et al - Introduction to Algorithms (3ed)", which seems to be correct, clear and functional. BUT, my program just get runtime errors...like Segmentation Fault and Infinite Loop. I've been trying to debug for 5 days, but with no success! The B-tree functions are recursive, which i personally hate...i think exactly the recursion makes the debug so difficult!
My code is relatively big, and it's been divided in 2 files, one source, one header. I'll just post here the functions of B-tree and the variables declarations. But, if someone wants to see full code, i'll post an "iFile.it" link (is it allowed? Sorry if not!)...
Thanks very much for attention and help...sorry for the big question!
Source link [not so important,just main()] : http://ifile.it/n73drmc/b-tree_file.c
Header link [all functions are here]: http://ifile.it/u1fa3kp/b-tree_file.h
Text File with strings: http://ifile.it/7hu95ot/arq_5.txt
NOTES about code:
i) The free() are with very strange behaviour...so i've commented them, because if i use them, my program bugs even more!
ii) All code comments are my ideas to try to solve the problems, and must be considered that i've tried them.
iii) I'm a native portuguese speaker, so functions and variables could have strange names for native english speakers...sorry for that !
THE CODE:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
//defs of pre-compiler
#ifdef __linux
#define PAUSA "read p"
#define CLRSCR "clear"
#else
#define PAUSA "Pause"
#define CLRSCR "cls"
#endif
#define T 5 // minimum degree of B-tree
#define Min (T-1)
#define Max ((2*T)-1)
//
//global vars
FILE *arqt, *arqb;
char VAL[11];
long int lt = 0;
typedef struct {
unsigned int num;
long int pos;
char chave[(Max+1)][11]; //chave in portuguese = key in english!
short int folha; //folha in portuguese = leaf in english!
long int c[(Max+2)];
} Nod;
Nod *raiz = NULL; //raiz in portuguese = root in english!
//
void b_split(Nod *x, unsigned int i, Nod *y) { //B-TREE-SPLIT-CHILD //that function split a B-tree node
Nod *z = NULL;
unsigned int j=1;
z = (Nod*)realloc(z,sizeof(Nod));
fseek(arqb,0,SEEK_END);
z->pos = ftell(arqb);
z->folha = y->folha;
z->num = Min;
for(j=1;j<=Min;j++) {strcpy(z->chave[j],y->chave[j+T]);}
if (y->folha == 0) {
for(j=1;j<=T;j++) {z->c[j] = y->c[j+T];}
}
y->num = Min;
for(j=(x->num + 1);j<=(i+1);j--) {x->c[(j+1)] = x->c[j];}
x->c[(i+1)] = z->pos;
for(j=x->num;j<=i;j--) { strcpy(x->chave[j+1],x->chave[j]); }
strcpy(x->chave[i],y->chave[T]);
x->num = x->num + 1;
fseek(arqb,x->pos,SEEK_SET);
fwrite(x,sizeof(Nod),1,arqb);
fseek(arqb,y->pos,SEEK_SET);
fwrite(y,sizeof(Nod),1,arqb);
fseek(arqb,z->pos,SEEK_SET);
fwrite(z,sizeof(Nod),1,arqb);
//free(z);
//free(y);
}
void b_ins(Nod *x, char *val) { //B-TREE-INSERT-NONFULL //insert a key in nonfull node
unsigned int i=0;
Nod *C = NULL;
i = x->num;
if (x->folha == 1) {
while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {
strcpy(x->chave[(i+1)],x->chave[i]);
i--;
}
strcpy(x->chave[(i+1)],val);
x->num = x->num + 1;
fseek(arqb,x->pos,SEEK_SET);
fwrite(x,sizeof(Nod),1,arqb);
}
else {
while ( (i >= 1) && (strcmp(val,x->chave[i]) < 0) ) {i--;}
i++;
C = (Nod*)realloc(C,sizeof(Nod));
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
if (C->num == Max) {
b_split(x,i,C);
if ( strcmp(val,x->chave[i]) > 0 ) {i++;}
}
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
b_ins(C,val);
//free(C);
}
}
void insere_b(char *val) { //B-TREE-INSERT //i believe the problem is here!
Nod *S = NULL,*R = NULL;
R = (Nod*)realloc(R,sizeof(Nod));
R = raiz;
if (R->num == Max) {
S = (Nod*)realloc(S,sizeof(Nod));
/* fseek(arqb,0,SEEK_END);
S->pos = ftell(arqb);
*/
raiz = S;
S->folha = 0;
S->num = 0;
S->c[1] = R->pos;
/* fseek(arqb,S->pos,SEEK_SET);
fwrite(S,sizeof(Nod),1,arqb);
*/
b_split(S,1,R);
b_ins(S,val);
//free(S);
}
else {b_ins(R,val);}
//free(R);
}
void busca_b(Nod *x, char *val) { //B-TREE-SEARCH //self explanatory
unsigned int i=1;
Nod *C = NULL;
while( (i <= x->num) && ( strcmp(val, x->chave[i]) > 0 ) ) {i++;}
if ( (i <= x->num) && ( strcmp(val, x->chave[i]) == 0 ) ) {printf ("\nValor encontrado!\n");}
else {
if (x->folha == 1) {printf ("\nValor NAO encontrado!\n");}
else {
C = (Nod*)realloc(C,sizeof(Nod));
fseek(arqb,x->c[i],SEEK_SET);
fread(C,sizeof(Nod),1,arqb);
lt++;
busca_b(C,val);
//free(C);
}
}
}
void cria_b() { // cria arvore B //create the B-tree
long int i = 0;
char V[11];
raiz->folha = 1;
raiz->num = 0;
raiz->pos = 0;
for (i=1;i<=(Max+1);i++) {raiz->c[i] = -1;}
fseek(arqb,raiz->pos,SEEK_SET);
fwrite(raiz,sizeof(Nod),1,arqb);
rewind(arqt);
for (i=0;i<fSize(arqt);i++) {
fscanf(arqt,"%s\n",V);
insere_b(V);
}
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
该代码有点难以阅读,但这是我所看到的:
您正在执行内存分配并立即丢弃结果。 realloc 的结果可能与原始指针不同。
您可能需要一个函数来初始化您的树节点以使其更清晰 - 我很难遵循代码。
我希望在树中的每个节点中看到指针,但我没有看到任何指针。我没有遵循你的实施。我建议在纸上画出你的树,用箭头显示什么指向什么,并考虑所有极端情况(例如,当树为空时首先插入),因为这些通常是人们可能会陷入困境的地方。还可以使用调试器单步调试代码,看看它的行为是否符合您的预期。
编辑:如果整个树都构建在一个文件中(这似乎是有意的),您可能根本不需要进行任何动态内存分配。
通常,您需要:
某些 C/C++ 编译器可以在运行时执行额外的检查(例如 Visual Studio),这可能有助于查明问题。
另请参阅乔纳森对惯例和风格的好评。
The code is a little hard to read but here's what I see:
You are doing a memory allocation and throwing away the result right away. The result of realloc may be different than the original pointer.
You probably want a function to initialize your tree nodes to make that clearer - it's hard for me to follow the code.
I am expecting to see pointers in each node in the tree but I'm not seeing any. I'm not following your implementation. I would suggest drawing your tree on paper with arrows showing what points to what and consider all corner cases (e.g. first insert when tree is empty) as those are usually what people can get stuck on. Also step through your code with a debugger and see if it behaves the way you expect it to.
EDIT: If the entire tree is built in a file (which staring at this a bit more seems to be the intention) you probably do not need to do any dynamic memory allocation at all.
Very generally you want to:
Some C/C++ compilers can do extra checks during run time (e.g. Visual Studio) which may be useful in pinpointing the problem.
Also look at Jonathan's good comments re conventions and style.
头文件应该只包含结构体、类型定义、枚举、定义和函数声明的声明。您可能认为其中有内联函数定义,但也可能没有。
您的头文件中包含完整的函数 - 不太可能被内联且未声明为内联的函数。这意味着它无法用于其预期目的 - 使用相应 C 文件中定义的代码向文件提供声明。
代码不容易阅读 - 而且问题不在于语言。诸如以下的结构:
不是惯用的 C. 正常的编写方式是:
有编码指南要求所有循环体和条件都用大括号括起来;如果您遇到其中一种情况,那么您可以使用以下几种编码约定之一:
我非常喜欢前者;有很多人更喜欢后者。
这个世界上还有Windows和Linux以外的系统。
这对于我的环境 - MacOS X 来说并不是特别正常。我不太热衷于使用系统语句的程序,但我想这是一个问题,因为我是一个不使用 IDE 的老家伙(尽管有一个我不使用 IDE 的原因之一是因为命令行程序不能在其中正常运行,而且我主要编写命令行程序,因此它们对我的正常工作模式不利)。
一个主要的性能错误是:
每次迭代都会调用 fSize() 函数,该函数会遍历整个文件,确定文件中有多少行,并在返回之前恢复读取位置。目前尚不清楚您是否真的需要计算行数 - 您可能可以简单地边读边读。但是,如果您确实需要计算行数,只需计算一次。
有很多地方使用循环:
这在 C 中通常是不正确的;数组索引从零开始,循环的规范形式是:
从风格上讲,您通常为
#define
或enum
值保留所有大写名称,而不是作为常规变量。在
insere_b()
函数中,您有:其他人指出
R = raiz;
行是可疑的。将 null 分配给 R;然后你realloc()
它。这始终等同于malloc()
。此外,分配的内存未初始化,因此您可以使用随机值。这会导致问题。然后,您使用
S
执行类似的序列(通过realloc()
分配内存),不过这次您随后初始化所分配结构的部分(可能是全部)。这些就足以引起麻烦。
当代码第一次进入
b_split()
时,您会遇到位置值的问题。具体来说,y->pos
为零,但x->pos
也为零,因此程序在相同偏移处存储(写入)两个节点的数据,这很少是幸福的秘诀。由于 x 是根节点(至少在本上下文中 - 第一个分割),因此它应该位于位置 0;y
和z
需要位于不同的位置。y
节点还包含非零键,尽管这并不重要(除了为了整洁目的),因为y->num
值表明它们不是用过的。您也永远不会使用
chave[0]
键值 AFAICT。这就有点浪费了。Header files should only contain declarations of structures, typedefs, enums, defines and function declarations. You might conceivably have inline function definitions in it, but probably not.
Your header file has full functions in it - ones that are unlikely to be inlined and are not declared inline. That means it is unusable for its intended purpose - to provide declarations to files using the code defined in the corresponding C file.
The code is not easy to read - and it isn't the language that is the problem. Constructs such as:
are not idiomatic C. The normal way of writing that would be:
There are coding guidelines that require braces around all loop bodies and conditionals; if you suffer from one of those, then you use one of a couple of coding conventions:
I strongly prefer the former; there are lots of people who prefer the latter.
There are systems other than Windows and Linux in this world.
This doesn't work particularly sanely for my environment - MacOS X. I'm not exactly keen on programs that use system statements, but I guess that is a problem with me being an old fogey who doesn't use and IDE (though one of the reasons I don't use an IDE is because command line programs don't run sanely in them, and I mostly write command line programs, so they are hostile to my normal mode of working).
One major performance bug is:
This calls the
fSize()
function on each iteration, and the function traverses the entire file, determining how many lines are in the file, restoring the read position before returning. It is not clear that you really need to count the number of lines - you can probably simply read the lines as you go. However, if you do need to count the lines, do so just once.There are a number of places where you have loops using:
This is usually incorrect in C; array indexes start at zero and the canonical form for a loop is:
Stylistically, you normally reserve all upper-case names for
#define
orenum
values and not as regular variables.In the
insere_b()
function, you have:Someone else pointed out that the
R = raiz;
line is dubious. You assign null toR
; then yourealloc()
it. This is always equivalent tomalloc()
. Further, the memory allocated is not initialized, so you get random values to play with. This leads to problems.You then go through a similar sequence with
S
(allocating memory viarealloc()
), though this time you subsequently initialize parts (possibly all) of the structure allocated.These are sufficient to cause trouble.
When the code goes into
b_split()
for the first time, you run into a problem with the position values. Specifically,y->pos
is zero, but so isx->pos
, so the program stores (writes) two nodes worth of data at the same offset, which is seldom a recipe for happiness. Sincex
is the root node (at least in this context - the first split), it should be at position 0; bothy
andz
need to be at different positions. They
node also contains non-zeroed keys, though that doesn't matter much (except for purposes of tidiness) as they->num
value indicates they are not used.You also do not ever use the
chave[0]
key value, AFAICT. That is a bit wasteful.