c/c90/统计大型文本文件中的单词数

发布于 2025-01-31 11:58:14 字数 1325 浏览 5 评论 0原文

我有一个文本文件，其中大约包含 30000 个单词。我的目标是统计其中单词的实际数量（请注意，文件中含有多个连续的标点符号和连续的空格，以及用 - 连接的单词（例如 three-legged），因此仅统计空格的个数是不正确的）。

我已经设法统计出了字符总数，但在统计单词数时遇到了困难。有人能帮帮我吗？

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SIZE 50

char *getfile(void);
void stats(char *filename);

/*
 * Program entry point: ask the user for a file name, print the character
 * statistics for that file, then release the name buffer.
 *
 * Returns 0 on the normal path and EXIT_FAILURE when no name buffer was
 * obtained.
 */
int main(void) {
    char *file;

    file = getfile();
    if (file == NULL) {
        /* Without a name there is nothing to hand to stats(). */
        return EXIT_FAILURE;
    }

    stats(file);

    /* getfile() allocates the name with malloc(); main owns it from here. */
    free(file);
    return 0;
}

/*
 * Prompt on stdout for a file name, read it from stdin, and report whether
 * a file of that name can be opened for reading.
 *
 * Returns a malloc()-allocated, NUL-terminated name of at most SIZE - 1
 * characters; the caller owns it and must free() it. The name is returned
 * even when the file could not be opened. If no name could be read from
 * stdin the returned string is empty. The program is terminated with
 * EXIT_FAILURE if the buffer cannot be allocated.
 */
char *getfile(void) {
    char *filename;
    FILE *fp;

    filename = malloc(SIZE);
    if (filename == NULL) {
        /* No buffer means there is nothing valid to read into or return. */
        fprintf(stderr, "Out of memory.\n");
        exit(EXIT_FAILURE);
    }

    printf("Enter the name of the text file: ");
    /* The width 49 leaves room for the terminator in the 50-byte buffer. */
    if (scanf("%49s", filename) != 1) {
        /* EOF or read error: scanf stored nothing, so make the buffer a
         * valid empty string instead of passing garbage to fopen(). */
        filename[0] = '\0';
    }

    fp = fopen(filename, "r");
    printf("\n");

    if (fp == NULL) {
        printf("The entered file does not exist.");
        printf("\n");
    } else {
        printf("The file exists.");
        /* The stream was opened only to probe for the file. */
        fclose(fp);
    }

    return filename;
}

/*
 * Print character statistics for the text file named `filename` on stdout:
 * the total number of characters read, and that total minus the number of
 * ' ' characters.
 *
 * If `filename` is NULL or the file cannot be opened, a diagnostic is
 * written to stderr and nothing is counted or printed to stdout.
 */
void stats(char *filename) {
    int cnt = 0, space = 0;
    int c;
    FILE *fp;

    if (filename == NULL) {
        fprintf(stderr, "stats: no file name given\n");
        return;
    }

    fp = fopen(filename, "r");
    if (fp == NULL) {
        /* Reading from a null stream is undefined behaviour, so stop here. */
        perror(filename);
        return;
    }

    /* c is an int so that EOF stays distinguishable from every character. */
    while ((c = fgetc(fp)) != EOF) {
        cnt++;

        if (c == ' ') {
            space++;
        }
    }

    printf("\nTotal characters in file: %d", cnt);
    printf("\nTotal characters (excluding spaces) in file: %d", cnt - space);

    fclose(fp);
}

原文

I have a text file which consists of about 30000 words. My goal is to count the actual number of the words (keep in mind that multiple punctuation marks and consecutive spaces are included, as well as words connected with - (for example three-legged), so counting just the spaces isn't correct).

I have managed to count the total characters but I am struggling with the words.
Any help?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SIZE 50

char *getfile(void);
void stats(char *filename);

/*
 * Program entry point: ask the user for a file name, print the character
 * statistics for that file, then release the name buffer.
 *
 * Returns 0 on the normal path and EXIT_FAILURE when no name buffer was
 * obtained.
 */
int main(void) {
    char *file;

    file = getfile();
    if (file == NULL) {
        /* Without a name there is nothing to hand to stats(). */
        return EXIT_FAILURE;
    }

    stats(file);

    /* getfile() allocates the name with malloc(); main owns it from here. */
    free(file);
    return 0;
}

/*
 * Prompt on stdout for a file name, read it from stdin, and report whether
 * a file of that name can be opened for reading.
 *
 * Returns a malloc()-allocated, NUL-terminated name of at most SIZE - 1
 * characters; the caller owns it and must free() it. The name is returned
 * even when the file could not be opened. If no name could be read from
 * stdin the returned string is empty. The program is terminated with
 * EXIT_FAILURE if the buffer cannot be allocated.
 */
char *getfile(void) {
    char *filename;
    FILE *fp;

    filename = malloc(SIZE);
    if (filename == NULL) {
        /* No buffer means there is nothing valid to read into or return. */
        fprintf(stderr, "Out of memory.\n");
        exit(EXIT_FAILURE);
    }

    printf("Enter the name of the text file: ");
    /* The width 49 leaves room for the terminator in the 50-byte buffer. */
    if (scanf("%49s", filename) != 1) {
        /* EOF or read error: scanf stored nothing, so make the buffer a
         * valid empty string instead of passing garbage to fopen(). */
        filename[0] = '\0';
    }

    fp = fopen(filename, "r");
    printf("\n");

    if (fp == NULL) {
        printf("The entered file does not exist.");
        printf("\n");
    } else {
        printf("The file exists.");
        /* The stream was opened only to probe for the file. */
        fclose(fp);
    }

    return filename;
}

/*
 * Print character statistics for the text file named `filename` on stdout:
 * the total number of characters read, and that total minus the number of
 * ' ' characters.
 *
 * If `filename` is NULL or the file cannot be opened, a diagnostic is
 * written to stderr and nothing is counted or printed to stdout.
 */
void stats(char *filename) {
    int cnt = 0, space = 0;
    int c;
    FILE *fp;

    if (filename == NULL) {
        fprintf(stderr, "stats: no file name given\n");
        return;
    }

    fp = fopen(filename, "r");
    if (fp == NULL) {
        /* Reading from a null stream is undefined behaviour, so stop here. */
        perror(filename);
        return;
    }

    /* c is an int so that EOF stays distinguishable from every character. */
    while ((c = fgetc(fp)) != EOF) {
        cnt++;

        if (c == ' ') {
            space++;
        }
    }

    printf("\nTotal characters in file: %d", cnt);
    printf("\nTotal characters (excluding spaces) in file: %d", cnt - space);

    fclose(fp);
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

煮茶煮酒煮时光 2025-02-07 11:58:14

您应该列出所有可以分隔单词的字符，并统计由这些分隔字符组成的每一段连续序列。

回复收藏 0 原文

悲凉≈ 2025-02-07 11:58:14

您遇到麻烦的原因是您没有状态（state），也就是用来对之前出现的内容进行分类的上下文。您可以使用其他方法将文件拆分为单词，但状态机（state-machine）简单而快速。正如评论和其他答案中所建议的那样，您需要两个状态：前一个字符是空白字符，以及前一个字符是单词字符。这有点像一位（one-bit）导数：把上升沿（从空白字符变为单词字符）作为您要计数的对象。

去掉大部分无关的内容后，状态机大致可以这样实现。

#include <stdio.h>

/*
 * Minimal wc-style filter: reads all of stdin in fixed-size chunks and
 * prints character, line and word totals to stdout.
 *
 * A word is counted at each transition from a separator ('\n', '\0', ' ',
 * '\f', '\r', '\t', '\v') to any other byte. The reported character total
 * is the number of bytes read minus the number of '\n' bytes.
 * Returns 0 on success, or 1 after reporting a read error through perror().
 */
int main(void) {
    unsigned char chunk[16384];
    int in_word = 0; /* Non-zero while the previous byte belonged to a word. */
    size_t total = 0, newlines = 0, word_total = 0, got;
    const unsigned char *p, *end;

    for(;;) {
        got = fread(chunk, 1, sizeof chunk, stdin);
        if(ferror(stdin)) { perror("wc"); return 1; }
        total += got;
        end = chunk + got;
        for(p = chunk; p != end; p++) {
            /* A newline is both a line terminator and a separator. */
            if(*p == '\n') newlines++;
            if(*p == '\n' || *p == '\0' || *p == ' ' || *p == '\f'
                || *p == '\r' || *p == '\t' || *p == '\v') {
                in_word = 0;
            } else if(!in_word) {
                /* Separator-to-word edge: a new word starts here. */
                in_word = 1;
                word_total++;
            }
        }
        /* A short read means end of input (errors were handled above). */
        if(got != sizeof chunk) break;
    }
    printf("Total characters in file: %lu\n", (unsigned long)(total - newlines));
    printf("Total lines in file: %lu\n", (unsigned long)newlines);
    printf("Total words in file: %lu\n", (unsigned long)word_total);
    return 0;
}

为简洁起见，我把一部分工作交给了宿主环境（./wc < file.txt），并且使用了一个缓冲区。

The reason you are having trouble is you have no state. That is, classifying context about what came before. You can use other methods to break the file into words, but a state-machine is simple and fast. As suggested in the comments and by other answers, you need two states, a white-space came before, and a word character came before. It's sort of like a one-bit derivative, with the rising edge, white-space to word, as the thing you count.

Stripping off most of the extraneous stuff, this might be how you do a state machine.

#include <stdio.h>

/*
 * Minimal wc-style filter: reads all of stdin in fixed-size chunks and
 * prints character, line and word totals to stdout.
 *
 * A word is counted at each transition from a separator ('\n', '\0', ' ',
 * '\f', '\r', '\t', '\v') to any other byte. The reported character total
 * is the number of bytes read minus the number of '\n' bytes.
 * Returns 0 on success, or 1 after reporting a read error through perror().
 */
int main(void) {
    unsigned char chunk[16384];
    int in_word = 0; /* Non-zero while the previous byte belonged to a word. */
    size_t total = 0, newlines = 0, word_total = 0, got;
    const unsigned char *p, *end;

    for(;;) {
        got = fread(chunk, 1, sizeof chunk, stdin);
        if(ferror(stdin)) { perror("wc"); return 1; }
        total += got;
        end = chunk + got;
        for(p = chunk; p != end; p++) {
            /* A newline is both a line terminator and a separator. */
            if(*p == '\n') newlines++;
            if(*p == '\n' || *p == '\0' || *p == ' ' || *p == '\f'
                || *p == '\r' || *p == '\t' || *p == '\v') {
                in_word = 0;
            } else if(!in_word) {
                /* Separator-to-word edge: a new word starts here. */
                in_word = 1;
                word_total++;
            }
        }
        /* A short read means end of input (errors were handled above). */
        if(got != sizeof chunk) break;
    }
    printf("Total characters in file: %lu\n", (unsigned long)(total - newlines));
    printf("Total lines in file: %lu\n", (unsigned long)newlines);
    printf("Total words in file: %lu\n", (unsigned long)word_total);
    return 0;
}

I off-loaded some work on the hosted-environment for brevity, ./wc < file.txt and I used a buffer.

回复收藏 0 原文

~没有更多了~