为 Flex 定义了 C 令牌文件?

发布于 2024-11-29 13:29:48 字数 116 浏览 0 评论 0原文

我想将一个 C 文件分割成记号(token),不是为了编译而是为了分析。我觉得这应该非常简单,并尝试在网上查找已经为 flex 定义好全部 C 语法的 tokens.l(或类似)文件,但什么也没找到。我想知道是否有现成的已定义语法可用,或者是不是我的整个思路都错了?

I want to split a C file into tokens, not for compiling but for analyzing. I feel like this should be pretty straight-forward, and tried looking online for a defined tokens.l (or something similar) file for flex with all the C grammar already defined, but couldn't find anything. I was wondering if there are any sort of defined grammars floating around, or if perhaps I'm going about this all wrong?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

青萝楚歌 2024-12-06 13:29:48

是的,至少有一个

编辑:

由于那个文件还有一些情况处理不了,也许值得看看我几年前(手工)编写的一些词法分析代码。它基本上只处理翻译阶段 1、2 和 3。如果定义了 DIGRAPH,它还会启用一段用于转换 C++ 二合字符(digraph)的代码。不过,如果我没记错的话,这一转换在翻译过程中发生得比它本应发生的时机更早,但无论如何你大概都用不到它。另一方面,这段代码并不试图识别所有(甚至接近所有)的记号——它主要是把源代码分成注释、字符字面量、字符串字面量以及其余的几乎所有内容。不过,它确实能处理三字符组(trigraph)、行拼接等。

我想我还应该补充一点:这段代码以转换(文本)模式打开文件,从而把平台的行结束符转换为换行符的工作交给了底层实现。在大多数情况下这可能是正确的做法,但如果你想编写类似交叉编译器的东西,而源文件的行结束序列与本机通常使用的不同,你可能就必须修改这一点。

首先是定义所有这些东西的外部接口的标头:

/* get_src.h */   
#ifndef GET_SRC_INCLUDED
#define GET_SRC_INCLUDED

#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/* This is the size of the largest token we'll attempt to deal with.  If
 * you want to deal with bigger tokens, change this, and recompile
 * get_src.c.  Note that an entire comment is treated as a single token,
 * so long comments could overflow this.  In case of an overflow, the
 * entire comment will be read as a single token, but the part larger
 * than this will not be stored.
 */
#define MAX_TOKEN_SIZE 8192

/* `last_token' will contain the text of the most recently read token (comment,
 * string literal, or character literal).
 */
extern char last_token[];

/* This is the maximum number of characters that can be put back into a
 * file opened with parse_fopen or parse_ffopen.
 */
#define MAX_UNGETS 5

#include <limits.h>
#include <stdio.h>

/* A stdio stream paired with a small stack of pushed-back characters. */
typedef struct {
    /* underlying stdio stream that characters are read from */
    FILE *file;
    /* pushed-back characters, stored by unget_character */
    char peeks[MAX_UNGETS];
    /* number of valid entries in `peeks'; the next read takes
     * peeks[last_peek - 1], so the most recent push-back comes out first
     */
    int last_peek;
} PFILE;

/* Some codes we return to indicate having found various items in the
 * source code.  ERROR is returned to indicate a newline found in the
 * middle of a character or string literal or if a file ends inside a
 * comment, or if a character literal contains more than two characters.
 *
 * Note that this starts at INT_MIN, the most negative number available
 * in an int.  This keeps these symbols from conflicting with any
 * characters read from the file.  However, one of these could
 * theoretically conflict with EOF.  EOF usually -1, and these are far
 * more negative than that.  However, officially EOF can be any value
 * less than 0...
 */
enum {
    /* malformed character/string literal, or a comment cut off by end of file */
    ERROR = INT_MIN,
    /* a comment was read; its text is in `last_token' */
    COMMENT,
    /* a character literal was read; its text is in `last_token' */
    CHAR_LIT,
    /* a string literal was read; its text is in `last_token' */
    STR_LIT
};

/* Opens a file for parsing and returns a pointer to a structure which
 * can be passed to the other functions in the parser/lexer to identify
 * the file being worked with.
 */
PFILE *parse_fopen(char const *name);

/* This corresponds closely to fdopen - it takes a FILE * as its
 * only parameter, creates a PFILE structure identifying that file, and
 * returns a pointer to that structure.
 */
PFILE *parse_ffopen(FILE *stream);

/* Corresponds to fclose.
 */
int parse_fclose(PFILE *stream);

/* returns characters from `stream' read as C source code.  String
 * literals, characters literals and comments are each returned as a
 * single code from those above.  A run of whitespace is returned as a
 * single space character, except that a newline is returned as '\n'.
 */
int get_source(PFILE *stream);

/* Basically, these two work just like the normal versions of the same,
 * with the minor exception that unget_character can unget more than one
 * character.
 */
int get_character(PFILE *stream);
void unget_character(int ch, PFILE *stream);

#ifdef __cplusplus
}
#endif

#endif

然后是所有这些的实现:

/* get_src.c */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>

#define GET_SOURCE
#include "get_src.h"

static size_t current = 0;

char last_token[MAX_TOKEN_SIZE];

PFILE *parse_fopen(char const *name) {
/* Opens the file `name' for reading in text mode and wraps it in a
 * freshly allocated PFILE with an empty push-back buffer.
 *
 * Returns the new PFILE, or NULL if either the allocation or the
 * fopen fails.  The result must be released with parse_fclose.
 */

    PFILE *temp = malloc(sizeof *temp);

    if ( NULL == temp )
        return NULL;

    temp->file = fopen(name, "r");

    /* Never hand back a wrapper around a NULL stream: callers test only
     * the returned pointer, so a failed open has to show up there.
     */
    if ( NULL == temp->file ) {
        free(temp);
        return NULL;
    }

    memset(temp->peeks, 0, sizeof(temp->peeks));
    temp->last_peek = 0;
    return temp;
}

PFILE *parse_ffopen(FILE *file) {
/* Wraps an already-open stdio stream in a newly allocated PFILE whose
 * push-back buffer starts out empty.  Returns NULL when the allocation
 * fails; `file' itself is stored as given, without being checked.
 */

    PFILE *wrapper = malloc(sizeof(PFILE));

    if ( NULL == wrapper )
        return NULL;

    wrapper->last_peek = 0;
    memset(wrapper->peeks, 0, sizeof(wrapper->peeks));
    wrapper->file = file;

    return wrapper;
}

int parse_fclose(PFILE *stream) {
/* Closes the wrapped stdio stream, then releases the PFILE itself.
 * Returns whatever fclose returned.
 */

    FILE *underlying = stream->file;
    int status;

    status = fclose(underlying);
    free(stream);

    return status;
}

static void addchar(int ch) {
/* Appends `ch' to `last_token'.  Once the buffer is full, further
 * characters are dropped and the final slot holds a terminating NUL.
 */

    const size_t last_slot = sizeof(last_token) - 1;

    if ( current < last_slot ) {
        last_token[current] = (char)ch;
        current++;
    }

    /* the buffer has just filled up (or already was full): terminate it */
    if ( current == last_slot )
        last_token[last_slot] = '\0';
}

static void clear(void) {
/* Begins a new token: resets the write position of `last_token' to the
 * start.  The old contents are left in place until overwritten.
 */

    current = 0;
}

static int read_char(PFILE *stream) {
/* Returns the next raw character: the most recently pushed-back one if
 * any are pending, otherwise the next character from the file.
 */

    if ( stream->last_peek <= 0 )
        return fgetc(stream->file);

    stream->last_peek -= 1;
    return stream->peeks[stream->last_peek];
}

void unget_character(int ch, PFILE * stream) {
/* Pushes `ch' back so that it is the next character read from `stream'.
 * If the push-back buffer is already full, the character is silently
 * dropped.
 */

    if ( (size_t)stream->last_peek >= sizeof(stream->peeks) )
        return;

    stream->peeks[stream->last_peek] = (char)ch;
    stream->last_peek += 1;
}

static int check_trigraph(PFILE *stream) {
/* Checks for trigraphs and returns the equivalent character if there
 * is one.  Expects that the leading '?' of the trigraph has already
 * been read before this is called.
 *
 * If the input is not a trigraph, returns '?' and leaves everything
 * after that first '?' in the stream, in its original order.
 */

    int ch;

    if ( '?' != (ch=read_char(stream))) {
        /* End of file is not pushed back; the next read from the file
         * reports it again.
         */
        if ( EOF != ch )
            unget_character(ch, stream);
        return '?';
    }

    ch = read_char(stream);

    switch( ch ) {
        case '(':   return '[';
        case ')':   return ']';
        case '/':   return '\\';
        case '\'':  return '^';
        case '<':   return '{';
        case '>':   return '}';
        case '!':   return '|';
        case '-':   return '~';
        case '=':   return '#';
        default:
            /* Push-backs come out in reverse order, so the character
             * that must be read last goes in first: `ch', then the
             * second '?'.
             */
            if ( EOF != ch )
                unget_character(ch, stream);
            unget_character('?', stream);
            return '?';
    }
}

#ifdef DIGRAPH
static int check_digraph(PFILE *stream, int first) {
/* Tests whether `first' (already read by the caller) and the next raw
 * character form a digraph.  If they do, returns the single character
 * the digraph stands for.  Otherwise pushes the second character back
 * and returns `first' unchanged.
 */

    int second = read_char(stream);

    if ( '<' == first ) {
        if ( '%' == second )
            return '{';
        if ( ':' == second )
            return '[';
    }
    else if ( ':' == first ) {
        if ( '>' == second )
            return ']';
    }
    else if ( '%' == first ) {
        if ( '>' == second )
            return '}';
        if ( ':' == second )
            return '#';
    }

    /* not a digraph: the two characters are delivered separately */
    unget_character(second, stream);
    return first;
}
#endif


static int get_char(PFILE *stream) {
/* Reads one character with trigraphs replaced by the character they
 * stand for.  When DIGRAPH is defined, digraphs are replaced as well,
 * which is earlier in translation than is really correct (and is not
 * something C should do at this point at all).
 */
    int ch = read_char(stream);

#ifdef DIGRAPH
    switch ( ch ) {
        case '<':
        case ':':
        case '%':
            return check_digraph(stream, ch);
        default:
            break;
    }
#endif

    return ( '?' == ch ) ? check_trigraph(stream) : ch;
}

int get_character(PFILE *stream) {
/* Gets a character from `stream' after trigraph replacement and line
 * splicing.
 *
 * - A backslash immediately followed by a newline is removed, so the
 *   two physical lines are joined with nothing inserted between them.
 * - A backslash followed by anything else is returned as '\\', and the
 *   following character is left in the stream.
 * - A run of whitespace that contains no newline is returned as a
 *   single space.
 * - A newline, with or without whitespace before it, is returned as
 *   '\n'.
 *
 * Returns EOF at end of input.
 */
    int ch = get_char(stream);

    /* handle line-splicing; loop so consecutive splices are all removed */
    while ( '\\' == ch ) {
        int next = get_char(stream);

        if ( '\n' != next ) {
            /* An ordinary backslash: hand it to the caller and keep the
             * character after it for the next call.  EOF is not pushed
             * back; the file reports it again.
             */
            if ( EOF != next )
                unget_character(next, stream);
            return '\\';
        }
        ch = get_char(stream);
    }

    if ( EOF == ch || !isspace((unsigned char)ch) )
        return ch;

    /* It's white-space: skip over the rest of the run, up to a newline */
    while ( EOF != ch && isspace((unsigned char)ch) && '\n' != ch )
        ch = get_char(stream);

    if ( '\n' == ch )
        return ch;

    /* Then put the non-ws character back */
    if ( EOF != ch )
        unget_character(ch, stream);

    /* and return a single space character... */
    return ' ';
}

static int read_char_lit(PFILE *stream) {
/* This is used internally by `get_source' (below) - it expects the
 * opening quote of a character literal to have already been read.
 *
 * Reads through the closing quote and stores the literal's text,
 * quotes included, in `last_token'.  Returns CHAR_LIT, or ERROR if:
 *   - a newline or end of file comes before the closing quote, or
 *   - the literal holds more than two characters, where an escape
 *     sequence (including all digits of an octal or hexadecimal
 *     escape) counts as one character.
 * `last_token' is NUL-terminated on every return path.
 */

    int ch;
    int count = 0;          /* characters in the literal, escapes as one */
    int hex_run = 0;        /* non-zero while inside the digits of \x... */
    int octal_left = 0;     /* further octal digits the current escape may take */

    clear();
    addchar('\'');

    while ( '\'' != ( ch = read_char(stream))) {

        if ( EOF == ch || '\n' == ch ) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        /* Digits that continue a numeric escape belong to the character
         * already counted.
         */
        if ( hex_run && isxdigit((unsigned char)ch) )
            continue;
        if ( octal_left > 0 && ch >= '0' && ch <= '7' ) {
            octal_left--;
            continue;
        }
        hex_run = 0;
        octal_left = 0;

        count++;

        if ( ch == '\\' ) {
            ch = get_char(stream);
            if ( EOF == ch ) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);

            if ( 'x' == ch )
                hex_run = 1;
            else if ( ch >= '0' && ch <= '7' )
                octal_left = 2;     /* an octal escape has at most three digits */
        }
    }
    addchar('\'');
    addchar('\0');

    if ( count > 2 )
        return ERROR;

    return CHAR_LIT;
}

static int read_str_lit(PFILE *stream) {
/* Used internally by get_source.  Expects the opening quote of a string
 * literal to have already been read.  Stores the literal's text, quotes
 * included, in `last_token'.
 *
 * Returns STR_LIT, or ERROR if an un-escaped newline or end of file is
 * found before the close quote.  `last_token' is NUL-terminated on
 * every return path, so on ERROR it holds the part read so far.
 */

    int ch;

    clear();
    addchar('"');

    while ( '"' != ( ch = get_char(stream))) {

        if ( '\n' == ch || EOF == ch ) {
            /* terminate here so callers don't see the tail of an
             * earlier, longer token
             */
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        /* the character after a backslash is taken as-is, so an escaped
         * quote does not end the literal
         */
        if( ch == '\\' ) {
            ch = read_char(stream);
            if ( EOF == ch ) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);
        }

    }

    addchar('"');
    addchar('\0');

    return STR_LIT;
}

static int read_comment(PFILE *stream) {
/* Reads a comment from `stream' into `last_token'.  Assumes the leading
 * '/' has already been read.
 *
 * Returns:
 *   COMMENT - a single-line comment (ended by a newline or by end of
 *             file; the newline is consumed and not stored) or a
 *             complete block comment was read.
 *   '/'     - the '/' did not start a comment; the character after it
 *             is left in the stream.
 *   ERROR   - end of file was reached inside a block comment.
 * `last_token' is NUL-terminated whenever COMMENT or ERROR is returned.
 */
    int ch;
    int prev;

    clear();

    ch = get_char(stream);

    /* Handle a single line comment.
     */
    if ('/' == ch) {
        addchar('/');
        addchar('/');

        /* Stop at end of file too: a last line with no newline would
         * otherwise never end the loop.
         */
        while ( '\n' != ( ch = get_char(stream)) && EOF != ch )
            addchar(ch);

        addchar('\0');
        return COMMENT;
    }

    if ('*' != ch ) {
        if ( EOF != ch )
            unget_character(ch, stream);
        return '/';
    }

    addchar('/');
    addchar('*');

    /* Remember the previous character so the terminator is found even
     * after a run of stars, as in a comment ending with two stars and a
     * slash.  `prev' starts at 0 so the opening star cannot pair with
     * an immediately following slash.
     */
    prev = 0;
    for (;;) {
        ch = get_char(stream);

        if (EOF == ch) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        if ('*' == prev && '/' == ch)
            break;

        prev = ch;
    }

    addchar('\0');

    return COMMENT;
}

int get_source(PFILE *stream) {
/* Returns the next "item" from the stream.  A quote or slash is passed
 * to the matching literal/comment reader and that reader's result is
 * returned.  Any other character from get_character is returned as-is.
 */

    int ch = get_character(stream);

    if ( '\'' == ch )
        return read_char_lit(stream);

    if ( '"' == ch )
        return read_str_lit(stream);

    if ( '/' == ch )
        return read_comment(stream);

    return ch;
}

#ifdef TEST

int main(int argc, char **argv)  {
/* Test driver: reads the file named on the command line and echoes it.
 * Ordinary characters are printed as they are; for every negative item
 * code the contents of `last_token' are printed on a line of their own.
 */
    PFILE *input;
    int item;

    if (2 != argc) {
        fprintf(stderr, "Usage: get_src <filename>\n");
        return EXIT_FAILURE;
    }

    input = parse_fopen(argv[1]);
    if (NULL == input) {
        fprintf(stderr, "Unable to open: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    for (item = get_source(input); EOF != item; item = get_source(input)) {
        if (item >= 0)
            printf("%c", item);
        else
            printf("\n%s\n", last_token);
    }

    parse_fclose(input);
    return 0;
}

#endif

不过,我不确定把它集成到基于 Flex 的词法分析器中会有多容易/困难——我似乎记得 Flex 有某种钩子可以用来定义它读取字符的方式,但我从未尝试使用过,所以无法对此多说什么(说到底,我甚至不能较有把握地说它确实存在)。

Yes, there's at least one around.

Edit:

Since there are a few issues that one doesn't handle, perhaps it's worth looking at some (hand written) lexing code I wrote several years ago. This basically only handles phases 1, 2 and 3 of translation. If you define DIGRAPH, it also turns on some code to translate C++ digraphs. If memory serves, however, it's doing that earlier in translation than it should really happen, but you probably don't want it in any case. OTOH, this does not even attempt to recognize anywhere close to all tokens -- mostly it separates the source into comments, character literals, string literals, and pretty much everything else. OTOH, it does handle trigraphs, line splicing, etc.

I suppose I should also add that this leaves conversion of the platform's line-ending character into a new-line to the underlying implementation by opening the file in translated (text) mode. Under most circumstances, that's probably the right thing to do, but if you want to produce something like a cross-compiler where your source files have a different line-ending sequence than is normal for this host, you might have to change that.

First the header that defines the external interface to all this stuff:

/* get_src.h */   
#ifndef GET_SRC_INCLUDED
#define GET_SRC_INCLUDED

#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/* This is the size of the largest token we'll attempt to deal with.  If
 * you want to deal with bigger tokens, change this, and recompile
 * get_src.c.  Note that an entire comment is treated as a single token,
 * so long comments could overflow this.  In case of an overflow, the
 * entire comment will be read as a single token, but the part larger
 * than this will not be stored.
 */
#define MAX_TOKEN_SIZE 8192

/* `last_token' will contain the text of the most recently read token (comment,
 * string literal, or character literal).
 */
extern char last_token[];

/* This is the maximum number of characters that can be put back into a
 * file opened with parse_fopen or parse_ffopen.
 */
#define MAX_UNGETS 5

#include <limits.h>
#include <stdio.h>

/* A stdio stream paired with a small stack of pushed-back characters. */
typedef struct {
    /* underlying stdio stream that characters are read from */
    FILE *file;
    /* pushed-back characters, stored by unget_character */
    char peeks[MAX_UNGETS];
    /* number of valid entries in `peeks'; the next read takes
     * peeks[last_peek - 1], so the most recent push-back comes out first
     */
    int last_peek;
} PFILE;

/* Some codes we return to indicate having found various items in the
 * source code.  ERROR is returned to indicate a newline found in the
 * middle of a character or string literal or if a file ends inside a
 * comment, or if a character literal contains more than two characters.
 *
 * Note that this starts at INT_MIN, the most negative number available
 * in an int.  This keeps these symbols from conflicting with any
 * characters read from the file.  However, one of these could
 * theoretically conflict with EOF.  EOF usually -1, and these are far
 * more negative than that.  However, officially EOF can be any value
 * less than 0...
 */
enum {
    /* malformed character/string literal, or a comment cut off by end of file */
    ERROR = INT_MIN,
    /* a comment was read; its text is in `last_token' */
    COMMENT,
    /* a character literal was read; its text is in `last_token' */
    CHAR_LIT,
    /* a string literal was read; its text is in `last_token' */
    STR_LIT
};

/* Opens a file for parsing and returns a pointer to a structure which
 * can be passed to the other functions in the parser/lexer to identify
 * the file being worked with.
 */
PFILE *parse_fopen(char const *name);

/* This corresponds closely to fdopen - it takes a FILE * as its
 * only parameter, creates a PFILE structure identifying that file, and
 * returns a pointer to that structure.
 */
PFILE *parse_ffopen(FILE *stream);

/* Corresponds to fclose.
 */
int parse_fclose(PFILE *stream);

/* returns characters from `stream' read as C source code.  String
 * literals, characters literals and comments are each returned as a
 * single code from those above.  A run of whitespace is returned as a
 * single space character, except that a newline is returned as '\n'.
 */
int get_source(PFILE *stream);

/* Basically, these two work just like the normal versions of the same,
 * with the minor exception that unget_character can unget more than one
 * character.
 */
int get_character(PFILE *stream);
void unget_character(int ch, PFILE *stream);

#ifdef __cplusplus
}
#endif

#endif

And then the implementation of all that:

/* get_src.c */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>

#define GET_SOURCE
#include "get_src.h"

static size_t current = 0;

char last_token[MAX_TOKEN_SIZE];

PFILE *parse_fopen(char const *name) {
/* Opens the file `name' for reading in text mode and wraps it in a
 * freshly allocated PFILE with an empty push-back buffer.
 *
 * Returns the new PFILE, or NULL if either the allocation or the
 * fopen fails.  The result must be released with parse_fclose.
 */

    PFILE *temp = malloc(sizeof *temp);

    if ( NULL == temp )
        return NULL;

    temp->file = fopen(name, "r");

    /* Never hand back a wrapper around a NULL stream: callers test only
     * the returned pointer, so a failed open has to show up there.
     */
    if ( NULL == temp->file ) {
        free(temp);
        return NULL;
    }

    memset(temp->peeks, 0, sizeof(temp->peeks));
    temp->last_peek = 0;
    return temp;
}

PFILE *parse_ffopen(FILE *file) {
/* Wraps an already-open stdio stream in a newly allocated PFILE whose
 * push-back buffer starts out empty.  Returns NULL when the allocation
 * fails; `file' itself is stored as given, without being checked.
 */

    PFILE *wrapper = malloc(sizeof(PFILE));

    if ( NULL == wrapper )
        return NULL;

    wrapper->last_peek = 0;
    memset(wrapper->peeks, 0, sizeof(wrapper->peeks));
    wrapper->file = file;

    return wrapper;
}

int parse_fclose(PFILE *stream) {
/* Closes the wrapped stdio stream, then releases the PFILE itself.
 * Returns whatever fclose returned.
 */

    FILE *underlying = stream->file;
    int status;

    status = fclose(underlying);
    free(stream);

    return status;
}

static void addchar(int ch) {
/* Appends `ch' to `last_token'.  Once the buffer is full, further
 * characters are dropped and the final slot holds a terminating NUL.
 */

    const size_t last_slot = sizeof(last_token) - 1;

    if ( current < last_slot ) {
        last_token[current] = (char)ch;
        current++;
    }

    /* the buffer has just filled up (or already was full): terminate it */
    if ( current == last_slot )
        last_token[last_slot] = '\0';
}

static void clear(void) {
/* Begins a new token: resets the write position of `last_token' to the
 * start.  The old contents are left in place until overwritten.
 */

    current = 0;
}

static int read_char(PFILE *stream) {
/* Returns the next raw character: the most recently pushed-back one if
 * any are pending, otherwise the next character from the file.
 */

    if ( stream->last_peek <= 0 )
        return fgetc(stream->file);

    stream->last_peek -= 1;
    return stream->peeks[stream->last_peek];
}

void unget_character(int ch, PFILE * stream) {
/* Pushes `ch' back so that it is the next character read from `stream'.
 * If the push-back buffer is already full, the character is silently
 * dropped.
 */

    if ( (size_t)stream->last_peek >= sizeof(stream->peeks) )
        return;

    stream->peeks[stream->last_peek] = (char)ch;
    stream->last_peek += 1;
}

static int check_trigraph(PFILE *stream) {
/* Checks for trigraphs and returns the equivalent character if there
 * is one.  Expects that the leading '?' of the trigraph has already
 * been read before this is called.
 *
 * If the input is not a trigraph, returns '?' and leaves everything
 * after that first '?' in the stream, in its original order.
 */

    int ch;

    if ( '?' != (ch=read_char(stream))) {
        /* End of file is not pushed back; the next read from the file
         * reports it again.
         */
        if ( EOF != ch )
            unget_character(ch, stream);
        return '?';
    }

    ch = read_char(stream);

    switch( ch ) {
        case '(':   return '[';
        case ')':   return ']';
        case '/':   return '\\';
        case '\'':  return '^';
        case '<':   return '{';
        case '>':   return '}';
        case '!':   return '|';
        case '-':   return '~';
        case '=':   return '#';
        default:
            /* Push-backs come out in reverse order, so the character
             * that must be read last goes in first: `ch', then the
             * second '?'.
             */
            if ( EOF != ch )
                unget_character(ch, stream);
            unget_character('?', stream);
            return '?';
    }
}

#ifdef DIGRAPH
static int check_digraph(PFILE *stream, int first) {
/* Tests whether `first' (already read by the caller) and the next raw
 * character form a digraph.  If they do, returns the single character
 * the digraph stands for.  Otherwise pushes the second character back
 * and returns `first' unchanged.
 */

    int second = read_char(stream);

    if ( '<' == first ) {
        if ( '%' == second )
            return '{';
        if ( ':' == second )
            return '[';
    }
    else if ( ':' == first ) {
        if ( '>' == second )
            return ']';
    }
    else if ( '%' == first ) {
        if ( '>' == second )
            return '}';
        if ( ':' == second )
            return '#';
    }

    /* not a digraph: the two characters are delivered separately */
    unget_character(second, stream);
    return first;
}
#endif


static int get_char(PFILE *stream) {
/* Reads one character with trigraphs replaced by the character they
 * stand for.  When DIGRAPH is defined, digraphs are replaced as well,
 * which is earlier in translation than is really correct (and is not
 * something C should do at this point at all).
 */
    int ch = read_char(stream);

#ifdef DIGRAPH
    switch ( ch ) {
        case '<':
        case ':':
        case '%':
            return check_digraph(stream, ch);
        default:
            break;
    }
#endif

    return ( '?' == ch ) ? check_trigraph(stream) : ch;
}

int get_character(PFILE *stream) {
/* Gets a character from `stream' after trigraph replacement and line
 * splicing.
 *
 * - A backslash immediately followed by a newline is removed, so the
 *   two physical lines are joined with nothing inserted between them.
 * - A backslash followed by anything else is returned as '\\', and the
 *   following character is left in the stream.
 * - A run of whitespace that contains no newline is returned as a
 *   single space.
 * - A newline, with or without whitespace before it, is returned as
 *   '\n'.
 *
 * Returns EOF at end of input.
 */
    int ch = get_char(stream);

    /* handle line-splicing; loop so consecutive splices are all removed */
    while ( '\\' == ch ) {
        int next = get_char(stream);

        if ( '\n' != next ) {
            /* An ordinary backslash: hand it to the caller and keep the
             * character after it for the next call.  EOF is not pushed
             * back; the file reports it again.
             */
            if ( EOF != next )
                unget_character(next, stream);
            return '\\';
        }
        ch = get_char(stream);
    }

    if ( EOF == ch || !isspace((unsigned char)ch) )
        return ch;

    /* It's white-space: skip over the rest of the run, up to a newline */
    while ( EOF != ch && isspace((unsigned char)ch) && '\n' != ch )
        ch = get_char(stream);

    if ( '\n' == ch )
        return ch;

    /* Then put the non-ws character back */
    if ( EOF != ch )
        unget_character(ch, stream);

    /* and return a single space character... */
    return ' ';
}

static int read_char_lit(PFILE *stream) {
/* This is used internally by `get_source' (below) - it expects the
 * opening quote of a character literal to have already been read.
 *
 * Reads through the closing quote and stores the literal's text,
 * quotes included, in `last_token'.  Returns CHAR_LIT, or ERROR if:
 *   - a newline or end of file comes before the closing quote, or
 *   - the literal holds more than two characters, where an escape
 *     sequence (including all digits of an octal or hexadecimal
 *     escape) counts as one character.
 * `last_token' is NUL-terminated on every return path.
 */

    int ch;
    int count = 0;          /* characters in the literal, escapes as one */
    int hex_run = 0;        /* non-zero while inside the digits of \x... */
    int octal_left = 0;     /* further octal digits the current escape may take */

    clear();
    addchar('\'');

    while ( '\'' != ( ch = read_char(stream))) {

        if ( EOF == ch || '\n' == ch ) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        /* Digits that continue a numeric escape belong to the character
         * already counted.
         */
        if ( hex_run && isxdigit((unsigned char)ch) )
            continue;
        if ( octal_left > 0 && ch >= '0' && ch <= '7' ) {
            octal_left--;
            continue;
        }
        hex_run = 0;
        octal_left = 0;

        count++;

        if ( ch == '\\' ) {
            ch = get_char(stream);
            if ( EOF == ch ) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);

            if ( 'x' == ch )
                hex_run = 1;
            else if ( ch >= '0' && ch <= '7' )
                octal_left = 2;     /* an octal escape has at most three digits */
        }
    }
    addchar('\'');
    addchar('\0');

    if ( count > 2 )
        return ERROR;

    return CHAR_LIT;
}

static int read_str_lit(PFILE *stream) {
/* Used internally by get_source.  Expects the opening quote of a string
 * literal to have already been read.  Stores the literal's text, quotes
 * included, in `last_token'.
 *
 * Returns STR_LIT, or ERROR if an un-escaped newline or end of file is
 * found before the close quote.  `last_token' is NUL-terminated on
 * every return path, so on ERROR it holds the part read so far.
 */

    int ch;

    clear();
    addchar('"');

    while ( '"' != ( ch = get_char(stream))) {

        if ( '\n' == ch || EOF == ch ) {
            /* terminate here so callers don't see the tail of an
             * earlier, longer token
             */
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        /* the character after a backslash is taken as-is, so an escaped
         * quote does not end the literal
         */
        if( ch == '\\' ) {
            ch = read_char(stream);
            if ( EOF == ch ) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);
        }

    }

    addchar('"');
    addchar('\0');

    return STR_LIT;
}

static int read_comment(PFILE *stream) {
/* Reads a comment from `stream' into `last_token'.  Assumes the leading
 * '/' has already been read.
 *
 * Returns:
 *   COMMENT - a single-line comment (ended by a newline or by end of
 *             file; the newline is consumed and not stored) or a
 *             complete block comment was read.
 *   '/'     - the '/' did not start a comment; the character after it
 *             is left in the stream.
 *   ERROR   - end of file was reached inside a block comment.
 * `last_token' is NUL-terminated whenever COMMENT or ERROR is returned.
 */
    int ch;
    int prev;

    clear();

    ch = get_char(stream);

    /* Handle a single line comment.
     */
    if ('/' == ch) {
        addchar('/');
        addchar('/');

        /* Stop at end of file too: a last line with no newline would
         * otherwise never end the loop.
         */
        while ( '\n' != ( ch = get_char(stream)) && EOF != ch )
            addchar(ch);

        addchar('\0');
        return COMMENT;
    }

    if ('*' != ch ) {
        if ( EOF != ch )
            unget_character(ch, stream);
        return '/';
    }

    addchar('/');
    addchar('*');

    /* Remember the previous character so the terminator is found even
     * after a run of stars, as in a comment ending with two stars and a
     * slash.  `prev' starts at 0 so the opening star cannot pair with
     * an immediately following slash.
     */
    prev = 0;
    for (;;) {
        ch = get_char(stream);

        if (EOF == ch) {
            addchar('\0');
            return ERROR;
        }

        addchar(ch);

        if ('*' == prev && '/' == ch)
            break;

        prev = ch;
    }

    addchar('\0');

    return COMMENT;
}

int get_source(PFILE *stream) {
/* Returns the next "item" from the stream.  A quote or slash is passed
 * to the matching literal/comment reader and that reader's result is
 * returned.  Any other character from get_character is returned as-is.
 */

    int ch = get_character(stream);

    if ( '\'' == ch )
        return read_char_lit(stream);

    if ( '"' == ch )
        return read_str_lit(stream);

    if ( '/' == ch )
        return read_comment(stream);

    return ch;
}

#ifdef TEST

int main(int argc, char **argv)  {
/* Test driver: reads the file named on the command line and echoes it.
 * Ordinary characters are printed as they are; for every negative item
 * code the contents of `last_token' are printed on a line of their own.
 */
    PFILE *input;
    int item;

    if (2 != argc) {
        fprintf(stderr, "Usage: get_src <filename>\n");
        return EXIT_FAILURE;
    }

    input = parse_fopen(argv[1]);
    if (NULL == input) {
        fprintf(stderr, "Unable to open: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    for (item = get_source(input); EOF != item; item = get_source(input)) {
        if (item >= 0)
            printf("%c", item);
        else
            printf("\n%s\n", last_token);
    }

    parse_fclose(input);
    return 0;
}

#endif

I'm not sure about how easy/difficult it would/will be to integrate that into a Flex-based lexer though -- I seem to recall Flex has some sort of hook to define what it uses to read a character, but I've never tried to use it, so I can't say much more about it (and ultimately, can't even say with anything approaching certainty that it even exists).

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文