C 中使用分隔符分割字符串

发布于 2025-01-04 12:37:06 字数 151 浏览 1 评论 0 原文

如何在 C 编程语言中编写一个函数来分割并返回带有分隔符的字符串数组?

char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');

How do I write a function to split and return an array for a string with delimiters in the C programming language?

char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(25

娇妻 2025-01-11 12:37:06

您可以使用 strtok() 函数来分割字符串(并指定要使用的分隔符)。请注意,strtok() 将修改传递给它的字符串。如果其他地方需要原始字符串,请复制它并将该副本传递给 strtok()

编辑:

示例(注意它不处理连续的分隔符,例如“JAN,,,FEB,MAR”):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

char** str_split(char* a_str, const char a_delim)
{
    char** result    = 0;
    size_t count     = 0;
    char* tmp        = a_str;
    char* last_comma = 0;
    char delim[2];
    delim[0] = a_delim;
    delim[1] = 0;

    /* Count how many elements will be extracted. */
    while (*tmp)
    {
        if (a_delim == *tmp)
        {
            count++;
            last_comma = tmp;
        }
        tmp++;
    }

    /* Add space for trailing token. */
    count += last_comma < (a_str + strlen(a_str) - 1);

    /* Add space for terminating null string so caller
       knows where the list of returned strings ends. */
    count++;

    result = malloc(sizeof(char*) * count);

    if (result)
    {
        size_t idx  = 0;
        char* token = strtok(a_str, delim);

        while (token)
        {
            assert(idx < count);
            *(result + idx++) = strdup(token);
            token = strtok(0, delim);
        }
        assert(idx == count - 1);
        *(result + idx) = 0;
    }

    return result;
}

int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** tokens;

    printf("months=[%s]\n\n", months);

    tokens = str_split(months, ',');

    if (tokens)
    {
        int i;
        for (i = 0; *(tokens + i); i++)
        {
            printf("month=[%s]\n", *(tokens + i));
            free(*(tokens + i));
        }
        printf("\n");
        free(tokens);
    }

    return 0;
}

输出:

$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]

month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]

You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().

EDIT:

Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

char** str_split(char* a_str, const char a_delim)
{
    char** result    = 0;
    size_t count     = 0;
    char* tmp        = a_str;
    char* last_comma = 0;
    char delim[2];
    delim[0] = a_delim;
    delim[1] = 0;

    /* Count how many elements will be extracted. */
    while (*tmp)
    {
        if (a_delim == *tmp)
        {
            count++;
            last_comma = tmp;
        }
        tmp++;
    }

    /* Add space for trailing token. */
    count += last_comma < (a_str + strlen(a_str) - 1);

    /* Add space for terminating null string so caller
       knows where the list of returned strings ends. */
    count++;

    result = malloc(sizeof(char*) * count);

    if (result)
    {
        size_t idx  = 0;
        char* token = strtok(a_str, delim);

        while (token)
        {
            assert(idx < count);
            *(result + idx++) = strdup(token);
            token = strtok(0, delim);
        }
        assert(idx == count - 1);
        *(result + idx) = 0;
    }

    return result;
}

int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** tokens;

    printf("months=[%s]\n\n", months);

    tokens = str_split(months, ',');

    if (tokens)
    {
        int i;
        for (i = 0; *(tokens + i); i++)
        {
            printf("month=[%s]\n", *(tokens + i));
            free(*(tokens + i));
        }
        printf("\n");
        free(tokens);
    }

    return 0;
}

Output:

$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]

month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
伤痕我心 2025-01-11 12:37:06

我认为 strsep 仍然是最好的工具:

while ((token = strsep(&str, ","))) my_fn(token);

这实际上是分割字符串的一行。

额外的括号是一个风格元素,表明我们有意测试赋值的结果,而不是相等运算符 ==

为了使该模式发挥作用,tokenstr 都具有 char * 类型。如果您从字符串文字开始,那么您需要先复制它:

// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;

tofree = str = strdup(my_str_literal);  // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);

如果两个分隔符同时出现在 str 中,您将获得一个 token 值那是空字符串。 str 的值被修改,因为遇到的每个分隔符都被零字节覆盖 - 这是首先复制要解析的字符串的另一个好理由。

在评论中,有人建议 strtokstrsep 更好,因为 strtok 更便携。 Ubuntu 和 Mac OS X 有 strsep;可以肯定地猜测其他 unixy 系统也这样做。 Windows 缺少 strsep,但它有 strbrk,它可以实现这个简短而甜蜜的 strsep 替换:

char *strsep(char **stringp, const char *delim) {
  if (*stringp == NULL) { return NULL; }
  char *token_start = *stringp;
  *stringp = strpbrk(token_start, delim);
  if (*stringp) {
    **stringp = '\0';
    (*stringp)++;
  }
  return token_start;
}

这里很好地解释了strsepstrtok。可以主观地判断利弊;但是,我认为这是一个明显的迹象,表明 strsep 被设计为 strtok 的替代品。

I think strsep is still the best tool for this:

while ((token = strsep(&str, ","))) my_fn(token);

That is literally one line that splits a string.

The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.

For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:

// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;

tofree = str = strdup(my_str_literal);  // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);

If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.

In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strbrk which enables this short and sweet strsep replacement:

char *strsep(char **stringp, const char *delim) {
  if (*stringp == NULL) { return NULL; }
  char *token_start = *stringp;
  *stringp = strpbrk(token_start, delim);
  if (*stringp) {
    **stringp = '\0';
    (*stringp)++;
  }
  return token_start;
}

Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.

粉红×色少女 2025-01-11 12:37:06

字符串分词器这段代码应该会让你走向正确的方向。

int main(void) {
  char st[] ="Where there is will, there is a way.";
  char *ch;
  ch = strtok(st, " ");
  while (ch != NULL) {
    printf("%s\n", ch);
    ch = strtok(NULL, " ,");
  }
  getch();
  return 0;
}

String tokenizer this code should put you in the right direction.

int main(void) {
  char st[] ="Where there is will, there is a way.";
  char *ch;
  ch = strtok(st, " ");
  while (ch != NULL) {
    printf("%s\n", ch);
    ch = strtok(NULL, " ,");
  }
  getch();
  return 0;
}
怀里藏娇 2025-01-11 12:37:06

这是我的两分钱:

int split (const char *txt, char delim, char ***tokens)
{
    int *tklen, *t, count = 1;
    char **arr, *p = (char *) txt;

    while (*p != '\0') if (*p++ == delim) count += 1;
    t = tklen = calloc (count, sizeof (int));
    for (p = (char *) txt; *p != '\0'; p++) *p == delim ? *t++ : (*t)++;
    *tokens = arr = malloc (count * sizeof (char *));
    t = tklen;
    p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
    while (*txt != '\0')
    {
        if (*txt == delim)
        {
            p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
            txt++;
        }
        else *p++ = *txt++;
    }
    free (tklen);
    return count;
}

用法:

char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);

/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);

Here is my two cents:

int split (const char *txt, char delim, char ***tokens)
{
    int *tklen, *t, count = 1;
    char **arr, *p = (char *) txt;

    while (*p != '\0') if (*p++ == delim) count += 1;
    t = tklen = calloc (count, sizeof (int));
    for (p = (char *) txt; *p != '\0'; p++) *p == delim ? *t++ : (*t)++;
    *tokens = arr = malloc (count * sizeof (char *));
    t = tklen;
    p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
    while (*txt != '\0')
    {
        if (*txt == delim)
        {
            p = *arr++ = calloc (*(t++) + 1, sizeof (char *));
            txt++;
        }
        else *p++ = *txt++;
    }
    free (tklen);
    return count;
}

Usage:

char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);

/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
沙沙粒小 2025-01-11 12:37:06

下面的方法将为您完成所有工作(内存分配、计算长度)。更多信息和描述可以在这里找到 - 实现Java String.split()方法分割C字符串

int split (const char *str, char c, char ***arr)
{
    int count = 1;
    int token_len = 1;
    int i = 0;
    char *p;
    char *t;

    p = str;
    while (*p != '\0')
    {
        if (*p == c)
            count++;
        p++;
    }

    *arr = (char**) malloc(sizeof(char*) * count);
    if (*arr == NULL)
        exit(1);

    p = str;
    while (*p != '\0')
    {
        if (*p == c)
        {
            (*arr)[i] = (char*) malloc( sizeof(char) * token_len );
            if ((*arr)[i] == NULL)
                exit(1);

            token_len = 0;
            i++;
        }
        p++;
        token_len++;
    }
    (*arr)[i] = (char*) malloc( sizeof(char) * token_len );
    if ((*arr)[i] == NULL)
        exit(1);

    i = 0;
    p = str;
    t = ((*arr)[i]);
    while (*p != '\0')
    {
        if (*p != c && *p != '\0')
        {
            *t = *p;
            t++;
        }
        else
        {
            *t = '\0';
            i++;
            t = ((*arr)[i]);
        }
        p++;
    }

    return count;
}

使用方法:

int main (int argc, char ** argv)
{
    int i;
    char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    c = split(s, ' ', &arr);

    printf("found %d tokens.\n", c);

    for (i = 0; i < c; i++)
        printf("string #%d: %s\n", i, arr[i]);

    return 0;
}

Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string

int split (const char *str, char c, char ***arr)
{
    int count = 1;
    int token_len = 1;
    int i = 0;
    char *p;
    char *t;

    p = str;
    while (*p != '\0')
    {
        if (*p == c)
            count++;
        p++;
    }

    *arr = (char**) malloc(sizeof(char*) * count);
    if (*arr == NULL)
        exit(1);

    p = str;
    while (*p != '\0')
    {
        if (*p == c)
        {
            (*arr)[i] = (char*) malloc( sizeof(char) * token_len );
            if ((*arr)[i] == NULL)
                exit(1);

            token_len = 0;
            i++;
        }
        p++;
        token_len++;
    }
    (*arr)[i] = (char*) malloc( sizeof(char) * token_len );
    if ((*arr)[i] == NULL)
        exit(1);

    i = 0;
    p = str;
    t = ((*arr)[i]);
    while (*p != '\0')
    {
        if (*p != c && *p != '\0')
        {
            *t = *p;
            t++;
        }
        else
        {
            *t = '\0';
            i++;
            t = ((*arr)[i]);
        }
        p++;
    }

    return count;
}

How to use it:

int main (int argc, char ** argv)
{
    int i;
    char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    c = split(s, ' ', &arr);

    printf("found %d tokens.\n", c);

    for (i = 0; i < c; i++)
        printf("string #%d: %s\n", i, arr[i]);

    return 0;
}
故事↓在人 2025-01-11 12:37:06

在上面的示例中,有一种方法可以在字符串中返回一个以空结尾的字符串数组(如您想要的)。但它无法传递文字字符串,因为它必须由函数修改:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

char** str_split( char* str, char delim, int* numSplits )
{
    char** ret;
    int retLen;
    char* c;

    if ( ( str == NULL ) ||
        ( delim == '\0' ) )
    {
        /* Either of those will cause problems */
        ret = NULL;
        retLen = -1;
    }
    else
    {
        retLen = 0;
        c = str;

        /* Pre-calculate number of elements */
        do
        {
            if ( *c == delim )
            {
                retLen++;
            }

            c++;
        } while ( *c != '\0' );

        ret = malloc( ( retLen + 1 ) * sizeof( *ret ) );
        ret[retLen] = NULL;

        c = str;
        retLen = 1;
        ret[0] = str;

        do
        {
            if ( *c == delim )
            {
                ret[retLen++] = &c[1];
                *c = '\0';
            }

            c++;
        } while ( *c != '\0' );
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}

int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    char* strCpy;
    char** split;
    int num;
    int i;

    strCpy = malloc( strlen( str ) * sizeof( *strCpy ) );
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );

    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );

        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    free( split );
    free( strCpy );

    return 0;
}

可能有一种更简洁的方法来做到这一点,但你明白了。

In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

char** str_split( char* str, char delim, int* numSplits )
{
    char** ret;
    int retLen;
    char* c;

    if ( ( str == NULL ) ||
        ( delim == '\0' ) )
    {
        /* Either of those will cause problems */
        ret = NULL;
        retLen = -1;
    }
    else
    {
        retLen = 0;
        c = str;

        /* Pre-calculate number of elements */
        do
        {
            if ( *c == delim )
            {
                retLen++;
            }

            c++;
        } while ( *c != '\0' );

        ret = malloc( ( retLen + 1 ) * sizeof( *ret ) );
        ret[retLen] = NULL;

        c = str;
        retLen = 1;
        ret[0] = str;

        do
        {
            if ( *c == delim )
            {
                ret[retLen++] = &c[1];
                *c = '\0';
            }

            c++;
        } while ( *c != '\0' );
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}

int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    char* strCpy;
    char** split;
    int num;
    int i;

    strCpy = malloc( strlen( str ) * sizeof( *strCpy ) );
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );

    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );

        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    free( split );
    free( strCpy );

    return 0;
}

There is probably a neater way to do it, but you get the idea.

童话 2025-01-11 12:37:06
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>

/**
 *  splits str on delim and dynamically allocates an array of pointers.
 *
 *  On error -1 is returned, check errno
 *  On success size of array is returned, which may be 0 on an empty string
 *  or 1 if no delim was found.  
 *
 *  You could rewrite this to return the char ** array instead and upon NULL
 *  know it's an allocation problem but I did the triple array here.  Note that
 *  upon the hitting two delim's in a row "foo,,bar" the array would be:
 *  { "foo", NULL, "bar" } 
 * 
 *  You need to define the semantics of a trailing delim Like "foo," is that a
 *  2 count array or an array of one?  I choose the two count with the second entry
 *  set to NULL since it's valueless.
 *  Modifies str so make a copy if this is a problem
 */
int split( char * str, char delim, char ***array, int *length ) {
  char *p;
  char **res;
  int count=0;
  int k=0;

  p = str;
  // Count occurance of delim in string
  while( (p=strchr(p,delim)) != NULL ) {
    *p = 0; // Null terminate the deliminator.
    p++; // Skip past our new null
    count++;
  }

  // allocate dynamic array
  res = calloc( 1, count * sizeof(char *));
  if( !res ) return -1;

  p = str;
  for( k=0; k<count; k++ ){
    if( *p ) res[k] = p;  // Copy start of string
    p = strchr(p, 0 );    // Look for next null
    p++; // Start of next string
  }

  *array = res;
  *length = count;

  return 0;
}

char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

int main() {
  char **res;
  int k=0;
  int count =0;
  int rc;

  rc = split( str, ',', &res, &count );
  if( rc ) {
    printf("Error: %s errno: %d \n", strerror(errno), errno);
  }

  printf("count: %d\n", count );
  for( k=0; k<count; k++ ) {
    printf("str: %s\n", res[k]);
  }

  free(res );
  return 0;
}
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>

/**
 *  splits str on delim and dynamically allocates an array of pointers.
 *
 *  On error -1 is returned, check errno
 *  On success size of array is returned, which may be 0 on an empty string
 *  or 1 if no delim was found.  
 *
 *  You could rewrite this to return the char ** array instead and upon NULL
 *  know it's an allocation problem but I did the triple array here.  Note that
 *  upon the hitting two delim's in a row "foo,,bar" the array would be:
 *  { "foo", NULL, "bar" } 
 * 
 *  You need to define the semantics of a trailing delim Like "foo," is that a
 *  2 count array or an array of one?  I choose the two count with the second entry
 *  set to NULL since it's valueless.
 *  Modifies str so make a copy if this is a problem
 */
int split( char * str, char delim, char ***array, int *length ) {
  char *p;
  char **res;
  int count=0;
  int k=0;

  p = str;
  // Count occurance of delim in string
  while( (p=strchr(p,delim)) != NULL ) {
    *p = 0; // Null terminate the deliminator.
    p++; // Skip past our new null
    count++;
  }

  // allocate dynamic array
  res = calloc( 1, count * sizeof(char *));
  if( !res ) return -1;

  p = str;
  for( k=0; k<count; k++ ){
    if( *p ) res[k] = p;  // Copy start of string
    p = strchr(p, 0 );    // Look for next null
    p++; // Start of next string
  }

  *array = res;
  *length = count;

  return 0;
}

char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

int main() {
  char **res;
  int k=0;
  int count =0;
  int rc;

  rc = split( str, ',', &res, &count );
  if( rc ) {
    printf("Error: %s errno: %d \n", strerror(errno), errno);
  }

  printf("count: %d\n", count );
  for( k=0; k<count; k++ ) {
    printf("str: %s\n", res[k]);
  }

  free(res );
  return 0;
}
红尘作伴 2025-01-11 12:37:06

我认为以下解决方案是理想的:

  • 不会破坏源字符串
  • 可重入 - 即,您可以从一个或多个线程中的任何位置安全地调用它
  • 可移植正确
  • 处理多个分隔符
  • 快速高效

代码说明:

  1. 定义一个结构 < code>token 存储令牌的地址和长度
  2. 在最坏的情况下为这些分配足够的内存,即当
    str 完全由分隔符组成,因此有 strlen(str) + 1
    标记,全部为空字符串
  3. 扫描str记录每个标记的地址和长度
  4. 使用它来分配正确大小的输出数组,包括用于NULL的额外空间哨兵值
  5. 使用开始和长度分配、复制和添加标记
    信息 - 使用 memcpy 因为它比 strcpy 更快,我们知道
    长度
  6. 释放令牌地址和长度数组
  7. 返回令牌数组
typedef struct {
    const char *start;
    size_t len;
} token;

char **split(const char *str, char sep)
{
    char **array;
    unsigned int start = 0, stop, toks = 0, t;
    token *tokens = malloc((strlen(str) + 1) * sizeof(token));
    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;
    array = malloc((toks + 1) * sizeof(char*));
    for (t = 0; t < toks; t++) {
        /* Calloc makes it nul-terminated */
        char *token = calloc(tokens[t].len + 1, 1);
        memcpy(token, tokens[t].start, tokens[t].len);
        array[t] = token;
    }
    /* Add a sentinel */
    array[t] = NULL; 
    free(tokens);
    return array;
}

注意 为了简洁起见,省略了 malloc 检查。

一般来说,我不会从像这样的 split 函数返回 char * 指针数组,因为它给调用者带来了很多正确释放它们的责任。我更喜欢的接口是允许调用者传递回调函数并为每个令牌调用该函数,正如我在这里所描述的: 在 C 中分割字符串

I think the following solution is ideal:

  • Doesn't destroy the source string
  • Re-entrant - i.e., you can safely call it from anywhere in one or more threads
  • Portable
  • Handles multiple separators correctly
  • Fast and efficient

Explanation of the code:

  1. Define a structure token to store the address and lengths of the tokens
  2. Allocate enough memory for these in the worst case, which is when
    str is made up entirely of separators so there are strlen(str) + 1
    tokens, all of them empty strings
  3. Scan str recording the address and length of every token
  4. Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
  5. Allocate, copy, and add the tokens using the start and length
    information - use memcpy as it's faster than strcpy and we know
    the lengths
  6. Free the token address and length array
  7. Return the array of tokens
typedef struct {
    const char *start;
    size_t len;
} token;

char **split(const char *str, char sep)
{
    char **array;
    unsigned int start = 0, stop, toks = 0, t;
    token *tokens = malloc((strlen(str) + 1) * sizeof(token));
    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;
    array = malloc((toks + 1) * sizeof(char*));
    for (t = 0; t < toks; t++) {
        /* Calloc makes it nul-terminated */
        char *token = calloc(tokens[t].len + 1, 1);
        memcpy(token, tokens[t].start, tokens[t].len);
        array[t] = token;
    }
    /* Add a sentinel */
    array[t] = NULL; 
    free(tokens);
    return array;
}

Note malloc checking omitted for brevity.

In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.

谎言月老 2025-01-11 12:37:06

此优化方法在 *result 中创建(或更新现有)指针数组,并返回 *count 中的元素数量。

使用“max”来指示您期望的最大字符串数(当您指定现有数组或任何其他原因时),否则将其设置为 0

要与分隔符列表进行比较,请将 delim 定义为 char* 并替换该行:

if (str[i]==delim) {

包含以下两行:

 char *c=delim; while(*c && *c!=str[i]) c++;
 if (*c) {

Enjoy

#include <stdlib.h>
#include <string.h>

char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
  size_t i;
  char **_result;

  // there is at least one string returned
  *count=1;

  _result= *result;

  // when the result array is specified, fill it during the first pass
  if (_result) {
    _result[0]=str;
  }

  // scan the string for delimiter, up to specified length
  for (i=0; i<len; ++i) {

    // to compare against a list of delimiters,
    // define delim as a string and replace 
    // the next line:
    //     if (str[i]==delim) {
    //
    // with the two following lines:
    //     char *c=delim; while(*c && *c!=str[i]) c++;
    //     if (*c) {
    //       
    if (str[i]==delim) {

      // replace delimiter with zero
      str[i]=0;

      // when result array is specified, fill it during the first pass
      if (_result) {
        _result[*count]=str+i+1;
      }

      // increment count for each separator found
      ++(*count);

      // if max is specified, dont go further
      if (max && *count==max)  {
        break;
      }

    }
  }

  // when result array is specified, we are done here
  if (_result) {
    return _result;
  }

  // else allocate memory for result
  // and fill the result array                                                                                    

  *result=malloc((*count)*sizeof(char*));
  if (!*result) {
    return NULL;
  }
  _result=*result;

  // add first string to result
  _result[0]=str;

  // if theres more strings
  for (i=1; i<*count; ++i) {

    // find next string
    while(*str) ++str;
    ++str;

    // add next string to result
    _result[i]=str;

  }

  return _result;
}  

用法示例:

#include <stdio.h>

int main(int argc, char **argv) {
  char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  char **result=malloc(6*sizeof(char*));
  char **result2=0;
  unsigned long count;
  unsigned long count2;
  unsigned long i;

  split(strdup(str),strlen(str),',',&result,&count,6);
  split(strdup(str),strlen(str),',',&result2,&count2,0);

  if (result)
  for (i=0; i<count; ++i) {
    printf("%s\n",result[i]);
  }

  printf("\n");

  if (result2)
  for (i=0; i<count2; ++i) {
    printf("%s\n", result2[i]);
  }

  return 0;

}

This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.

Use "max" to indicate the maximum number of strings you expect (when you specify an existing array or any other reaseon), else set it to 0

To compare against a list of delimiters, define delim as a char* and replace the line:

if (str[i]==delim) {

with the two following lines:

 char *c=delim; while(*c && *c!=str[i]) c++;
 if (*c) {

Enjoy

#include <stdlib.h>
#include <string.h>

char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
  size_t i;
  char **_result;

  // there is at least one string returned
  *count=1;

  _result= *result;

  // when the result array is specified, fill it during the first pass
  if (_result) {
    _result[0]=str;
  }

  // scan the string for delimiter, up to specified length
  for (i=0; i<len; ++i) {

    // to compare against a list of delimiters,
    // define delim as a string and replace 
    // the next line:
    //     if (str[i]==delim) {
    //
    // with the two following lines:
    //     char *c=delim; while(*c && *c!=str[i]) c++;
    //     if (*c) {
    //       
    if (str[i]==delim) {

      // replace delimiter with zero
      str[i]=0;

      // when result array is specified, fill it during the first pass
      if (_result) {
        _result[*count]=str+i+1;
      }

      // increment count for each separator found
      ++(*count);

      // if max is specified, dont go further
      if (max && *count==max)  {
        break;
      }

    }
  }

  // when result array is specified, we are done here
  if (_result) {
    return _result;
  }

  // else allocate memory for result
  // and fill the result array                                                                                    

  *result=malloc((*count)*sizeof(char*));
  if (!*result) {
    return NULL;
  }
  _result=*result;

  // add first string to result
  _result[0]=str;

  // if theres more strings
  for (i=1; i<*count; ++i) {

    // find next string
    while(*str) ++str;
    ++str;

    // add next string to result
    _result[i]=str;

  }

  return _result;
}  

Usage example:

#include <stdio.h>

int main(int argc, char **argv) {
  char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  char **result=malloc(6*sizeof(char*));
  char **result2=0;
  unsigned long count;
  unsigned long count2;
  unsigned long i;

  split(strdup(str),strlen(str),',',&result,&count,6);
  split(strdup(str),strlen(str),',',&result2,&count2,0);

  if (result)
  for (i=0; i<count; ++i) {
    printf("%s\n",result[i]);
  }

  printf("\n");

  if (result2)
  for (i=0; i<count2; ++i) {
    printf("%s\n", result2[i]);
  }

  return 0;

}
╰沐子 2025-01-11 12:37:06

我的版本:

int split(char* str, const char delimeter, char*** args) {
    int cnt = 1;
    char* t = str;

    while (*t == delimeter) t++;

    char* t2 = t;
    while (*(t2++))
        if (*t2 == delimeter && *(t2 + 1) != delimeter && *(t2 + 1) != 0) cnt++;

    (*args) = malloc(sizeof(char*) * cnt);

    for(int i = 0; i < cnt; i++) {
        char* ts = t;
        while (*t != delimeter && *t != 0) t++;

        int len = (t - ts + 1);
        (*args)[i] = malloc(sizeof(char) * len);
        memcpy((*args)[i], ts, sizeof(char) * (len - 1));
        (*args)[i][len - 1] = 0;

        while (*t == delimeter) t++;
    }

    return cnt;
}

My version:

int split(char* str, const char delimeter, char*** args) {
    int cnt = 1;
    char* t = str;

    while (*t == delimeter) t++;

    char* t2 = t;
    while (*(t2++))
        if (*t2 == delimeter && *(t2 + 1) != delimeter && *(t2 + 1) != 0) cnt++;

    (*args) = malloc(sizeof(char*) * cnt);

    for(int i = 0; i < cnt; i++) {
        char* ts = t;
        while (*t != delimeter && *t != 0) t++;

        int len = (t - ts + 1);
        (*args)[i] = malloc(sizeof(char) * len);
        memcpy((*args)[i], ts, sizeof(char) * (len - 1));
        (*args)[i][len - 1] = 0;

        while (*t == delimeter) t++;
    }

    return cnt;
}
我们只是彼此的过ke 2025-01-11 12:37:06

该函数接受一个 char* 字符串并用分隔符将其分割。连续可以有多个分隔符。请注意,该函数修改了原始字符串。如果您需要原始字符串保持不变,则必须首先复制原始字符串。该函数不使用任何 cstring 函数调用,因此它可能比其他函数快一点。如果你不关心内存分配,你可以在函数顶部分配 sub_strings ,大小为 strlen(src_str)/2 ,并且(就像提到的 c++“版本”)跳过函数的下半部分。如果这样做,函数会减少到 O(N),但下面显示的内存优化方式是 O(2N)。

功能:

char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *src_str_tmp = src_str;
  bool found_delim = true;
  while(*src_str_tmp){
    if(*src_str_tmp == deliminator){
      *src_str_tmp = 0;
      found_delim = true;
    }
    else if(found_delim){ //found first character of a new string
      num_sub_str++;
      found_delim = false;
      //sub_str_vec.push_back(src_str_tmp); //for c++
    }
    src_str_tmp++;
  }
  printf("Start - found %d sub strings\n", num_sub_str);
  if(num_sub_str <= 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //if you want to use a c++ vector and push onto it, the rest of this function
  //can be omitted (obviously modifying input parameters to take a vector, etc)

  char **sub_strings = (char **)malloc( (sizeof(char*) * num_sub_str) + 1);
  const char *src_str_terminator = src_str_tmp;
  src_str_tmp = src_str;
  bool found_null = true;
  size_t idx = 0;
  while(src_str_tmp < src_str_terminator){
    if(!*src_str_tmp) //found a NULL
      found_null = true;
    else if(found_null){
      sub_strings[idx++] = src_str_tmp;
      //printf("sub_string_%d: [%s]\n", idx-1, sub_strings[idx-1]);
      found_null = false;
    }
    src_str_tmp++;
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}

使用方法:

  char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  char *str = strdup(months);
  size_t num_sub_str;
  char **sub_strings = str_split(str, ',', num_sub_str);
  char *endptr;
  if(sub_strings){
    for(int i = 0; sub_strings[i]; i++)
      printf("[%s]\n", sub_strings[i]);
  }
  free(sub_strings);
  free(str);

This function takes a char* string and splits it by the deliminator. There can be multiple deliminators in a row. Note that the function modifies the orignal string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the c++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory optimized way shown below is O(2N).

The function:

char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *src_str_tmp = src_str;
  bool found_delim = true;
  while(*src_str_tmp){
    if(*src_str_tmp == deliminator){
      *src_str_tmp = 0;
      found_delim = true;
    }
    else if(found_delim){ //found first character of a new string
      num_sub_str++;
      found_delim = false;
      //sub_str_vec.push_back(src_str_tmp); //for c++
    }
    src_str_tmp++;
  }
  printf("Start - found %d sub strings\n", num_sub_str);
  if(num_sub_str <= 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //if you want to use a c++ vector and push onto it, the rest of this function
  //can be omitted (obviously modifying input parameters to take a vector, etc)

  char **sub_strings = (char **)malloc( (sizeof(char*) * num_sub_str) + 1);
  const char *src_str_terminator = src_str_tmp;
  src_str_tmp = src_str;
  bool found_null = true;
  size_t idx = 0;
  while(src_str_tmp < src_str_terminator){
    if(!*src_str_tmp) //found a NULL
      found_null = true;
    else if(found_null){
      sub_strings[idx++] = src_str_tmp;
      //printf("sub_string_%d: [%s]\n", idx-1, sub_strings[idx-1]);
      found_null = false;
    }
    src_str_tmp++;
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}

How to use it:

  char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  char *str = strdup(months);
  size_t num_sub_str;
  char **sub_strings = str_split(str, ',', num_sub_str);
  char *endptr;
  if(sub_strings){
    for(int i = 0; sub_strings[i]; i++)
      printf("[%s]\n", sub_strings[i]);
  }
  free(sub_strings);
  free(str);
小兔几 2025-01-11 12:37:06

下面是我从 zString 库 实现的 strtok()
zstring_strtok() 与标准库的 strtok() 的不同之处在于它处理连续分隔符的方式。

只需看一下下面的代码,确保您会了解它是如何工作的(我尝试使用尽可能多的注释)

char *zstring_strtok(char *str, const char *delim) {
    static char *static_str=0;      /* var to store last address */
    int index=0, strlength=0;       /* integers for indexes */
    int found = 0;                  /* check if delim is found */

    /* delimiter cannot be NULL
    * if no more char left, return NULL as well
    */
    if (delim==0 || (str == 0 && static_str == 0))
        return 0;

    if (str == 0)
        str = static_str;

    /* get length of string */
    while(str[strlength])
        strlength++;

    /* find the first occurance of delim */
    for (index=0;index<strlength;index++)
        if (str[index]==delim[0]) {
            found=1;
            break;
        }

    /* if delim is not contained in str, return str */
    if (!found) {
        static_str = 0;
        return str;
    }

    /* check for consecutive delimiters
    *if first char is delim, return delim
    */
    if (str[0]==delim[0]) {
        static_str = (str + 1);
        return (char *)delim;
    }

    /* terminate the string
    * this assignmetn requires char[], so str has to
    * be char[] rather than *char
    */
    str[index] = '\0';

    /* save the rest of the string */
    if ((str + index + 1)!=0)
        static_str = (str + index + 1);
    else
        static_str = 0;

        return str;
}

下面是一个示例用法...

  Example Usage
      char str[] = "A,B,,,C";
      printf("1 %s\n",zstring_strtok(s,","));
      printf("2 %s\n",zstring_strtok(NULL,","));
      printf("3 %s\n",zstring_strtok(NULL,","));
      printf("4 %s\n",zstring_strtok(NULL,","));
      printf("5 %s\n",zstring_strtok(NULL,","));
      printf("6 %s\n",zstring_strtok(NULL,","));

  Example Output
      1 A
      2 B
      3 ,
      4 ,
      5 C
      6 (null)

该库可以从 Github 下载
https://github.com/fnoyanisi/zString

Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.

Just have a look at the code below,sure that you will get an idea about how it works (I tried to use as many comments as I could)

char *zstring_strtok(char *str, const char *delim) {
    static char *static_str=0;      /* var to store last address */
    int index=0, strlength=0;       /* integers for indexes */
    int found = 0;                  /* check if delim is found */

    /* delimiter cannot be NULL
    * if no more char left, return NULL as well
    */
    if (delim==0 || (str == 0 && static_str == 0))
        return 0;

    if (str == 0)
        str = static_str;

    /* get length of string */
    while(str[strlength])
        strlength++;

    /* find the first occurance of delim */
    for (index=0;index<strlength;index++)
        if (str[index]==delim[0]) {
            found=1;
            break;
        }

    /* if delim is not contained in str, return str */
    if (!found) {
        static_str = 0;
        return str;
    }

    /* check for consecutive delimiters
    *if first char is delim, return delim
    */
    if (str[0]==delim[0]) {
        static_str = (str + 1);
        return (char *)delim;
    }

    /* terminate the string
    * this assignmetn requires char[], so str has to
    * be char[] rather than *char
    */
    str[index] = '\0';

    /* save the rest of the string */
    if ((str + index + 1)!=0)
        static_str = (str + index + 1);
    else
        static_str = 0;

        return str;
}

Below is an example usage...

  Example Usage
      char str[] = "A,B,,,C";
      printf("1 %s\n",zstring_strtok(s,","));
      printf("2 %s\n",zstring_strtok(NULL,","));
      printf("3 %s\n",zstring_strtok(NULL,","));
      printf("4 %s\n",zstring_strtok(NULL,","));
      printf("5 %s\n",zstring_strtok(NULL,","));
      printf("6 %s\n",zstring_strtok(NULL,","));

  Example Output
      1 A
      2 B
      3 ,
      4 ,
      5 C
      6 (null)

The library can be downloaded from Github
https://github.com/fnoyanisi/zString

是伱的 2025-01-11 12:37:06

这是一个可以处理多字符分隔符的字符串分割函数。请注意,如果分隔符比要分割的字符串长,则 bufferstringLengths 将设置为 (void *) 0,并且 numStrings 将设置为 0

该算法已经过测试并且有效。 (免责声明:尚未针对非 ASCII 字符串进行测试,并且假设调用者给出了有效参数)

void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const int lo = strlen(original);
    const int ld = strlen(delimiter);
    if(ld > lo){
        *buffer = (void *)0;
        *numStrings = 0;
        *stringLengths = (void *)0;
        return;
    }

    *numStrings = 1;

    for(int i = 0;i < (lo - ld);i++){
        if(strncmp(&original[i], delimiter, ld) == 0) {
            i += (ld - 1);
            (*numStrings)++;
        }
    }

    *stringLengths = (int *) malloc(sizeof(int) * *numStrings);

    int currentStringLength = 0;
    int currentStringNumber = 0;
    int delimiterTokenDecrementCounter = 0;
    for(int i = 0;i < lo;i++){
        if(delimiterTokenDecrementCounter > 0){
            delimiterTokenDecrementCounter--;
        } else if(i < (lo - ld)){
            if(strncmp(&original[i], delimiter, ld) == 0){
                (*stringLengths)[currentStringNumber] = currentStringLength;
                currentStringNumber++;
                currentStringLength = 0;
                delimiterTokenDecrementCounter = ld - 1;
            } else {
                currentStringLength++;
            }
        } else {
            currentStringLength++;
        }

        if(i == (lo - 1)){
            (*stringLengths)[currentStringNumber] = currentStringLength;
        }
    }

    *buffer = (char **) malloc(sizeof(char *) * (*numStrings));
    for(int i = 0;i < *numStrings;i++){
        (*buffer)[i] = (char *) malloc(sizeof(char) * ((*stringLengths)[i] + 1));
    }

    currentStringNumber = 0;
    currentStringLength = 0;
    delimiterTokenDecrementCounter = 0;
    for(int i = 0;i < lo;i++){
        if(delimiterTokenDecrementCounter > 0){
            delimiterTokenDecrementCounter--;
        } else if(currentStringLength >= (*stringLengths)[currentStringNumber]){
            (*buffer)[currentStringNumber][currentStringLength] = 0;
            delimiterTokenDecrementCounter = ld - 1;
            currentStringLength = 0;
            currentStringNumber++;
        } else {
            (*buffer)[currentStringNumber][currentStringLength] = (char)original[i];
            currentStringLength++;
        }
    }
    buffer[currentStringNumber][currentStringLength] = 0;
}

示例代码:

int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer;
    int numStrings;
    int * stringLengths;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
    }
}

库:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.

This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)

void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const int lo = strlen(original);
    const int ld = strlen(delimiter);
    if(ld > lo){
        *buffer = (void *)0;
        *numStrings = 0;
        *stringLengths = (void *)0;
        return;
    }

    *numStrings = 1;

    for(int i = 0;i < (lo - ld);i++){
        if(strncmp(&original[i], delimiter, ld) == 0) {
            i += (ld - 1);
            (*numStrings)++;
        }
    }

    *stringLengths = (int *) malloc(sizeof(int) * *numStrings);

    int currentStringLength = 0;
    int currentStringNumber = 0;
    int delimiterTokenDecrementCounter = 0;
    for(int i = 0;i < lo;i++){
        if(delimiterTokenDecrementCounter > 0){
            delimiterTokenDecrementCounter--;
        } else if(i < (lo - ld)){
            if(strncmp(&original[i], delimiter, ld) == 0){
                (*stringLengths)[currentStringNumber] = currentStringLength;
                currentStringNumber++;
                currentStringLength = 0;
                delimiterTokenDecrementCounter = ld - 1;
            } else {
                currentStringLength++;
            }
        } else {
            currentStringLength++;
        }

        if(i == (lo - 1)){
            (*stringLengths)[currentStringNumber] = currentStringLength;
        }
    }

    *buffer = (char **) malloc(sizeof(char *) * (*numStrings));
    for(int i = 0;i < *numStrings;i++){
        (*buffer)[i] = (char *) malloc(sizeof(char) * ((*stringLengths)[i] + 1));
    }

    currentStringNumber = 0;
    currentStringLength = 0;
    delimiterTokenDecrementCounter = 0;
    for(int i = 0;i < lo;i++){
        if(delimiterTokenDecrementCounter > 0){
            delimiterTokenDecrementCounter--;
        } else if(currentStringLength >= (*stringLengths)[currentStringNumber]){
            (*buffer)[currentStringNumber][currentStringLength] = 0;
            delimiterTokenDecrementCounter = ld - 1;
            currentStringLength = 0;
            currentStringNumber++;
        } else {
            (*buffer)[currentStringNumber][currentStringLength] = (char)original[i];
            currentStringLength++;
        }
    }
    buffer[currentStringNumber][currentStringLength] = 0;
}

Sample code:

int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer;
    int numStrings;
    int * stringLengths;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
    }
}

Libraries:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
傾旎 2025-01-11 12:37:06

爆炸& implode - 初始字符串保持不变,动态内存分配

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

typedef struct
{
    uintptr_t   ptr;
    int         size;
} token_t;

int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i = 0, c1 = 0, c2 = 0;

    for(i = 0; i <= slen; i++)
    {
            if(str[i] == *delimiter)
            {
                c1++;
            }
    }

    if(c1 == 0)
    {
            return -1;
    }

    *tokens = (token_t*)calloc((c1 + 1), sizeof(token_t));
    ((*tokens)[c2]).ptr = (uintptr_t)str;

    i = 0; 
    while(i <= slen)
    {
        if((str[i] == *delimiter) || (i == slen))
        {
                ((*tokens)[c2]).size = (int)((uintptr_t)&(str[i]) - (uintptr_t)(((*tokens)[c2]).ptr));
                if(i < slen)
                {
                    c2++;
                    ((*tokens)[c2]).ptr = (uintptr_t)&(str[i + 1]);
                }
        }
        i++;
    }
    return (c1 + 1);
}

char* implode(token_t *tokens, int size, const char *delimiter)
{
    int     i, len = 0;
    char    *str;

    for(i = 0; i < len; i++)
    {
        len += tokens[i].size + 1;
    }

    str = (char*)calloc(len, sizeof(char));

    len = 0;
    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[len], (void*)tokens[i].ptr, tokens[i].size);
        len += tokens[i].size;
        str[(len++)] = *delimiter;
    }

    str[len - 1] = '\0';

    return str;
}

用法:

int main(int argc, char **argv)
{
    int         i, c;
    char        *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    token_t     *tokens;
    char        *imp;

    printf("%s\n", exp);

    if((c = explode(exp, strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        printf("%s\n", imp);

        for(i = 0; i < c; i++)
        {
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);
    return 0;
}

Explode & implode - initial string remains intact, dynamic memory allocation

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

typedef struct
{
    uintptr_t   ptr;
    int         size;
} token_t;

int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i = 0, c1 = 0, c2 = 0;

    for(i = 0; i <= slen; i++)
    {
            if(str[i] == *delimiter)
            {
                c1++;
            }
    }

    if(c1 == 0)
    {
            return -1;
    }

    *tokens = (token_t*)calloc((c1 + 1), sizeof(token_t));
    ((*tokens)[c2]).ptr = (uintptr_t)str;

    i = 0; 
    while(i <= slen)
    {
        if((str[i] == *delimiter) || (i == slen))
        {
                ((*tokens)[c2]).size = (int)((uintptr_t)&(str[i]) - (uintptr_t)(((*tokens)[c2]).ptr));
                if(i < slen)
                {
                    c2++;
                    ((*tokens)[c2]).ptr = (uintptr_t)&(str[i + 1]);
                }
        }
        i++;
    }
    return (c1 + 1);
}

char* implode(token_t *tokens, int size, const char *delimiter)
{
    int     i, len = 0;
    char    *str;

    for(i = 0; i < len; i++)
    {
        len += tokens[i].size + 1;
    }

    str = (char*)calloc(len, sizeof(char));

    len = 0;
    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[len], (void*)tokens[i].ptr, tokens[i].size);
        len += tokens[i].size;
        str[(len++)] = *delimiter;
    }

    str[len - 1] = '\0';

    return str;
}

Usage:

int main(int argc, char **argv)
{
    int         i, c;
    char        *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    token_t     *tokens;
    char        *imp;

    printf("%s\n", exp);

    if((c = explode(exp, strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        printf("%s\n", imp);

        for(i = 0; i < c; i++)
        {
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);
    return 0;
}
邮友 2025-01-11 12:37:06

如果您愿意使用外部库,我强烈推荐 bstrlib 。它需要一些额外的设置,但从长远来看更容易使用。

例如,拆分下面的字符串,首先使用 bfromcstr() 调用创建一个 bstring。 (bstring 是 char 缓冲区的包装器)。
接下来,用逗号分割字符串,将结果保存在 struct bstrList 中,其中包含字段 qty 和数组 entry(它是一个数组) bstring

bstrlib 还有许多其他函数可以在 bstring 上操作,

非常简单...

#include "bstrlib.h"
#include <stdio.h>
int main() {
  int i;
  char *tmp = "Hello,World,sak";
  bstring bstr = bfromcstr(tmp);
  struct bstrList *blist = bsplit(bstr, ',');
  printf("num %d\n", blist->qty);
  for(i=0;i<blist->qty;i++) {
    printf("%d: %s\n", i, bstr2cstr(blist->entry[i], '_'));
  }

}

If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.

For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.

bstrlib has many other functions to operate on bstrings

Easy as pie...

#include "bstrlib.h"
#include <stdio.h>
int main() {
  int i;
  char *tmp = "Hello,World,sak";
  bstring bstr = bfromcstr(tmp);
  struct bstrList *blist = bsplit(bstr, ',');
  printf("num %d\n", blist->qty);
  for(i=0;i<blist->qty;i++) {
    printf("%d: %s\n", i, bstr2cstr(blist->entry[i], '_'));
  }

}
无悔心 2025-01-11 12:37:06

我知道,聚会迟到了,但这里还有 2 个函数可供使用,可能会进一步调整以满足您的需求(源代码位于帖子底部

另请参阅实现说明,进一步确定哪个功能更适合您的需求。

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>  // C99

// tokenize destructively
char **str_toksarray_alloc(
    char **strp,       /* InOut: pointer to the source non-constant c-string */
    const char *delim, /* c-string containing the delimiting chars */
    size_t *ntoks,     /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
    bool keepnulls     /* false ignores empty tokens, true includes them */
    );

// tokenize non-destructively
char **str_toksarray_alloc2(
   const char *str,    /* the source c-string */
   const char *delim,
   size_t *ntoks,
   bool keepnulls
   );

使用说明

它们的原型几乎相同,除了源字符串(分别为 strpstr)。

strp(指向字符串的指针)是已分配的非常量 C 字符串的地址,将就地标记化。 str 是一个未更改的 C 字符串(它甚至可以是字符串文字)。我所说的c-string是指以nul结尾的字符缓冲区。这两个函数的其余参数相同。

要解析所有可用标记,请静音 ntoks(意味着在将其传递给任何函数之前将其设置为 0 或将其作为 NULL 指针传递) )。否则,函数最多解析 *ntoks 个标记,或者直到没有更多标记(以先到者为准)。无论如何,当 ntoksnon-NULL 时,它会使用成功解析的令牌的计数进行更新。

另请注意,非静音 ntoks 确定将分配多少个指针。因此,如果源字符串包含 10 个标记,并且我们将 ntoks 设置为 1000,则最终会得到 990 个不必要的分配指针。另一方面,如果源字符串包含 1000 个标记,但我们只需要前 10 个,则将 ntoks 设置为 10 听起来是一个更明智的选择。

这两个函数都分配并返回一个字符指针数组,但 str_toksarray_alloc() 使它们指向修改后的源字符串本身中的标记,而 str_toksarray_alloc2( ) 使它们指向动态分配的令牌副本(其名称末尾的 2 表示 2 级分配)。

返回的数组附加了一个 NULL 哨兵指针,在 ntoks 的传回值中不考虑该指针(否则,当 非 NULL 时) ntoks 将返回数组的长度(而不是其第一级大小)传回给调用者。

keepnulls 设置为 true 时,生成的令牌与我们期望的strsep() 函数。主要意味着源字符串中的连续分隔符会产生空标记 (null),如果 delim 是空 c 字符串或者在源字符串中未找到其包含的分隔符字符,则结果只是 1 个标记:源字符串。与 strsep() 相反,空标记可以被忽略将 keepnulls 设置为 false

函数的失败调用可以通过检查其返回值与NULL来识别,或者通过检查ntoks的传回值与0(假设 ntoks非 NULL)。我建议在尝试访问返回的数组之前始终检查失败,因为这些函数包括健全性检查,可以推迟否则立即崩溃(例如,传递 NULL 指针作为源字符串)。

成功时,调用者应在使用完数组后释放该数组。
对于str_toksarray_alloc(),一个简单的free() 就足够了。对于str_toksarray_alloc2(),由于第二级分配,涉及到一个循环。 NULL 哨兵(或 非 NULL ntoks 的传回值)使这变得微不足道,但我还提供了 <下面是 code>toksarray_free2() 函数,供所有懒惰的蜜蜂使用:)

下面是使用这两个函数的简化示例。

准备:

const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,";
bool keepnulls = true;
size_t ntoks = 0;

str_toksarray_alloc():

// destructive (use copy of src)

char *scopy = strdup( src );
if (!scopy) { ... };          // handle strdup failure

printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%lu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
free( scopy );
free( arrtoks );

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
 */

str_toksarray_alloc2():

// non-destructive

keepnulls = false;    // reject empty tokens

printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%lu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks );                     // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks );    // non-dangling artoks

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/

实现说明

这两个函数都使用 strsep() 用于标记化,这使得它们线程安全,但事实并非如此一个标准函数。如果未提供,您始终可以使用开源实现(例如 GNU 的Apple 的 例如)。使用的函数 strdup() 也是如此在 str_toksarray_alloc2() 中(它的实现很简单,但这里还是GNU 的Apple 的)。

strsep() 中使用的副作用code>str_toksarray_alloc() 是在解析循环的每一步中源字符串的起始指针不断移动到下一个标记。这意味着调用者将无法释放已解析的字符串,除非他们已将起始地址保存到额外的指针。 我们通过使用 strpSaved 指针在函数本地执行此操作,从而避免了麻烦str_toksarray_alloc2() 不受此影响,因为它不触及源字符串。

这两个函数之间的主要区别在于 str_toksarray_alloc() 不会为找到的令牌分配内存。它只是为数组指针分配空间并将它们设置为直接指向源字符串。这是有效的,因为 strsep() nul< /code> - 就地终止找到的令牌。这种依赖关系可能会使您的支持代码变得复杂,但对于大字符串,它也会对性能产生很大的影响。如果保留源字符串并不重要,它也会对内存占用产生很大的影响。

另一方面,str_toksarray_alloc2() 分配并返回一个动态分配的令牌副本的自维持数组,无需进一步的依赖。它首先通过从源字符串的本地副本创建数组,然后将实际标记内容复制到数组中来实现此目的。与 str_toksarray_alloc() 相比,这要慢得多,并且会留下更大的内存占用,但它没有进一步的依赖项,并且对源字符串的性质没有任何特殊要求。这使得编写更简单(因此更易于维护)的支持代码变得更容易。

这两个函数之间的另一个区别是当 ntoks 被静音时的第一级分配(数组指针)。它们都解析所有可用的标记,但采用的方法截然不同。 str_toksarray_alloc() 使用初始大小为 16(字符指针)的 alloc-ahead,在解析循环中根据需要将其加倍。 str_toksarray_alloc2() 进行第一遍计数所有可用标记,然后它只分配一次这么多的字符指针。第一遍是通过辅助函数 str_toksfound() 完成的,该函数使用标准函数 strpbrk()strchr()。我也在下面提供了该函数的源代码。

哪种方法更好实际上取决于您的决定,具体取决于您的项目的需求。请随意将每个函数的代码调整为任一方法并从那里开始。

我想说,平均而言,对于真正大的字符串,提前分配要快得多,特别是当初始大小和增长因子根据具体情况进行微调时(例如,使它们成为函数参数)。使用所有这些 strchr()strpbrk() 保存额外的传递可以产生影响。然而,对于相对较小的字符串(这几乎是常态),仅提前分配一堆字符指针就有点矫枉过正了。这并没有什么坏处,但在这种情况下,它确实会无缘无故地使代码变得混乱。无论如何,请随意选择最适合您的。

这两个函数也是如此。我想说,在大多数情况下,str_toksarray_alloc2() 处理起来要简单得多,因为对于中小型字符串来说,内存和性能很少是问题。如果您必须处理巨大的字符串,请考虑使用 str_toksarray_alloc() (尽管在这些情况下您应该使用专门的字符串解析函数,接近您的项目的需求和输入的规格) 。

天哪,我想这不仅仅是 2 美分(笑)。

不管怎样,这是两个函数和辅助函数的代码(我已经删除了它们的大部分描述注释,因为我已经涵盖了几乎所有内容)。

源代码

str_toksarray_alloc():

// ----------------------------------------
// Tokenize destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of char-pointers
// each pointing to each token found in the source-string, or NULL on error.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    // sanity checks
    if ( !strp || !*strp || !**strp || !delim ) {
        goto failed;
    }

    char *strpSaved = *strp;                    // save initial *strp pointer
    bool ntoksOk = (ntoks && *ntoks);           // false when ntoks is muted
    size_t _ntoks = (ntoksOk ? *ntoks : 16);    // # of tokens to alloc-ahead

    // alloc array of char-pointers (+1 for NULL sentinel)
    char **toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto failed;
    }

    // Parse *strp tokens into the array
    size_t i = 0;           // # of actually parsed tokens
    char *tok;
    while ( (tok = strsep(strp, delim)) ) {
        // if requested, ignore empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // non-muted ntoks reached? we are done
        if ( ntoksOk && i == _ntoks ) {
            *ntoks = i;
            break;
        }
        // muted ntoks & ran out of space? double toksarr and keep parsing
        if ( !ntoksOk && i == _ntoks ) {
            _ntoks *= 2;
            char **tmparr = realloc( toksarr, (_ntoks+1) * sizeof(*tmparr) );
            if ( !tmparr ) {
                *strp = strpSaved;
                free( toksarr );
                goto failed;
            }
            toksarr = tmparr;
        }
        toksarr[i++] = tok; // get token address
    }
    toksarr[i] = NULL;      // NULL sentinel

    *strp = strpSaved;      // restore initial *strp pointer
    if (ntoks) *ntoks = i;  // pass to caller # of parsed tokens
    return toksarr;

failed:
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_toksarray_alloc2():

// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with
    char *_str = strdup( str ); 
    if ( !_str ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    size_t _ntoks = (ntoks && *ntoks) ? *ntoks : str_tokscount(_str, delim, keepnulls);
    if ( _ntoks == 0 ) {        // str_tokscount() failed
        goto fail_free_str;
    }
    
    // alloc the array of strings (+1 for an extra NULL sentinel)
    char **toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail_free_str;
    }

    // Parse str tokens and duplicate them into the array
    size_t i = 0;           // # of actually parsed tokens
    char *tok;
    while ( i < _ntoks && (tok = strsep(&_str, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail_free_arr;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;      // NULL sentinel

    free( _str );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

// cleanup before failing
fail_free_arr:
    for (size_t idx=0; idx < i; idx++) {
        free( toksarr[idx] );
    }
    free( toksarr );

fail_free_str:
    free( _str );
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_tokscount() - 辅助函数,由 str_toksarr_alloc2() 使用:

// ----------------------------------------
// Return the count of tokens present in a nul-terminated source-string (str),
// based on the delimiting chars contained in a 2nd nul-terminated string (delim).
// If the boolean argument is false, empty tokens are excluded.
//
// To stay consistent with the behavior of strsep(), the function returns 1 if
// delim is an empty string or none of its delimiters is found in str (in those
// cases the source-string is considered a single token).
// 0 is returned when str or delim are passed as NULL pointers, or when str is
// passed as an empty string.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    // sanity checks
    if ( !str || !*str || !delim ) {
        return 0;
    }

    const char *tok = str;
    size_t nnulls = strchr(delim, *str) ? 1 : 0;
    size_t ntoks = 1;   // even when no delims in str, str counts as 1 token 
    for (; (str = strpbrk(tok, delim)); ntoks++ ) {
        tok = ++str;
        if ( strchr(delim, *str) ) {
            nnulls++;
        }
    }

    return keepnulls ? ntoks : (ntoks - nnulls);
}

toksarray_free2() - 在str_toksarr_alloc2()返回的数组上使用它:

// ----------------------------------------
// Free a dynamically allocated, NULL terminated, array of char-pointers
// with each such pointer pointing to its own dynamically allocated data.
// Return NULL, so the caller has the choice of assigning it back to the
// dangling pointer. The 2 at the end of the name means 2-levels of deallocation.
//
// NULL terminated array means ending with a NULL sentinel.
//      e.g.: toksarr[0] = tok1, ..., toksarr[len] = NULL
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr ) {
        char **toks = toksarr;
        while ( *toks ) {   // walk until NULL sentinel
            free( *toks++ );
        }
        free( toksarr );
    }

    return NULL;
}

Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)

See also the Implementation Notes, further below, to decide which function suits your needs better.

#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h>  // C99

// tokenize destructively
char **str_toksarray_alloc(
    char **strp,       /* InOut: pointer to the source non-constant c-string */
    const char *delim, /* c-string containing the delimiting chars */
    size_t *ntoks,     /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
    bool keepnulls     /* false ignores empty tokens, true includes them */
    );

// tokenize non-destructively
char **str_toksarray_alloc2(
   const char *str,    /* the source c-string */
   const char *delim,
   size_t *ntoks,
   bool keepnulls
   );

Usage Notes

Their prototypes are almost identical, except for the source-string (strp and str, respectively).

strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.

To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.

Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.

Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).

The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).

When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.

Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).

On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)

Simplified examples using both functions follow.

Prep:

const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,";
bool keepnulls = true;
size_t ntoks = 0;

str_toksarray_alloc():

// destructive (use copy of src)

char *scopy = strdup( src );
if (!scopy) { ... };          // handle strdup failure

printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%lu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
free( scopy );
free( arrtoks );

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
 */

str_toksarray_alloc2():

// non-destructive

keepnulls = false;    // reject empty tokens

printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%lu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks );                     // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks );    // non-dangling artoks

/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/

Implementation Notes

Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).

A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.

A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.

On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.

Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.

Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.

I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.

Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).

Oh boy, I think that was a bit more than just 2 cents (lol).

Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).

Source Code

str_toksarray_alloc():

// ----------------------------------------
// Tokenize destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of char-pointers
// each pointing to each token found in the source-string, or NULL on error.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    // sanity checks
    if ( !strp || !*strp || !**strp || !delim ) {
        goto failed;
    }

    char *strpSaved = *strp;                    // save initial *strp pointer
    bool ntoksOk = (ntoks && *ntoks);           // false when ntoks is muted
    size_t _ntoks = (ntoksOk ? *ntoks : 16);    // # of tokens to alloc-ahead

    // alloc array of char-pointers (+1 for NULL sentinel)
    char **toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto failed;
    }

    // Parse *strp tokens into the array
    size_t i = 0;           // # of actually parsed tokens
    char *tok;
    while ( (tok = strsep(strp, delim)) ) {
        // if requested, ignore empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // non-muted ntoks reached? we are done
        if ( ntoksOk && i == _ntoks ) {
            *ntoks = i;
            break;
        }
        // muted ntoks & ran out of space? double toksarr and keep parsing
        if ( !ntoksOk && i == _ntoks ) {
            _ntoks *= 2;
            char **tmparr = realloc( toksarr, (_ntoks+1) * sizeof(*tmparr) );
            if ( !tmparr ) {
                *strp = strpSaved;
                free( toksarr );
                goto failed;
            }
            toksarr = tmparr;
        }
        toksarr[i++] = tok; // get token address
    }
    toksarr[i] = NULL;      // NULL sentinel

    *strp = strpSaved;      // restore initial *strp pointer
    if (ntoks) *ntoks = i;  // pass to caller # of parsed tokens
    return toksarr;

failed:
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_toksarray_alloc2():

// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with
    char *_str = strdup( str ); 
    if ( !_str ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    size_t _ntoks = (ntoks && *ntoks) ? *ntoks : str_tokscount(_str, delim, keepnulls);
    if ( _ntoks == 0 ) {        // str_tokscount() failed
        goto fail_free_str;
    }
    
    // alloc the array of strings (+1 for an extra NULL sentinel)
    char **toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail_free_str;
    }

    // Parse str tokens and duplicate them into the array
    size_t i = 0;           // # of actually parsed tokens
    char *tok;
    while ( i < _ntoks && (tok = strsep(&_str, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail_free_arr;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;      // NULL sentinel

    free( _str );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

// cleanup before failing
fail_free_arr:
    for (size_t idx=0; idx < i; idx++) {
        free( toksarr[idx] );
    }
    free( toksarr );

fail_free_str:
    free( _str );
    if (ntoks) *ntoks = 0;
    return NULL;
}

str_tokscount() - helper function, used by str_toksarr_alloc2():

// ----------------------------------------
// Return the count of tokens present in a nul-terminated source-string (str),
// based on the delimiting chars contained in a 2nd nul-terminated string (delim).
// If the boolean argument is false, empty tokens are excluded.
//
// To stay consistent with the behavior of strsep(), the function returns 1 if
// delim is an empty string or none of its delimiters is found in str (in those
// cases the source-string is considered a single token).
// 0 is returned when str or delim are passed as NULL pointers, or when str is
// passed as an empty string.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    // sanity checks
    if ( !str || !*str || !delim ) {
        return 0;
    }

    const char *tok = str;
    size_t nnulls = strchr(delim, *str) ? 1 : 0;
    size_t ntoks = 1;   // even when no delims in str, str counts as 1 token 
    for (; (str = strpbrk(tok, delim)); ntoks++ ) {
        tok = ++str;
        if ( strchr(delim, *str) ) {
            nnulls++;
        }
    }

    return keepnulls ? ntoks : (ntoks - nnulls);
}

toksarray_free2() - use it on the array returned by str_toksarr_alloc2():

// ----------------------------------------
// Free a dynamically allocated, NULL terminated, array of char-pointers
// with each such pointer pointing to its own dynamically allocated data.
// Return NULL, so the caller has the choice of assigning it back to the
// dangling pointer. The 2 at the end of the name means 2-levels of deallocation.
//
// NULL terminated array means ending with a NULL sentinel.
//      e.g.: toksarr[0] = tok1, ..., toksarr[len] = NULL
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr ) {
        char **toks = toksarr;
        while ( *toks ) {   // walk until NULL sentinel
            free( *toks++ );
        }
        free( toksarr );
    }

    return NULL;
}
撩心不撩汉 2025-01-11 12:37:06

strtok()strsep() 都会修改输入字符串。我们可以编写一个函数,使用 strspn()strpbrk()

算法:

  1. 如果输入字符串不为空,则转至步骤 2,否则返回 null
  2. 跳过分隔符(如果字符串开头有分隔符),并记录单词的起始位置(为此使用 strspn()),将其称为 start
  3. 从上一步中找到的当前开头查找下一个分隔符位置(如果不再存在分隔符,则为字符串末尾)(为此使用 strpbrk()),将其称为 end
  4. 分配内存并将字符串从该内存中的 start 复制到 end
  5. 返回令牌。

优点:

  1. 线程安全。
  2. 处理多个分隔符。
  3. 便携的。
  4. 不会像 strtok()strsep() 那样修改输入字符串。

实施:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * alloc_str function allocates memory and copy substring
 * to allocated memory.
 */

static char * alloc_str (const char * start, const char * end) {
    if (!start || !end || (start >= end)) {
        return NULL;
    }

    char * tmp = malloc (end - start + 1);
    if (tmp) {
        memcpy (tmp, start, end - start);
        tmp[end - start] = '\0';
    } else {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    return tmp;
}

/*
 * str_split function returns the next token which is sequences of contiguous
 * characters separated by any of the characters that are part of delimiters.
 *
 * Parameters: 
 * p_str : Address of pointer to the string that you want to split.
 * sep : A set of characters that delimit the pieces in the string.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string. 
 *
 * Return :
 * Returns the pointer to dynamically allocated memory where the token is copied.
 * If p_str is NULL or empty string, NULL is returned.
 */

char * str_split (char ** p_str, const char * sep) {
    char * token = NULL;

    if (*p_str && **p_str) {
        char * p_end;

        // skip separator
        *p_str += strspn(*p_str, sep);

        p_end = *p_str;

        // find separator
        p_end = strpbrk (p_end, sep);

        // strpbrk() returns null pointer if no such character
        // exists in the input string which is part of sep argument.
        if (!p_end) {
            p_end = *p_str + strlen (*p_str);
        }

        token = alloc_str (*p_str, p_end);
        *p_str = p_end;
    }

    return token;
}

/*==================================================*/
/*==================================================*/

/*
 * Just a helper function
 */

void token_helper (char * in_str, const char * delim) {
    printf ("\nInput string : ");

    if (in_str) printf ("\"%s\"\n", in_str);
    else printf ("NULL\n");

    if (delim) printf ("Delimiter : \"%s\"\n", delim);

    char * ptr = in_str;
    char * token = NULL;

    printf ("Tokens:\n");
    while ((token = str_split(&ptr, delim)) != NULL) {
        printf ("-> %s\n", token);
        /* You can assign this token to a pointer of an array of pointers
         * and return that array of pointers from this function.
         * Since, this is for demonstration purpose, I am 
         * freeing the allocated memory now.
         */
        free (token);
    }
}

/*
 * Driver function
 */

int main (void) {
    /* test cases */

    char string[100] = "hello world!";
    const char * delim = " ";
    token_helper (string, delim);

    strcpy (string, " hello world,friend of mine!");
    delim = " ,";
    token_helper (string, delim);

    strcpy (string, "Another string");
    delim = "-!";
    token_helper (string, delim);

    strcpy (string, "   one  more   -- string  !");
    delim = "- !";
    token_helper (string, delim); 

    strcpy (string, "");
    delim = " ";
    token_helper (string, delim);

    token_helper (NULL, "");

    strcpy (string, "hi");
    delim = " -$";
    token_helper (string, delim);

    strcpy (string, "Give papa a cup of proper coffee in a copper coffee cup.");
    delim = "cp";
    token_helper (string, delim);

    strcpy (string, "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC");
    delim = ",";
    token_helper (string, delim);

    return 0;
}

输出:

# ./a.out

Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!

Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!

Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string

Input string : "   one  more   -- string  !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string

Input string : ""
Delimiter : " "
Tokens:

Input string : NULL
Delimiter : ""
Tokens:

Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi

Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give 
-> a
-> a a 
-> u
->  of 
-> ro
-> er 
-> offee in a 
-> o
-> er 
-> offee 
-> u
-> .

Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC

Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().

Algorithm:

  1. If the input string is not empty, go to step 2 else return null.
  2. Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
  3. Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
  4. Allocate memory and copy string from start to end in that memory.
  5. Return token.

Advantage:

  1. Thread safe.
  2. Handles multiple delimiters.
  3. Portable.
  4. Doesn't modify the input string, like strtok() and strsep() does.

Implementation:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * alloc_str function allocates memory and copy substring
 * to allocated memory.
 */

static char * alloc_str (const char * start, const char * end) {
    if (!start || !end || (start >= end)) {
        return NULL;
    }

    char * tmp = malloc (end - start + 1);
    if (tmp) {
        memcpy (tmp, start, end - start);
        tmp[end - start] = '\0';
    } else {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    return tmp;
}

/*
 * str_split function returns the next token which is sequences of contiguous
 * characters separated by any of the characters that are part of delimiters.
 *
 * Parameters: 
 * p_str : Address of pointer to the string that you want to split.
 * sep : A set of characters that delimit the pieces in the string.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string. 
 *
 * Return :
 * Returns the pointer to dynamically allocated memory where the token is copied.
 * If p_str is NULL or empty string, NULL is returned.
 */

char * str_split (char ** p_str, const char * sep) {
    char * token = NULL;

    if (*p_str && **p_str) {
        char * p_end;

        // skip separator
        *p_str += strspn(*p_str, sep);

        p_end = *p_str;

        // find separator
        p_end = strpbrk (p_end, sep);

        // strpbrk() returns null pointer if no such character
        // exists in the input string which is part of sep argument.
        if (!p_end) {
            p_end = *p_str + strlen (*p_str);
        }

        token = alloc_str (*p_str, p_end);
        *p_str = p_end;
    }

    return token;
}

/*==================================================*/
/*==================================================*/

/*
 * Just a helper function
 */

void token_helper (char * in_str, const char * delim) {
    printf ("\nInput string : ");

    if (in_str) printf ("\"%s\"\n", in_str);
    else printf ("NULL\n");

    if (delim) printf ("Delimiter : \"%s\"\n", delim);

    char * ptr = in_str;
    char * token = NULL;

    printf ("Tokens:\n");
    while ((token = str_split(&ptr, delim)) != NULL) {
        printf ("-> %s\n", token);
        /* You can assign this token to a pointer of an array of pointers
         * and return that array of pointers from this function.
         * Since, this is for demonstration purpose, I am 
         * freeing the allocated memory now.
         */
        free (token);
    }
}

/*
 * Driver function
 */

int main (void) {
    /* test cases */

    char string[100] = "hello world!";
    const char * delim = " ";
    token_helper (string, delim);

    strcpy (string, " hello world,friend of mine!");
    delim = " ,";
    token_helper (string, delim);

    strcpy (string, "Another string");
    delim = "-!";
    token_helper (string, delim);

    strcpy (string, "   one  more   -- string  !");
    delim = "- !";
    token_helper (string, delim); 

    strcpy (string, "");
    delim = " ";
    token_helper (string, delim);

    token_helper (NULL, "");

    strcpy (string, "hi");
    delim = " -
quot;;
    token_helper (string, delim);

    strcpy (string, "Give papa a cup of proper coffee in a copper coffee cup.");
    delim = "cp";
    token_helper (string, delim);

    strcpy (string, "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC");
    delim = ",";
    token_helper (string, delim);

    return 0;
}

Output:

# ./a.out

Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!

Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!

Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string

Input string : "   one  more   -- string  !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string

Input string : ""
Delimiter : " "
Tokens:

Input string : NULL
Delimiter : ""
Tokens:

Input string : "hi"
Delimiter : " -
quot;
Tokens:
-> hi

Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give 
-> a
-> a a 
-> u
->  of 
-> ro
-> er 
-> offee in a 
-> o
-> er 
-> offee 
-> u
-> .

Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
离不开的别离 2025-01-11 12:37:06

尝试使用这个。

char** strsplit(char* str, const char* delim){
    char** res = NULL;
    char*  part;
    int i = 0;

    char* aux = strdup(str);

    part = strdup(strtok(aux, delim));

    while(part){
        res = (char**)realloc(res, (i + 1) * sizeof(char*));
        *(res + i) = strdup(part);

        part = strdup(strtok(NULL, delim));
        i++;
    }

    res = (char**)realloc(res, i * sizeof(char*));
    *(res + i) = NULL;

    return res;
}

Try use this.

char** strsplit(char* str, const char* delim){
    char** res = NULL;
    char*  part;
    int i = 0;

    char* aux = strdup(str);

    part = strdup(strtok(aux, delim));

    while(part){
        res = (char**)realloc(res, (i + 1) * sizeof(char*));
        *(res + i) = strdup(part);

        part = strdup(strtok(NULL, delim));
        i++;
    }

    res = (char**)realloc(res, i * sizeof(char*));
    *(res + i) = NULL;

    return res;
}
不奢求什么 2025-01-11 12:37:06

我的方法是扫描字符串,让指针指向分隔符(和第一个字符)之后的每个字符,同时将字符串中分隔符的出现分配给'\0'。
首先复制原始字符串(因为它是常量),然后通过扫描获取分割数,并将其传递给指针参数len。之后,将第一个结果指针指向复制字符串指针,然后扫描复制字符串:一旦遇到分隔符,则将其赋值为'\0',从而将前一个结果字符串终止,并将下一个结果字符串指针指向下一个结果字符串。字符指针。

char** split(char* a_str, const char a_delim, int* len){
    char* s = (char*)malloc(sizeof(char) * strlen(a_str));
    strcpy(s, a_str);
    char* tmp = a_str;
    int count = 0;
    while (*tmp != '\0'){
        if (*tmp == a_delim) count += 1;
        tmp += 1;
    }
    *len = count;
    char** results = (char**)malloc(count * sizeof(char*));
    results[0] = s;
    int i = 1;
    while (*s!='\0'){
        if (*s == a_delim){
            *s = '\0';
            s += 1;
            results[i++] = s;
        }
        else s += 1;
    }
    return results;
}

My approach is to scan the string and let the pointers point to every character after the deliminators(and the first character), at the same time assign the appearances of deliminator in string to '\0'.
First make a copy of original string(since it's constant), then get the number of splits by scan it pass it to pointer parameter len. After that, point the first result pointer to the copy string pointer, then scan the copy string: once encounter a deliminator, assign it to '\0' thus the previous result string is terminated, and point the next result string pointer to the next character pointer.

char** split(char* a_str, const char a_delim, int* len){
    char* s = (char*)malloc(sizeof(char) * strlen(a_str));
    strcpy(s, a_str);
    char* tmp = a_str;
    int count = 0;
    while (*tmp != '\0'){
        if (*tmp == a_delim) count += 1;
        tmp += 1;
    }
    *len = count;
    char** results = (char**)malloc(count * sizeof(char*));
    results[0] = s;
    int i = 1;
    while (*s!='\0'){
        if (*s == a_delim){
            *s = '\0';
            s += 1;
            results[i++] = s;
        }
        else s += 1;
    }
    return results;
}
温暖的光 2025-01-11 12:37:06

我的代码(已测试):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
  int i=0;
  char *token;
  char **res = (char **) malloc(0 * sizeof(char *));

  /* get the first token */
   token = strtok(str, delim);
   while( token != NULL ) 
   {
        res = (char **) realloc(res, (i + 1) * sizeof(char *));
        res[i] = token;
        i++;
      token = strtok(NULL, delim);
   }
   *array = res;
   *length = i;
  return 1;
}

int main()
{
    int i;
    int c = 0;
    char **arr = NULL;

    int count =0;

    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    c = dtmsplit(str, ",", &arr, &count);
    printf("Found %d tokens.\n", count);

    for (i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

   return(0);
}

结果:

Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC

My code (tested):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
  int i=0;
  char *token;
  char **res = (char **) malloc(0 * sizeof(char *));

  /* get the first token */
   token = strtok(str, delim);
   while( token != NULL ) 
   {
        res = (char **) realloc(res, (i + 1) * sizeof(char *));
        res[i] = token;
        i++;
      token = strtok(NULL, delim);
   }
   *array = res;
   *length = i;
  return 1;
}

int main()
{
    int i;
    int c = 0;
    char **arr = NULL;

    int count =0;

    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    c = dtmsplit(str, ",", &arr, &count);
    printf("Found %d tokens.\n", count);

    for (i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

   return(0);
}

Result:

Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
趁年轻赶紧闹 2025-01-11 12:37:06

围绕这个问题的两个问题是内存管理和线程安全。从众多帖子中可以看出,
这不是一个用 C 语言无缝完成的简单任务。我想要一个解决方案:

  • 线程安全。 (strtok 不是线程安全的)
  • 不使用 malloc 或其任何衍生产品(以避免内存管理问题)
  • 检查各个字段的数组边界(以避免未知数据上的段错误)
  • 使用多字节字段分隔符 (utf-8) )
  • 忽略输入中的额外字段
  • 为无效字段长度提供软错误例程

我提出的解决方案满足所有这些标准。设置可能需要更多工作
比这里发布的其他一些解决方案要好,但我认为在实践中,额外的工作是值得的
以避免其他解决方案的常见陷阱。

#include <stdio.h>
#include <string.h>

struct splitFieldType {
    char *field;
    int   maxLength;
};

typedef struct splitFieldType splitField;

int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual))  {
    int i;
    int fieldSeparatorLen=strlen(fieldSeparator);
    const char *tNext, *tLast=input;

    for (i=0; i<expected && (tNext=strstr(tLast, fieldSeparator))!=NULL; ++i) {
        int len=tNext-tLast;
        if (len>=fields[i].maxLength) {
            softError(i,fields[i].maxLength-1,len);
            len=fields[i].maxLength-1;
        }
        fields[i].field[len]=0;
        strncpy(fields[i].field,tLast,len);
        tLast=tNext+fieldSeparatorLen;
    }
    if (i<expected) {
        if (strlen(tLast)>fields[i].maxLength) {
            softError(i,fields[i].maxLength,strlen(tLast));
        } else {
            strcpy(fields[i].field,tLast);
        }
        return i+1;
    } else {
        return i;
    }
}


void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    fprintf(stderr,"monthSplit: input field #%d is %d bytes, expected %d bytes\n",fieldNumber+1,actual,expected);
}


int main() {
  const char *fieldSeparator=",";
  const char *input="JAN,FEB,MAR,APRI,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";

  struct monthFieldsType {
    char field1[4];
    char field2[4];
    char field3[4];
    char field4[4];
    char field5[4];
    char field6[4];
    char field7[4];
    char field8[4];
    char field9[4];
    char field10[4];
    char field11[4];
    char field12[4];
  } monthFields;

  splitField inputFields[12] = {
    {monthFields.field1,  sizeof(monthFields.field1)},
    {monthFields.field2,  sizeof(monthFields.field2)},
    {monthFields.field3,  sizeof(monthFields.field3)},
    {monthFields.field4,  sizeof(monthFields.field4)},
    {monthFields.field5,  sizeof(monthFields.field5)},
    {monthFields.field6,  sizeof(monthFields.field6)},
    {monthFields.field7,  sizeof(monthFields.field7)},
    {monthFields.field8,  sizeof(monthFields.field8)},
    {monthFields.field9,  sizeof(monthFields.field9)},
    {monthFields.field10, sizeof(monthFields.field10)},
    {monthFields.field11, sizeof(monthFields.field11)},
    {monthFields.field12, sizeof(monthFields.field12)}
  };

  int expected=sizeof(inputFields)/sizeof(splitField);

  printf("input data: %s\n", input);
  printf("expecting %d fields\n",expected);

  int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);

  if (ct!=expected) {
    printf("string split %d fields, expected %d\n", ct,expected);
  }

  for (int i=0;i<expected;++i) {
    printf("field %d: %s\n",i+1,inputFields[i].field);
  }

  printf("\n");
  printf("Direct structure access, field 10: %s", monthFields.field10);
}

下面是一个编译和输出的示例。请注意,在我的示例中,我特意拼出了“APRIL”,以便您可以看到软错误是如何工作的。

$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC

Direct structure access, field 10: OCT

享受!

Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:

  • Thread safe. (strtok is not thread safe)
  • Does not employ malloc or any of it's derivatives (to avoid memory management issues)
  • Checks array bounds on the individual fields (to avoid segment faults on unknown data)
  • Works with multi-byte field separators (utf-8)
  • ignores extra fields in the input
  • provides soft error routine for invalid field lengths

The solution I came up meets all of these criteria. It's probably a little more work to setup
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.

#include <stdio.h>
#include <string.h>

struct splitFieldType {
    char *field;
    int   maxLength;
};

typedef struct splitFieldType splitField;

int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual))  {
    int i;
    int fieldSeparatorLen=strlen(fieldSeparator);
    const char *tNext, *tLast=input;

    for (i=0; i<expected && (tNext=strstr(tLast, fieldSeparator))!=NULL; ++i) {
        int len=tNext-tLast;
        if (len>=fields[i].maxLength) {
            softError(i,fields[i].maxLength-1,len);
            len=fields[i].maxLength-1;
        }
        fields[i].field[len]=0;
        strncpy(fields[i].field,tLast,len);
        tLast=tNext+fieldSeparatorLen;
    }
    if (i<expected) {
        if (strlen(tLast)>fields[i].maxLength) {
            softError(i,fields[i].maxLength,strlen(tLast));
        } else {
            strcpy(fields[i].field,tLast);
        }
        return i+1;
    } else {
        return i;
    }
}


void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    fprintf(stderr,"monthSplit: input field #%d is %d bytes, expected %d bytes\n",fieldNumber+1,actual,expected);
}


int main() {
  const char *fieldSeparator=",";
  const char *input="JAN,FEB,MAR,APRI,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";

  struct monthFieldsType {
    char field1[4];
    char field2[4];
    char field3[4];
    char field4[4];
    char field5[4];
    char field6[4];
    char field7[4];
    char field8[4];
    char field9[4];
    char field10[4];
    char field11[4];
    char field12[4];
  } monthFields;

  splitField inputFields[12] = {
    {monthFields.field1,  sizeof(monthFields.field1)},
    {monthFields.field2,  sizeof(monthFields.field2)},
    {monthFields.field3,  sizeof(monthFields.field3)},
    {monthFields.field4,  sizeof(monthFields.field4)},
    {monthFields.field5,  sizeof(monthFields.field5)},
    {monthFields.field6,  sizeof(monthFields.field6)},
    {monthFields.field7,  sizeof(monthFields.field7)},
    {monthFields.field8,  sizeof(monthFields.field8)},
    {monthFields.field9,  sizeof(monthFields.field9)},
    {monthFields.field10, sizeof(monthFields.field10)},
    {monthFields.field11, sizeof(monthFields.field11)},
    {monthFields.field12, sizeof(monthFields.field12)}
  };

  int expected=sizeof(inputFields)/sizeof(splitField);

  printf("input data: %s\n", input);
  printf("expecting %d fields\n",expected);

  int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);

  if (ct!=expected) {
    printf("string split %d fields, expected %d\n", ct,expected);
  }

  for (int i=0;i<expected;++i) {
    printf("field %d: %s\n",i+1,inputFields[i].field);
  }

  printf("\n");
  printf("Direct structure access, field 10: %s", monthFields.field10);
}

Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.

$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC

Direct structure access, field 10: OCT

Enjoy!

心安伴我暖 2025-01-11 12:37:06

这是另一个实现,它将安全地操作来标记与问题中请求的原型匹配的字符串文字,返回分配的指向 char 的指针(例如 char ** )。分隔符字符串可以包含多个字符,输入字符串可以包含任意数量的标记。所有分配和重新分配均由 mallocrealloc 处理,无需 POSIX strdup

分配的指针的初始数量由 NPTRS 常量控制,唯一的限制是它大于零。返回的 char ** 在最后一个标记后包含一个 sentinel NULL ,类似于 *argv[] 并且在可供 execvexecvpexecve 使用的形式。

strtok() 一样,多个连续分隔符被视为单个分隔符,因此 "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV ,DEC" 将被解析为只有一个 ',' 分隔 "MAY,JUN"

下面的函数进行了内联注释,并添加了一个简短的 main() 来分割月份。分配的初始指针数量设置为 2,以在对输入字符串进行标记期间强制进行 3 次重新分配:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NPTRS 2     /* initial number of pointers to allocate (must be > 0) */

/* split src into tokens with sentinel NULL after last token.
 * return allocated pointer-to-pointer with sentinel NULL on success,
 * or NULL on failure to allocate initial block of pointers. The number
 * of allocated pointers are doubled each time reallocation required.
 */
char **strsplit (const char *src, const char *delim)
{
    int i = 0, in = 0, nptrs = NPTRS;       /* index, in/out flag, ptr count */
    char **dest = NULL;                     /* ptr-to-ptr to allocate/fill */
    const char *p = src, *ep = p;           /* pointer and end-pointer */

    /* allocate/validate nptrs pointers for dest */
    if (!(dest = malloc (nptrs * sizeof *dest))) {
        perror ("malloc-dest");
        return NULL;
    }
    *dest = NULL;   /* set first pointer as sentinel NULL */

    for (;;) {  /* loop continually until end of src reached */
        if (!*ep || strchr (delim, *ep)) {  /* if at nul-char or delimiter char */
            size_t len = ep - p;            /* get length of token */
            if (in && len) {                /* in-word and chars in token */
                if (i == nptrs - 1) {       /* used pointer == allocated - 1? */
                    /* realloc dest to temporary pointer/validate */
                    void *tmp = realloc (dest, 2 * nptrs * sizeof *dest);
                    if (!tmp) {
                        perror ("realloc-dest");
                        break;  /* don't exit, original dest still valid */
                    }
                    dest = tmp;             /* assign reallocated block to dest */
                    nptrs *= 2;             /* increment allocated pointer count */
                }
                /* allocate/validate storage for token */
                if (!(dest[i] = malloc (len + 1))) {
                    perror ("malloc-dest[i]");
                    break;
                }
                memcpy (dest[i], p, len);   /* copy len chars to storage */
                dest[i++][len] = 0;         /* nul-terminate, advance index */
                dest[i] = NULL;             /* set next pointer NULL */
            }
            if (!*ep)                       /* if at end, break */
                break;
            in = 0;                         /* set in-word flag 0 (false) */
        }
        else {  /* normal word char */
            if (!in)                        /* if not in-word */
                p = ep;                     /* update start to end-pointer */
            in = 1;                         /* set in-word flag 1 (true) */
        }
        ep++;   /* advance to next character */
    }

    return dest;
}

int main (void) {

    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC",
        **tokens;                           /* pointer to pointer to char */

    if ((tokens = strsplit (str, ","))) {   /* split string into tokens */
        for (char **p = tokens; *p; p++) {  /* loop over filled pointers */
            puts (*p);
            free (*p);      /* don't forget to free allocated strings */
        }
        free (tokens);      /* and pointers */
    }
}

示例使用/输出

$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC

如果您还有任何其他问题,请告诉我。

Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.

The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.

As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".

The function below is commented in-line and a short main() was added splitting the months. The initial number of pointers allocated was set at 2 to force three reallocation during tokenizing the input string:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NPTRS 2     /* initial number of pointers to allocate (must be > 0) */

/* split src into tokens with sentinel NULL after last token.
 * return allocated pointer-to-pointer with sentinel NULL on success,
 * or NULL on failure to allocate initial block of pointers. The number
 * of allocated pointers are doubled each time reallocation required.
 */
char **strsplit (const char *src, const char *delim)
{
    int i = 0, in = 0, nptrs = NPTRS;       /* index, in/out flag, ptr count */
    char **dest = NULL;                     /* ptr-to-ptr to allocate/fill */
    const char *p = src, *ep = p;           /* pointer and end-pointer */

    /* allocate/validate nptrs pointers for dest */
    if (!(dest = malloc (nptrs * sizeof *dest))) {
        perror ("malloc-dest");
        return NULL;
    }
    *dest = NULL;   /* set first pointer as sentinel NULL */

    for (;;) {  /* loop continually until end of src reached */
        if (!*ep || strchr (delim, *ep)) {  /* if at nul-char or delimiter char */
            size_t len = ep - p;            /* get length of token */
            if (in && len) {                /* in-word and chars in token */
                if (i == nptrs - 1) {       /* used pointer == allocated - 1? */
                    /* realloc dest to temporary pointer/validate */
                    void *tmp = realloc (dest, 2 * nptrs * sizeof *dest);
                    if (!tmp) {
                        perror ("realloc-dest");
                        break;  /* don't exit, original dest still valid */
                    }
                    dest = tmp;             /* assign reallocated block to dest */
                    nptrs *= 2;             /* increment allocated pointer count */
                }
                /* allocate/validate storage for token */
                if (!(dest[i] = malloc (len + 1))) {
                    perror ("malloc-dest[i]");
                    break;
                }
                memcpy (dest[i], p, len);   /* copy len chars to storage */
                dest[i++][len] = 0;         /* nul-terminate, advance index */
                dest[i] = NULL;             /* set next pointer NULL */
            }
            if (!*ep)                       /* if at end, break */
                break;
            in = 0;                         /* set in-word flag 0 (false) */
        }
        else {  /* normal word char */
            if (!in)                        /* if not in-word */
                p = ep;                     /* update start to end-pointer */
            in = 1;                         /* set in-word flag 1 (true) */
        }
        ep++;   /* advance to next character */
    }

    return dest;
}

int main (void) {

    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC",
        **tokens;                           /* pointer to pointer to char */

    if ((tokens = strsplit (str, ","))) {   /* split string into tokens */
        for (char **p = tokens; *p; p++) {  /* loop over filled pointers */
            puts (*p);
            free (*p);      /* don't forget to free allocated strings */
        }
        free (tokens);      /* and pointers */
    }
}

Example Use/Output

$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC

Let me know if you have any further questions.

猛虎独行 2025-01-11 12:37:06

遇到这个问题正在寻找一个简单的解决方案。
我对所有选项都很着迷,但对我自己的用例/品味不满意(这可能很糟糕)。

我创建了一个有点独特的解决方案,旨在为用户提供清晰的行为,而不是重新分配任何内存,并且具有人类可读性+注释。

上传到gist.github:https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093

示例:

#include "./strutils.c"

struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";

str_split_begin(&info);

char * substr;

for (int i=0; i<info.splitStringsCount; i++) {
  substr = info.splitStrings[i];
  printf("substring: '%s'\n", substr);
}

str_split_end(&info);

输出:

$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'

strutils.c 的完整源代码

#ifndef STRUTILS_C
#define STRUTILS_C 1

#ifndef str
#define str char *
#endif

#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

struct str_split_info {
  /* The string to be split
  * Provided by caller of str_split_begin function
  */
  str source;
  /* The string that cuts the source string, all occurances of
  * this string will be removed from the source string

  * Provided by caller of str_split_begin function
  */
  str delimiter;

  /* Array of strings split by delimiter
  * Provided and allocated by str_split_begin function
  * Must be garbage collected by str_split_end function
  */
  str * splitStrings;

  /* Array of string lengths split by delimiter
    * Provided and allocated by str_split_begin function
    * Must be garbage collected by str_split_end function
    */
  int * splitStringsLengths;

  /* Number of strings split by delimiter contained in splitStrings
  * Provided by str_split_begin function
  */
  int splitStringsCount;
};
#define str_split_infop struct str_split_info *

/* Split a string by a delimiting string
* 
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
  info->splitStringsCount = 0;

  int sourceLength = strlen(info->source);
  int sourceOffset = 0;
  char sourceChar;

  int delimiterLength = strlen(info->delimiter);
  int delimiterOffset = 0;
  char delimiterChar;

  //first pass, simply count occurances so we can allocate only once
  for (sourceOffset = 0; sourceOffset<sourceLength; sourceOffset++) {
    sourceChar = info->source[sourceOffset];
    delimiterChar = info->delimiter[delimiterOffset];

    if (sourceChar == delimiterChar) {
      delimiterOffset++;

      if (delimiterOffset >= delimiterLength) {
        delimiterOffset = 0;
        //increment count
        info->splitStringsCount ++;
      }
    } else {
      delimiterOffset = 0;
    }
  }
  info->splitStringsCount++;

  //allocate arrays since we know the count
  //this one is an array of strings, which are each char arrays
  info->splitStrings = (str *) malloc(sizeof (str *) * info->splitStringsCount);
  //this one is an array of ints
  info->splitStringsLengths = (int*) malloc(sizeof(int) *info->splitStringsCount);

  int stringBegin = 0;
  int stringEnd = 0;
  int splitIndex = 0;
  int splitLength = 0;

  //second pass, fill the arrays
  for (sourceOffset = 0; sourceOffset<sourceLength; sourceOffset++) {
    sourceChar = info->source[sourceOffset];
    delimiterChar = info->delimiter[delimiterOffset];

    if (sourceChar == delimiterChar) {
      delimiterOffset++;

      //if we've reached the end of the delimiter
      if (delimiterOffset >= delimiterLength) {

        //don't worry about delimiter trailing null, strlen doesn't count those
        stringEnd = sourceOffset - delimiterLength;
        
        //char count of substring we want to split
        splitLength = stringEnd - stringBegin + 1;

        //allocate for our substring split
        info->splitStrings[splitIndex] = (str) malloc(
          //+1 for trailing null for c-string
          sizeof(char) * splitLength + 1
        );

        //copy substring from source into splitStrings array
        memcpy(
          info->splitStrings[splitIndex],
          info->source + stringBegin,
          splitLength
        );
        //explicitly set the last char of this split to a NULL just for fun
        info->splitStrings[splitIndex][splitLength] = 0x00;

        //conveniently put the substring split size for the
        //user of str_split_begin :)
        info->splitStringsLengths[splitIndex] = splitLength;

        //move to next split index
        splitIndex ++;

        //reset delimiter offset so we look for new occurances of it
        delimiterOffset = 0;

        //next substring split should occur after the current delimiter
        stringBegin = sourceOffset+1;
      }
    } else {
      //reset delimiter offset so we look for new occurances of it
      delimiterOffset = 0;
    }
  }

  //handle edge case of last substring after last delimiter
  if (stringEnd != stringBegin) {
    stringEnd = sourceLength-1;

    splitLength = stringEnd - stringBegin + 1;

    //allocate for our substring split
    info->splitStrings[splitIndex] = (str) malloc(
      //+1 for trailing null for c-string
      sizeof(char) * splitLength + 1
    );

    //copy substring from source into splitStrings array
    memcpy(
      info->splitStrings[splitIndex],
      info->source + stringBegin,
      splitLength
    );
    
  }
}
int str_split_count (str_split_infop info) {
  return info->splitStringsCount;
}

void str_split_get (str_split_infop info, str * out) {
  for (int i=0; i < info->splitStringsCount; i++) {
    strcpy(out[i], info->splitStrings[i]);
  }
}

void str_split_end (str_split_infop info) {
  if (info->splitStringsCount > 0 && info->splitStrings != NULL) {
    //free each string allocated
    for (int i=0; i < info->splitStringsCount; i++) {
      free(info->splitStrings[i]);
    }
    //free string array pointer
    free (info->splitStrings);

    //free string lengths array pointer
    free(info->splitStringsLengths);

    info->splitStringsCount = 0;
  }
}

void str_split_test () {
  char * source = "hello world this is a test";
  str delimiter = " ";

  struct str_split_info info;
  
  info.source = source;
  info.delimiter = delimiter;

  str_split_begin (&info);

  //iterate thru split substrings
  //NOTE: removed/memory cleanup after str_split_end
  for (int i=0; i<info.splitStringsCount; i++) {
    // info.splitStrings[i];
  }

  str_split_end(&info);
}

#endif

Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).

I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.

Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093

Example:

#include "./strutils.c"

struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";

str_split_begin(&info);

char * substr;

for (int i=0; i<info.splitStringsCount; i++) {
  substr = info.splitStrings[i];
  printf("substring: '%s'\n", substr);
}

str_split_end(&info);

Output:

$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'

Full source of strutils.c

#ifndef STRUTILS_C
#define STRUTILS_C 1

#ifndef str
#define str char *
#endif

#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

#include <stdio.h>

struct str_split_info {
  /* The string to be split
  * Provided by caller of str_split_begin function
  */
  str source;
  /* The string that cuts the source string, all occurances of
  * this string will be removed from the source string

  * Provided by caller of str_split_begin function
  */
  str delimiter;

  /* Array of strings split by delimiter
  * Provided and allocated by str_split_begin function
  * Must be garbage collected by str_split_end function
  */
  str * splitStrings;

  /* Array of string lengths split by delimiter
    * Provided and allocated by str_split_begin function
    * Must be garbage collected by str_split_end function
    */
  int * splitStringsLengths;

  /* Number of strings split by delimiter contained in splitStrings
  * Provided by str_split_begin function
  */
  int splitStringsCount;
};
#define str_split_infop struct str_split_info *

/* Split a string by a delimiting string
* 
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
  info->splitStringsCount = 0;

  int sourceLength = strlen(info->source);
  int sourceOffset = 0;
  char sourceChar;

  int delimiterLength = strlen(info->delimiter);
  int delimiterOffset = 0;
  char delimiterChar;

  //first pass, simply count occurances so we can allocate only once
  for (sourceOffset = 0; sourceOffset<sourceLength; sourceOffset++) {
    sourceChar = info->source[sourceOffset];
    delimiterChar = info->delimiter[delimiterOffset];

    if (sourceChar == delimiterChar) {
      delimiterOffset++;

      if (delimiterOffset >= delimiterLength) {
        delimiterOffset = 0;
        //increment count
        info->splitStringsCount ++;
      }
    } else {
      delimiterOffset = 0;
    }
  }
  info->splitStringsCount++;

  //allocate arrays since we know the count
  //this one is an array of strings, which are each char arrays
  info->splitStrings = (str *) malloc(sizeof (str *) * info->splitStringsCount);
  //this one is an array of ints
  info->splitStringsLengths = (int*) malloc(sizeof(int) *info->splitStringsCount);

  int stringBegin = 0;
  int stringEnd = 0;
  int splitIndex = 0;
  int splitLength = 0;

  //second pass, fill the arrays
  for (sourceOffset = 0; sourceOffset<sourceLength; sourceOffset++) {
    sourceChar = info->source[sourceOffset];
    delimiterChar = info->delimiter[delimiterOffset];

    if (sourceChar == delimiterChar) {
      delimiterOffset++;

      //if we've reached the end of the delimiter
      if (delimiterOffset >= delimiterLength) {

        //don't worry about delimiter trailing null, strlen doesn't count those
        stringEnd = sourceOffset - delimiterLength;
        
        //char count of substring we want to split
        splitLength = stringEnd - stringBegin + 1;

        //allocate for our substring split
        info->splitStrings[splitIndex] = (str) malloc(
          //+1 for trailing null for c-string
          sizeof(char) * splitLength + 1
        );

        //copy substring from source into splitStrings array
        memcpy(
          info->splitStrings[splitIndex],
          info->source + stringBegin,
          splitLength
        );
        //explicitly set the last char of this split to a NULL just for fun
        info->splitStrings[splitIndex][splitLength] = 0x00;

        //conveniently put the substring split size for the
        //user of str_split_begin :)
        info->splitStringsLengths[splitIndex] = splitLength;

        //move to next split index
        splitIndex ++;

        //reset delimiter offset so we look for new occurances of it
        delimiterOffset = 0;

        //next substring split should occur after the current delimiter
        stringBegin = sourceOffset+1;
      }
    } else {
      //reset delimiter offset so we look for new occurances of it
      delimiterOffset = 0;
    }
  }

  //handle edge case of last substring after last delimiter
  if (stringEnd != stringBegin) {
    stringEnd = sourceLength-1;

    splitLength = stringEnd - stringBegin + 1;

    //allocate for our substring split
    info->splitStrings[splitIndex] = (str) malloc(
      //+1 for trailing null for c-string
      sizeof(char) * splitLength + 1
    );

    //copy substring from source into splitStrings array
    memcpy(
      info->splitStrings[splitIndex],
      info->source + stringBegin,
      splitLength
    );
    
  }
}
int str_split_count (str_split_infop info) {
  return info->splitStringsCount;
}

void str_split_get (str_split_infop info, str * out) {
  for (int i=0; i < info->splitStringsCount; i++) {
    strcpy(out[i], info->splitStrings[i]);
  }
}

void str_split_end (str_split_infop info) {
  if (info->splitStringsCount > 0 && info->splitStrings != NULL) {
    //free each string allocated
    for (int i=0; i < info->splitStringsCount; i++) {
      free(info->splitStrings[i]);
    }
    //free string array pointer
    free (info->splitStrings);

    //free string lengths array pointer
    free(info->splitStringsLengths);

    info->splitStringsCount = 0;
  }
}

void str_split_test () {
  char * source = "hello world this is a test";
  str delimiter = " ";

  struct str_split_info info;
  
  info.source = source;
  info.delimiter = delimiter;

  str_split_begin (&info);

  //iterate thru split substrings
  //NOTE: removed/memory cleanup after str_split_end
  for (int i=0; i<info.splitStringsCount; i++) {
    // info.splitStrings[i];
  }

  str_split_end(&info);
}

#endif
热情消退 2025-01-11 12:37:06
#include <cstring>
#include <cstdio>
int main()
{
    char buf[] = "This is Luke Skywalker    here!";
    for( char* tok = strtok( buf, " ");
         tok != nullptr;
         tok = strtok( nullptr, " ")) {
        puts( tok);
    }
}

输出

This
is
Luke
Skywalker
here!
#include <cstring>
#include <cstdio>
int main()
{
    char buf[] = "This is Luke Skywalker    here!";
    for( char* tok = strtok( buf, " ");
         tok != nullptr;
         tok = strtok( nullptr, " ")) {
        puts( tok);
    }
}

Outputs

This
is
Luke
Skywalker
here!
笑红尘 2025-01-11 12:37:06

我尝试做一个非常简单的。我还在 main() 中展示了示例。

#include <stdio.h>
#include <string.h>

void split(char* inputArr, char** outputArr, char* delim) {
    
        char *temp;
        temp = strtok(inputArr, delim);

        for(int i = 0; temp != NULL; i++) {
            outputArr[i] = temp;
            temp = strtok(NULL, delim);
        }
}

int main(int argc, char **argv){
    
    /* check for proper arguments */
    
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {

        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile;
        myScriptFile = fopen(argv[1], "r");
        
        /* read txt file and split into array like java split() */
        
        int bufferLen = 100;
        char buffer[bufferLen];
        
        char *splitArr[100];        

        while(fgets(buffer, bufferLen, myScriptFile) != NULL){
            
            split(buffer, splitArr, " ");

            printf("Index 0 String: %s\n", splitArr[0]);
            printf("Index 1 String: %s\n", splitArr[1]);
            printf("Index 2 String: %s\n", splitArr[2]);
            printf("Index 3 String: %s\n", splitArr[3]);
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}

假设 .txt 文件已

Hello this is test
Hello2 this is test2

使用 .txt 文件作为参数运行它,则会给出

Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test

Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

I tried to make a very simple one. I am also showing example in the main().

#include <stdio.h>
#include <string.h>

void split(char* inputArr, char** outputArr, char* delim) {
    
        char *temp;
        temp = strtok(inputArr, delim);

        for(int i = 0; temp != NULL; i++) {
            outputArr[i] = temp;
            temp = strtok(NULL, delim);
        }
}

int main(int argc, char **argv){
    
    /* check for proper arguments */
    
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {

        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile;
        myScriptFile = fopen(argv[1], "r");
        
        /* read txt file and split into array like java split() */
        
        int bufferLen = 100;
        char buffer[bufferLen];
        
        char *splitArr[100];        

        while(fgets(buffer, bufferLen, myScriptFile) != NULL){
            
            split(buffer, splitArr, " ");

            printf("Index 0 String: %s\n", splitArr[0]);
            printf("Index 1 String: %s\n", splitArr[1]);
            printf("Index 2 String: %s\n", splitArr[2]);
            printf("Index 3 String: %s\n", splitArr[3]);
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}

Assume a .txt file has

Hello this is test
Hello2 this is test2

running it with a .txt file as a parameter would give

Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test

Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文