C - 尝试返回到文件中的上一行

发布于 2024-12-08 03:43:28 字数 1327 浏览 5 评论 0原文

我必须阅读一个可以以可选注释开头的文本文件。在实践中，我必须跳过文件开头不以“@”或“>”开头的任何行。在我的测试用例中，文件如下所示：

# Sun Jul 12 22:04:52 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/solid0065/primary.20090712170542775 
# Cwd: /state/partition1/home/pipeline
# Title: solid0065_20090629_FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
T1230330231223011323010013

所以我必须跳过前 3 行（但一般来说我必须跳过 n 行）。我必须用 2 或 4 个文件（位于 FILE** inputFiles 内）重复此操作。我尝试过这个循环：

/* Skip the leading lines of every input file that do not start with '@' or
 * '>', then try to leave each stream positioned at the start of the first
 * line that does. */
buffer = (char*) malloc (sizeof(char) * 5000);
if (buffer == NULL)
    notEnoughMemory();

for (i = 0; i < (cIn-1); i++){
    /* Save the position before each read so the last line read can be "un-read". */
    fgetpos(inputFiles[i], &position);
    fgets(buffer, 4999, inputFiles[i]);
    /* NOTE(review): the fgets() result is never checked. At end-of-file fgets()
     * returns NULL and leaves buffer unchanged, so a file without any '@'/'>'
     * line keeps this loop spinning. */
    while ((buffer[0] != '@') && (buffer[0] != '>')){
        fgetpos(inputFiles[i], &position);
        fgets(buffer, 4999, inputFiles[i]);
    }
    /* Restore the position saved just before the line that ended the loop. */
    fsetpos(inputFiles[i], &position);
}

其中 cIn 是 number_of_input_files + 1。尝试对其进行调试，循环在读取第四行后正确停止。但是当我使用 setpos 时，它不会像我预期的那样返回到第四行的开头，而是返回到第三行的中间。事实上，如果在 fsetpos() 之后，我在这些操作之后打印缓冲区：

/* Two consecutive line reads from the current stream position; each call overwrites buffer. */
fgets(buffer, 4999, inputFiles[i]);
fgets(buffer, 4999, inputFiles[i]);

我得到：

FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3

有什么想法吗？提前致谢

原文

I have to read a text file which can begin with optional comments. In practice I have to skip any line at the beginning of the file that doesn't begin with '@' or '>'.
In my test case the file looks like:

# Sun Jul 12 22:04:52 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/solid0065/primary.20090712170542775 
# Cwd: /state/partition1/home/pipeline
# Title: solid0065_20090629_FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
T1230330231223011323010013

So I have to skip the first 3 lines (but in general I have to skip n lines). I have to repeat this with 2 or 4 files [which are inside FILE** inputFiles]. I've tried with this loop:

/* Skip the leading lines of every input file that do not start with '@' or
 * '>', then try to leave each stream positioned at the start of the first
 * line that does. */
buffer = (char*) malloc (sizeof(char) * 5000);
if (buffer == NULL)
    notEnoughMemory();

for (i = 0; i < (cIn-1); i++){
    /* Save the position before each read so the last line read can be "un-read". */
    fgetpos(inputFiles[i], &position);
    fgets(buffer, 4999, inputFiles[i]);
    /* NOTE(review): the fgets() result is never checked. At end-of-file fgets()
     * returns NULL and leaves buffer unchanged, so a file without any '@'/'>'
     * line keeps this loop spinning. */
    while ((buffer[0] != '@') && (buffer[0] != '>')){
        fgetpos(inputFiles[i], &position);
        fgets(buffer, 4999, inputFiles[i]);
    }
    /* Restore the position saved just before the line that ended the loop. */
    fsetpos(inputFiles[i], &position);
}

Where cIn is number_of_input_files + 1.
While debugging, I can see that the loop correctly stops after it reads the fourth line. But when I call fsetpos() it doesn't go back to the beginning of the fourth line as I'd expect, but to the middle of the third.
In fact if, exactly after the fsetpos(), I print buffer after these operations:

/* Two consecutive line reads from the current stream position; each call overwrites buffer. */
fgets(buffer, 4999, inputFiles[i]);
fgets(buffer, 4999, inputFiles[i]);

I get:

FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3

Any idea?
Thanks in advance

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

秋叶绚丽 2024-12-15 03:43:28

而不是 fgetpos(); fsetpos(); 你可能会使用
/* Step back over the line fgets() just stored in buffer.  strlen() yields an
 * unsigned size_t, so convert it to long BEFORE negating; negating the
 * size_t itself produces a huge positive value rather than a negative one.
 * NOTE(review): ISO C only guarantees relative seeks like this on binary
 * streams; on a text stream, seek to an offset previously returned by ftell(). */
fseek(inputFiles[i], -(long)strlen(buffer), SEEK_CUR);

回复收藏 0 原文

桜花祭 2024-12-15 03:43:28

（恕我直言）最好的方法是将整个文件读入一个大缓冲区（mmap 也是一个选项，如果可用），然后查找并修复行结尾和 fasta 标题。这也将减少内存碎片。它大大简化了“解析器”。

编辑：添加源（它并不完美，但上次我检查它时，它有效；-）可能不完整，我从一个更大的程序中剪下了它。

/* One FASTA record as indexed by read_complete_fasta(). */
struct fastapart {
  char * name;              /* record name; set to point into the in-memory file buffer */
  char * data;              /* sequence text; also points into the in-memory file buffer */
  unsigned size;            /* byte length of data, copied from the forward string buffer */
  struct roedel *friends;   /* only ever set to NULL by the code shown; purpose not visible here */
  };
/* A complete FASTA file held in memory, plus the index of its records. */
struct fastafile {
  size_t totsize;            /* number of bytes read into tot */
  char *tot;                 /* file contents as returned by read_complete_file() */
  unsigned count;            /* number of entries in parts once parsing has finished */
  struct fastapart *parts;   /* array of records */
  int *alloc;                /* count ints filled with 0..count-1 by read_complete_fasta(); meaning not visible here */
  };

char * read_complete_file(char *name, size_t *sizep);

/*
 * Load the FASTA file `name` into memory and index its records.
 *
 * The file is read in one piece with read_complete_file() and then scanned
 * by a small state machine:
 *   state 0  look for '>' at the start of a line
 *   state 1  skip forward to the next newline
 *   state 2  just past '>': take the record name
 *   state 3  collect the record's data lines
 * Record names and data are not copied out: they point into the file buffer
 * (result->tot), which is NUL-terminated in place.
 *
 * Returns a malloc()ed struct fastafile that owns ->tot, ->parts and ->alloc
 * (->alloc may be NULL if its allocation failed), or NULL on failure.
 */
struct fastafile * read_complete_fasta(char *name)
{
int state;
struct fastafile * result;
size_t pos,len,cnt;
struct strbuff *fwd=NULL,*rev = NULL;

result = malloc (sizeof *result);
if (!result) return NULL;
/* Give every member a defined value first so the failure path can free blindly. */
result->totsize = 0;
result->count = 0;
result->parts = NULL;
result->alloc = NULL;
result->tot = read_complete_file(name , &result->totsize);
if (!result->tot) goto failfree;

for (pos=cnt=state=0; pos < result->totsize; ) {
switch (state) {
case 0: /* find first '>' */
  if (result->tot[pos] == '>') { pos++; state=2; continue; }
  pos += strcspn( result->tot+pos, "\n" );
  /* fallthrough */
case 1: /* not found: sync to newline */
  if (result->tot[pos] == '\n') { pos++; state=0; continue; }
  else pos++;
  continue;
case 2: /* Got '>'; grab name */
  len = strcspn( result->tot+pos, " \t\n" );
  if (cnt >= result->count) {
    size_t siz;
    struct fastapart *grown;
    siz = result->count ? 2* result->count: 16;
    /* Keep the old pointer until the new one is known to be valid. */
    grown = realloc( result->parts, siz * sizeof *result->parts);
    if (!grown) goto failfree;
    result->parts = grown;
    /* Clear each NEW slot (index result->count, not cnt). */
    for (  ; result->count < siz;result->count ++) {
      result->parts[result->count].name = NULL;
      result->parts[result->count].data = NULL;
      result->parts[result->count].friends = NULL;
      result->parts[result->count].size = 0;
      }
    }

  result->parts[cnt].name = result->tot+pos;
  result->parts[cnt].name[len] = 0;
  pos += len;
  /* Step over the delimiter just overwritten, but never past the final NUL. */
  if (pos < result->totsize) pos++;
  pos += strspn( result->tot+pos, " \t\n" );
  state++;
  continue;
case 3: /* grab data; for the moment, throw away reversed data */
  if (result->tot[pos] == '>') {
    if (fwd) {
      /* A record without data lines has data == NULL: nothing to store. */
      if (result->parts[cnt].data) {
        memcpy(result->parts[cnt].data, fwd->data, fwd->used );
        result->parts[cnt].size = fwd->used;
        result->parts[cnt].data [ fwd->used ] = 0;
        }
      fwd->used = 0; }
    if (rev) {
      /* reversed data is discarded for now */
      rev->used = 0;
      }
    if (result->parts[cnt].data) cnt++;
    pos++; state=2;
    continue;
    }
  len = strcspn( result->tot+pos, "\t\n" );
  if (!len) { /* empty line; what to do? skip it! */
    fprintf(stderr, "Empty\n" );
    pos++; state=1;
    continue; }
  if (!result->parts[cnt].data) {result->parts[cnt].data = result->tot+pos;  }
  fwd = strbuff_add(fwd, result->tot+pos, len);
  pos += len;
  if (result->tot[pos] == '\t' ) {
    pos += strspn(result->tot+pos, " \t" );
    len = strcspn( result->tot+pos, "\n" );
    rev =  strbuff_add(rev, result->tot+pos, len);
    pos += len;
    }
  pos += strspn(result->tot+pos, " \t\r\n" );
  }}
/* The input ended inside a record: finish it the same way as above. */
if (state == 3) {
  if (fwd) {
    if (result->parts[cnt].data) {
      memcpy(result->parts[cnt].data, fwd->data, fwd->used );
      result->parts[cnt].size = fwd->used;
      result->parts[cnt].data [ fwd->used ] = 0;
      }
    fwd->used = 0;
    }
  if (rev) {
    /* reversed data is discarded for now */
    rev->used = 0;
    }
  if (result->parts[cnt].data) cnt++;
  }
  /* final realloc: trim the array to the records actually found */
if (cnt == 0) {
  /* Avoid realloc(ptr, 0), whose behaviour is not portable. */
  free (result->parts);
  result->parts = NULL;
  }
else {
  struct fastapart *shrunk;
  shrunk = realloc( result->parts, cnt * sizeof *result->parts);
  if (shrunk) result->parts = shrunk; /* a failed shrink leaves the old block usable */
  }
result->count  = (unsigned) cnt;
free (fwd);
free (rev);

result->alloc = malloc( result->count * sizeof result->alloc[0] );
if (result->alloc) {
  for (cnt = 0; cnt <  result->count; cnt++ ) result->alloc[cnt] = (int) cnt;
  }
return result;

failfree:
free (fwd);
free (rev);
free (result->parts);
free (result->tot);
free (result);
return NULL;
}

/*
 * Read the whole file `name` into a freshly malloc()ed buffer.
 *
 * On success returns the buffer, which holds the bytes read followed by a
 * terminating NUL, and stores the number of bytes read in *sizep.  The
 * caller owns the buffer and must free() it.
 * On failure (open, fstat, malloc or read error) returns NULL and stores 0
 * in *sizep.
 */
char * read_complete_file(char *name, size_t *sizep)
{
int fd;
ssize_t rc;
size_t size, want;
char *result;

struct stat st;

fd = open(name, O_RDONLY);

if (fd == -1) goto fail;
if (fstat(fd, &st) == -1) goto closefail;
if (st.st_size < 0) goto closefail;
want = (size_t) st.st_size;
result = malloc (1 + want);
if (!result ) goto closefail;

for (size = 0; size < want;) {
  /* read() may return less than asked for: append after what is already there. */
  rc = read(fd, result + size, want - size);
  if (rc < 0) goto freeclosefail;
  if (rc == 0) break; /* EOF before the size fstat() reported: stop instead of spinning */
  size += (size_t) rc;
  }
result[size] = 0; /* terminate after the bytes actually read */

fprintf(stderr, "Read %lu bytes FROM %s\n"
  , (unsigned long) size, name);
close(fd);
*sizep = size;
return result;

freeclosefail:
  free(result);
closefail:
  close(fd);
fail:
  *sizep=0; return NULL;
}

(IMHO) The best approach is to read the entire file into one big buffer (mmap is also an option, if available), then find and fix the line endings and FASTA headers. This will also reduce memory fragmentation, and it simplifies the 'parser' a lot.

EDIT: added source (it is not perfect, but last time I checked it, it worked ;-) Might be incomplete, I snipped it from a larger program.

/* One FASTA record as indexed by read_complete_fasta(). */
struct fastapart {
  char * name;              /* record name; set to point into the in-memory file buffer */
  char * data;              /* sequence text; also points into the in-memory file buffer */
  unsigned size;            /* byte length of data, copied from the forward string buffer */
  struct roedel *friends;   /* only ever set to NULL by the code shown; purpose not visible here */
  };
/* A complete FASTA file held in memory, plus the index of its records. */
struct fastafile {
  size_t totsize;            /* number of bytes read into tot */
  char *tot;                 /* file contents as returned by read_complete_file() */
  unsigned count;            /* number of entries in parts once parsing has finished */
  struct fastapart *parts;   /* array of records */
  int *alloc;                /* count ints filled with 0..count-1 by read_complete_fasta(); meaning not visible here */
  };

char * read_complete_file(char *name, size_t *sizep);

/*
 * Load the FASTA file `name` into memory and index its records.
 *
 * The file is read in one piece with read_complete_file() and then scanned
 * by a small state machine:
 *   state 0  look for '>' at the start of a line
 *   state 1  skip forward to the next newline
 *   state 2  just past '>': take the record name
 *   state 3  collect the record's data lines
 * Record names and data are not copied out: they point into the file buffer
 * (result->tot), which is NUL-terminated in place.
 *
 * Returns a malloc()ed struct fastafile that owns ->tot, ->parts and ->alloc
 * (->alloc may be NULL if its allocation failed), or NULL on failure.
 */
struct fastafile * read_complete_fasta(char *name)
{
int state;
struct fastafile * result;
size_t pos,len,cnt;
struct strbuff *fwd=NULL,*rev = NULL;

result = malloc (sizeof *result);
if (!result) return NULL;
/* Give every member a defined value first so the failure path can free blindly. */
result->totsize = 0;
result->count = 0;
result->parts = NULL;
result->alloc = NULL;
result->tot = read_complete_file(name , &result->totsize);
if (!result->tot) goto failfree;

for (pos=cnt=state=0; pos < result->totsize; ) {
switch (state) {
case 0: /* find first '>' */
  if (result->tot[pos] == '>') { pos++; state=2; continue; }
  pos += strcspn( result->tot+pos, "\n" );
  /* fallthrough */
case 1: /* not found: sync to newline */
  if (result->tot[pos] == '\n') { pos++; state=0; continue; }
  else pos++;
  continue;
case 2: /* Got '>'; grab name */
  len = strcspn( result->tot+pos, " \t\n" );
  if (cnt >= result->count) {
    size_t siz;
    struct fastapart *grown;
    siz = result->count ? 2* result->count: 16;
    /* Keep the old pointer until the new one is known to be valid. */
    grown = realloc( result->parts, siz * sizeof *result->parts);
    if (!grown) goto failfree;
    result->parts = grown;
    /* Clear each NEW slot (index result->count, not cnt). */
    for (  ; result->count < siz;result->count ++) {
      result->parts[result->count].name = NULL;
      result->parts[result->count].data = NULL;
      result->parts[result->count].friends = NULL;
      result->parts[result->count].size = 0;
      }
    }

  result->parts[cnt].name = result->tot+pos;
  result->parts[cnt].name[len] = 0;
  pos += len;
  /* Step over the delimiter just overwritten, but never past the final NUL. */
  if (pos < result->totsize) pos++;
  pos += strspn( result->tot+pos, " \t\n" );
  state++;
  continue;
case 3: /* grab data; for the moment, throw away reversed data */
  if (result->tot[pos] == '>') {
    if (fwd) {
      /* A record without data lines has data == NULL: nothing to store. */
      if (result->parts[cnt].data) {
        memcpy(result->parts[cnt].data, fwd->data, fwd->used );
        result->parts[cnt].size = fwd->used;
        result->parts[cnt].data [ fwd->used ] = 0;
        }
      fwd->used = 0; }
    if (rev) {
      /* reversed data is discarded for now */
      rev->used = 0;
      }
    if (result->parts[cnt].data) cnt++;
    pos++; state=2;
    continue;
    }
  len = strcspn( result->tot+pos, "\t\n" );
  if (!len) { /* empty line; what to do? skip it! */
    fprintf(stderr, "Empty\n" );
    pos++; state=1;
    continue; }
  if (!result->parts[cnt].data) {result->parts[cnt].data = result->tot+pos;  }
  fwd = strbuff_add(fwd, result->tot+pos, len);
  pos += len;
  if (result->tot[pos] == '\t' ) {
    pos += strspn(result->tot+pos, " \t" );
    len = strcspn( result->tot+pos, "\n" );
    rev =  strbuff_add(rev, result->tot+pos, len);
    pos += len;
    }
  pos += strspn(result->tot+pos, " \t\r\n" );
  }}
/* The input ended inside a record: finish it the same way as above. */
if (state == 3) {
  if (fwd) {
    if (result->parts[cnt].data) {
      memcpy(result->parts[cnt].data, fwd->data, fwd->used );
      result->parts[cnt].size = fwd->used;
      result->parts[cnt].data [ fwd->used ] = 0;
      }
    fwd->used = 0;
    }
  if (rev) {
    /* reversed data is discarded for now */
    rev->used = 0;
    }
  if (result->parts[cnt].data) cnt++;
  }
  /* final realloc: trim the array to the records actually found */
if (cnt == 0) {
  /* Avoid realloc(ptr, 0), whose behaviour is not portable. */
  free (result->parts);
  result->parts = NULL;
  }
else {
  struct fastapart *shrunk;
  shrunk = realloc( result->parts, cnt * sizeof *result->parts);
  if (shrunk) result->parts = shrunk; /* a failed shrink leaves the old block usable */
  }
result->count  = (unsigned) cnt;
free (fwd);
free (rev);

result->alloc = malloc( result->count * sizeof result->alloc[0] );
if (result->alloc) {
  for (cnt = 0; cnt <  result->count; cnt++ ) result->alloc[cnt] = (int) cnt;
  }
return result;

failfree:
free (fwd);
free (rev);
free (result->parts);
free (result->tot);
free (result);
return NULL;
}

/*
 * Read the whole file `name` into a freshly malloc()ed buffer.
 *
 * On success returns the buffer, which holds the bytes read followed by a
 * terminating NUL, and stores the number of bytes read in *sizep.  The
 * caller owns the buffer and must free() it.
 * On failure (open, fstat, malloc or read error) returns NULL and stores 0
 * in *sizep.
 */
char * read_complete_file(char *name, size_t *sizep)
{
int fd;
ssize_t rc;
size_t size, want;
char *result;

struct stat st;

fd = open(name, O_RDONLY);

if (fd == -1) goto fail;
if (fstat(fd, &st) == -1) goto closefail;
if (st.st_size < 0) goto closefail;
want = (size_t) st.st_size;
result = malloc (1 + want);
if (!result ) goto closefail;

for (size = 0; size < want;) {
  /* read() may return less than asked for: append after what is already there. */
  rc = read(fd, result + size, want - size);
  if (rc < 0) goto freeclosefail;
  if (rc == 0) break; /* EOF before the size fstat() reported: stop instead of spinning */
  size += (size_t) rc;
  }
result[size] = 0; /* terminate after the bytes actually read */

fprintf(stderr, "Read %lu bytes FROM %s\n"
  , (unsigned long) size, name);
close(fd);
*sizep = size;
return result;

freeclosefail:
  free(result);
closefail:
  close(fd);
fail:
  *sizep=0; return NULL;
}

回复收藏 0 原文

冷默言语 2024-12-15 03:43:28

您可以跳过处理您不感兴趣的行：

for (i = 0; i < (cIn-1); i++){
    /* Walk through every line of this input; only '@' / '>' lines are handled. */
    while (fgets(buffer, 4999, inputFiles[i]) != NULL){
        if (buffer[0] != '@' && buffer[0] != '>')
            continue;   /* any other line is ignored */
        puts(buffer);
    }
}

然后您只需将 puts(buffer); 替换为处理有效行所需的代码。
（不过，从您的示例来看，您似乎只想忽略以 # 开头的行，？）

You could just skip processing the lines you are not interested in:

for (i = 0; i < (cIn-1); i++){
    /* Walk through every line of this input; only '@' / '>' lines are handled. */
    while (fgets(buffer, 4999, inputFiles[i]) != NULL){
        if (buffer[0] != '@' && buffer[0] != '>')
            continue;   /* any other line is ignored */
        puts(buffer);
    }
}

Then you just replace the puts(buffer); with the code you need to handle the valid lines.
(Although, from your example, it sounds like you would rather ignore only the lines starting with a #?)

回复收藏 0 原文

榕城若虚 2024-12-15 03:43:28

您可以获得任意给定点的位置。当你在 while 条件中检查 null 时，它确实很有帮助，但进入后你想将光标设置回上一行。

fpos_t position;

fgetpos (file, &position);

然后可以设置回相同的位置：

fsetpos (file, &position);

请按照文档操作，它已经过尝试和测试，工作正常。
http://www.cplusplus.com/reference/cstdio/fgetpos/

You can get the position at any given point. This is really helpful when you check for NULL in the while condition, but once inside the loop you want to set the cursor back to the previous line.

fpos_t position;