当前位置：文江博客话题详情

如何在 C++ 中逐行读取 .gz 文件？

发布于 2024-09-08 11:05:53 字数 96 浏览 4 评论 0原文

我有 3 TB 的 .gz 文件，想要在 C++ 程序中逐行读取其未压缩的内容。由于文件相当大，我想避免将其完全加载到内存中。

任何人都可以发布一个简单的例子吗？

原文

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

那一片橙海， 2024-09-15 11:05:53

您很可能必须使用 ZLib 的 deflate，示例可以从他们的网站获取

或者，您也可以看看 BOOST C++ 包装器

示例来自 BOOST 页（从文件中解压缩数据并将其写入标准输出）

#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/zlib.hpp>

int main() 
{
    using namespace std;

    ifstream file("hello.z", ios_base::in | ios_base::binary);
    filtering_streambuf<input> in;
    in.push(zlib_decompressor());
    in.push(file);
    boost::iostreams::copy(in, cout);
}

You most probably will have to use ZLib's deflate, example is available from their site

Alternatively you may have a look at BOOST C++ wrapper

The example from BOOST page (decompresses data from a file and writes it to standard output)

#include <fstream>
#include <iostream>
#include <boost/iostreams/filtering_streambuf.hpp>
#include <boost/iostreams/copy.hpp>
#include <boost/iostreams/filter/zlib.hpp>

int main() 
{
    using namespace std;

    ifstream file("hello.z", ios_base::in | ios_base::binary);
    filtering_streambuf<input> in;
    in.push(zlib_decompressor());
    in.push(file);
    boost::iostreams::copy(in, cout);
}

回复收藏 0 原文

遥远的绿洲 2024-09-15 11:05:53

对于要经常使用的东西，您可能想要使用之前的建议之一。或者，您可以执行

gzcat file.gz | yourprogram

并让 yourprogram 从 cin 读取。这将根据需要解压缩内存中的部分文件，并将未压缩的输出发送到 yourprogram。

For something that is going to be used regularly, you probably want to use one of the previous suggestions. Alternatively, you can do

gzcat file.gz | yourprogram

and have yourprogram read from cin. This will decompress parts of the file in memory as it is needed, and send the uncompressed output to yourprogram.

回复收藏 0 原文

溺渁∝ 2024-09-15 11:05:53

使用zlib，我正在按照以下方式做一些事情：

// Reads one line from the gzip file `f` and returns it in a std::vector< char >
// without the trailing "\n" or "\r\n". The buffer starts at 256 bytes and is
// doubled until the whole line fits. An empty vector is returned both for an
// empty line and when gzgets() reports end-of-file or an error.
std::vector< char > readline( gzFile f ) {
    std::vector< char > v( 256 );
    std::size_t pos = 0;
    for ( ;; ) {
        if ( gzgets( f, &v[ pos ], static_cast< int >( v.size() - pos ) ) == 0 ) {
            // end-of-file or error
            int err;
            gzerror( f, &err );
            if ( err != Z_OK ) {
                // handle error
            }
            break;
        }
        const std::size_t read = strlen( &v[ pos ] );
        if ( read == 0 ) {
            // Nothing usable was appended (e.g. the data starts with a NUL
            // byte). Checked first so v[ pos + read - 1 ] below can never
            // index before the start of the buffer when pos is 0.
            break;
        }
        if ( v[ pos + read - 1 ] == '\n' ) {
            // Complete line: drop "\n", and a preceding '\r' if present.
            if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
                pos = pos + read - 2;
            } else {
                pos = pos + read - 1;
            }
            break;
        }
        if ( pos + read < v.size() - 1 ) {
            // Buffer not filled and no newline: last line of the file.
            pos = read + pos;
            break;
        }
        // Buffer is full without a newline: continue at the position of the
        // terminating NUL with twice the space.
        pos = v.size() - 1;
        v.resize( v.size() * 2 );
    }
    v.resize( pos );
    return v;
}

编辑：删除了上例中两个误复制的 *。
编辑：更正了 v[pos + read - 2] 上的越界读取

Using zlib, I'm doing something along these lines:

// Reads one line from the gzip file `f` and returns it in a std::vector< char >
// without the trailing "\n" or "\r\n". The buffer starts at 256 bytes and is
// doubled until the whole line fits. An empty vector is returned both for an
// empty line and when gzgets() reports end-of-file or an error.
std::vector< char > readline( gzFile f ) {
    std::vector< char > v( 256 );
    std::size_t pos = 0;
    for ( ;; ) {
        if ( gzgets( f, &v[ pos ], static_cast< int >( v.size() - pos ) ) == 0 ) {
            // end-of-file or error
            int err;
            gzerror( f, &err );
            if ( err != Z_OK ) {
                // handle error
            }
            break;
        }
        const std::size_t read = strlen( &v[ pos ] );
        if ( read == 0 ) {
            // Nothing usable was appended (e.g. the data starts with a NUL
            // byte). Checked first so v[ pos + read - 1 ] below can never
            // index before the start of the buffer when pos is 0.
            break;
        }
        if ( v[ pos + read - 1 ] == '\n' ) {
            // Complete line: drop "\n", and a preceding '\r' if present.
            if ( pos + read >= 2 && v[ pos + read - 2 ] == '\r' ) {
                pos = pos + read - 2;
            } else {
                pos = pos + read - 1;
            }
            break;
        }
        if ( pos + read < v.size() - 1 ) {
            // Buffer not filled and no newline: last line of the file.
            pos = read + pos;
            break;
        }
        // Buffer is full without a newline: continue at the position of the
        // terminating NUL with twice the space.
        pos = v.size() - 1;
        v.resize( v.size() * 2 );
    }
    v.resize( pos );
    return v;
}

EDIT: Removed two mis-copied * in the example above.
EDIT: Corrected out of bounds read on v[pos + read - 2]

回复收藏 0 原文

倒数 2024-09-15 11:05:53

zlib 库支持在内存中以块的方式解压文件，因此您不必为了处理它而先解压整个文件。

回复收藏 0 原文

绿光 2024-09-15 11:05:53

下面是一些代码，您可以使用它们逐行读取普通文件和压缩文件：

// Usage sketch: read `file` line by line, whether plain or gzip-compressed.
// open_file() and endsWith() are helpers that are not shown in this snippet.
char line[0x10000];
FILE *infile=open_file(file);
bool gzipped=endsWith(file, ".gz");
if(gzipped) 
    init_gzip_stream(infile,&line[0]);
while (readLine(infile,line,gzipped)) {
    if(line[0]==0)continue;// skip gzip new_block
    // The line is file data, so it must be printed through "%s" and never be
    // used as the format string itself (a '%' in the file would be interpreted).
    printf("%s", line);
}


#include <zlib.h>
// Number of compressed bytes read from the file per step.
#define CHUNK 0x100
// Size of the decompressed-output buffer. Parenthesized so the macro expands
// safely inside larger expressions (e.g. `x % OUT_CHUNK`).
#define OUT_CHUNK (CHUNK*100)
// Compressed input handed to zlib.
unsigned char gzip_in[CHUNK];
// Decompressed output produced by zlib.
unsigned char gzip_out[OUT_CHUNK];
// Parameters for inflateInit2. See http://zlib.net/manual.html for the exact meanings.
#define windowBits 15
#define ENABLE_ZLIB_GZIP 32
// The single decompression stream shared by the functions below.
z_stream strm = {0};
/**
 * Prepares the global stream `strm` for decompression with inflateInit2,
 * using windowBits | ENABLE_ZLIB_GZIP as the window parameter.
 *
 * @param file  unused; kept for interface compatibility
 * @param out   unused; kept for interface compatibility
 * @return a copy of the global stream struct. Decompression must keep using
 *         the global `strm`: zlib functions should be given the struct that
 *         was initialised, not a copy of it.
 *
 * A failing inflateInit2 is reported on stderr instead of being ignored.
 */
z_stream init_gzip_stream(FILE* file,char* out){// unsigned     
    (void)file;
    (void)out;
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.next_in = gzip_in;
    strm.avail_in = 0;
    strm.next_out = gzip_out;
    const int rc = inflateInit2 (& strm, windowBits | ENABLE_ZLIB_GZIP);
    if (rc != Z_OK) {
        fprintf(stderr, "init_gzip_stream: inflateInit2 failed (%d): %s\n",
                rc, strm.msg ? strm.msg : zError(rc));
    }
    return strm;
}

bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
            strm.avail_in = (int)bytes_read;
            do {
                strm.avail_out = OUT_CHUNK;
                inflate (& strm, Z_NO_FLUSH);
//              printf ("%s",gzip_out);
            }while (strm.avail_out == 0);
            if (feof (file)) {
                inflateEnd (& strm);
                return false;
            }
    return true;// all OK
}


// Parsing state for readLine(): gzip_out is consumed as a sequence of
// '\n'-separated lines.
char* first_line=(char*)&gzip_out[0];
char* current_line=first_line;
char* next_line=first_line;
// Start of a line whose remainder has not been decompressed yet.
char hangover[0x10000];

/**
 * Reads the next line of `infile` into `line`.
 *
 * Plain files are read with fgets(), so the newline is kept. For gzip input
 * the decompressed buffer is split at '\n' and the newline is removed.
 *
 * For gzip input an empty `line` together with a true result means "no
 * complete line available yet, call again" (the buffer ended in the middle of
 * a line). An empty line in the data looks the same to the caller.
 *
 * @param infile     file to read from
 * @param line       destination buffer
 * @param gzipped    true to decompress through the global stream `strm`
 * @param line_size  capacity of `line` in bytes. The default matches the
 *                   0x10000-byte buffer of the usage example; callers with a
 *                   different buffer must pass its real size. Longer lines
 *                   are truncated.
 * @return false at end of input or on a decompression error, true otherwise
 *
 * The gzip state is global and single-use: once the stream has ended, further
 * calls keep returning false.
 */
bool readLine(FILE* infile,char* line,bool gzipped,size_t line_size=0x10000){
    if(line_size==0)
        return false;
    if(!gzipped)
        // sizeof(line) would be the size of the pointer, not of the buffer.
        return fgets(line, static_cast<int>(line_size), infile) != NULL;

    static bool stream_finished=false;

    current_line=next_line;
    if(!current_line || current_line[0]==0){
        // Decompressed buffer exhausted: fetch and inflate the next chunk.
        bool ok=false;
        if(!stream_finished){
            size_t bytes_read = fread (gzip_in, sizeof (char), CHUNK, infile);
            ok=inflate_gzip(infile,strm,bytes_read);
        }
        if(!ok){
            stream_finished=true;
            next_line=NULL;
            if(hangover[0]==0)
                return false;
            // Last line of the input had no trailing newline: deliver it once.
            snprintf(line, line_size, "%s", hangover);
            hangover[0]=0;
            return true;
        }
        current_line=first_line;
    }

    char* newline=strchr(current_line,'\n');
    if(newline){
        *newline=0;
        next_line=newline+1;
        // Carried-over fragment (possibly empty) followed by the rest of the line.
        snprintf(line, line_size, "%s%s", hangover, current_line);
        hangover[0]=0;
    }else{
        // No newline in the remaining data: append it to the carried fragment
        // (bounded) and ask the caller to call again.
        const size_t used=strlen(hangover);
        snprintf(hangover+used, sizeof(hangover)-used, "%s", current_line);
        next_line=NULL;
        line[0]=0;// skip that one!!
    }
    return true;
}

Here is some code with which you can read normal and zipped files line by line:

// Usage sketch: read `file` line by line, whether plain or gzip-compressed.
// open_file() and endsWith() are helpers that are not shown in this snippet.
char line[0x10000];
FILE *infile=open_file(file);
bool gzipped=endsWith(file, ".gz");
if(gzipped) 
    init_gzip_stream(infile,&line[0]);
while (readLine(infile,line,gzipped)) {
    if(line[0]==0)continue;// skip gzip new_block
    // The line is file data, so it must be printed through "%s" and never be
    // used as the format string itself (a '%' in the file would be interpreted).
    printf("%s", line);
}


#include <zlib.h>
// Number of compressed bytes read from the file per step.
#define CHUNK 0x100
// Size of the decompressed-output buffer. Parenthesized so the macro expands
// safely inside larger expressions (e.g. `x % OUT_CHUNK`).
#define OUT_CHUNK (CHUNK*100)
// Compressed input handed to zlib.
unsigned char gzip_in[CHUNK];
// Decompressed output produced by zlib.
unsigned char gzip_out[OUT_CHUNK];
// Parameters for inflateInit2. See http://zlib.net/manual.html for the exact meanings.
#define windowBits 15
#define ENABLE_ZLIB_GZIP 32
// The single decompression stream shared by the functions below.
z_stream strm = {0};
/**
 * Prepares the global stream `strm` for decompression with inflateInit2,
 * using windowBits | ENABLE_ZLIB_GZIP as the window parameter.
 *
 * @param file  unused; kept for interface compatibility
 * @param out   unused; kept for interface compatibility
 * @return a copy of the global stream struct. Decompression must keep using
 *         the global `strm`: zlib functions should be given the struct that
 *         was initialised, not a copy of it.
 *
 * A failing inflateInit2 is reported on stderr instead of being ignored.
 */
z_stream init_gzip_stream(FILE* file,char* out){// unsigned     
    (void)file;
    (void)out;
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.next_in = gzip_in;
    strm.avail_in = 0;
    strm.next_out = gzip_out;
    const int rc = inflateInit2 (& strm, windowBits | ENABLE_ZLIB_GZIP);
    if (rc != Z_OK) {
        fprintf(stderr, "init_gzip_stream: inflateInit2 failed (%d): %s\n",
                rc, strm.msg ? strm.msg : zError(rc));
    }
    return strm;
}

bool inflate_gzip(FILE* file, z_stream strm,size_t bytes_read){
            strm.avail_in = (int)bytes_read;
            do {
                strm.avail_out = OUT_CHUNK;
                inflate (& strm, Z_NO_FLUSH);
//              printf ("%s",gzip_out);
            }while (strm.avail_out == 0);
            if (feof (file)) {
                inflateEnd (& strm);
                return false;
            }
    return true;// all OK
}


// Parsing state for readLine(): gzip_out is consumed as a sequence of
// '\n'-separated lines.
char* first_line=(char*)&gzip_out[0];
char* current_line=first_line;
char* next_line=first_line;
// Start of a line whose remainder has not been decompressed yet.
char hangover[0x10000];

/**
 * Reads the next line of `infile` into `line`.
 *
 * Plain files are read with fgets(), so the newline is kept. For gzip input
 * the decompressed buffer is split at '\n' and the newline is removed.
 *
 * For gzip input an empty `line` together with a true result means "no
 * complete line available yet, call again" (the buffer ended in the middle of
 * a line). An empty line in the data looks the same to the caller.
 *
 * @param infile     file to read from
 * @param line       destination buffer
 * @param gzipped    true to decompress through the global stream `strm`
 * @param line_size  capacity of `line` in bytes. The default matches the
 *                   0x10000-byte buffer of the usage example; callers with a
 *                   different buffer must pass its real size. Longer lines
 *                   are truncated.
 * @return false at end of input or on a decompression error, true otherwise
 *
 * The gzip state is global and single-use: once the stream has ended, further
 * calls keep returning false.
 */
bool readLine(FILE* infile,char* line,bool gzipped,size_t line_size=0x10000){
    if(line_size==0)
        return false;
    if(!gzipped)
        // sizeof(line) would be the size of the pointer, not of the buffer.
        return fgets(line, static_cast<int>(line_size), infile) != NULL;

    static bool stream_finished=false;

    current_line=next_line;
    if(!current_line || current_line[0]==0){
        // Decompressed buffer exhausted: fetch and inflate the next chunk.
        bool ok=false;
        if(!stream_finished){
            size_t bytes_read = fread (gzip_in, sizeof (char), CHUNK, infile);
            ok=inflate_gzip(infile,strm,bytes_read);
        }
        if(!ok){
            stream_finished=true;
            next_line=NULL;
            if(hangover[0]==0)
                return false;
            // Last line of the input had no trailing newline: deliver it once.
            snprintf(line, line_size, "%s", hangover);
            hangover[0]=0;
            return true;
        }
        current_line=first_line;
    }

    char* newline=strchr(current_line,'\n');
    if(newline){
        *newline=0;
        next_line=newline+1;
        // Carried-over fragment (possibly empty) followed by the rest of the line.
        snprintf(line, line_size, "%s%s", hangover, current_line);
        hangover[0]=0;
    }else{
        // No newline in the remaining data: append it to the carried fragment
        // (bounded) and ask the caller to call again.
        const size_t used=strlen(hangover);
        snprintf(hangover+used, sizeof(hangover)-used, "%s", current_line);
        next_line=NULL;
        line[0]=0;// skip that one!!
    }
    return true;
}

回复收藏 0 原文