协程演示源码

发布于 2024-09-11 11:55:41 字数 5462 浏览 3 评论 0原文

这是一个程序示例，其中协程确实有助于简化该算法-恕我直言，否则很难实现。我还尝试为演示选择一个有用的任务 - 该实用程序将一个二进制文件到一系列 AZ 符号（以及后面），没有任何显着的冗余，并且它能够处理任何指定的字母表（参见 M.Init 行）。基本上它类似于广义的 base64。该代码经过测试并可与 MSC、IntelC 和 gcc/mingw 配合使用。

更新：该算法基于精确的算术编码，因此默认情况下它不是单行的。可以使用 putc/getc 文件 i/o 将其切成两半（因此仅保留修改后的范围编码器类和 do_process() ），但它会非常有限（例如，不适用于解码内存块或网络流）。实际上协程在这里是作为速度优化使用的，它的这篇文章的重点。不幸的是，我没有任何更简单的应用程序来正确演示这一点 - 我可以编写一个上下文建模压缩器，但这大约是 100 线路更多。

问题：
1) 如何用正确的 C++ 代码替换 INCLUDE_PROCESS_TEMPLATE 宏？
2）有没有办法在没有协程的情况下实现这个？
（但仍然具有内存中编码和缓冲文件 I/O）
3) 有任何修复/改进吗？

#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <setjmp.h>
#define NOINLINE __declspec(noinline)

typedef unsigned int   uint;
typedef unsigned char  byte;
typedef unsigned long long int qword;

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile int   state;
  volatile byte* inpptr;
  volatile byte* inpbeg;
  volatile byte* inpend;
  volatile byte* outptr;
  volatile byte* outbeg;
  volatile byte* outend;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  void chkinp( void ) { if( inpptr>=inpend ) yield(1), inpptr=*&inpptr; }
  void chkout( void ) { if( outptr>=outend ) yield(2); }
  template <int f> byte get( void ) { if( f ) chkinp(); return *inpptr++; }
  template <int f> void put( uint c ) { *outptr++ = c; if( f ) chkout(); }
  void coro_init( void ) { inpptr=inpbeg=inpend=0; outptr=outbeg=outend=0; state=0; }
  void addinp( byte* inp,uint inplen ) { inpbeg=inpptr=inp; inpend=&inp[inplen]; }
  void addout( byte* out,uint outlen ) { outbeg=outptr=out; outend=&out[outlen]; }
};

#define INCLUDE_PROCESS_TEMPLATE \
NOINLINE void call_do_process() { byte stktmp[STKPAD]; state=ptrdiff_t(stktmp); do_process(); } \
int coro_process( void ) { if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process(); return state; } 

struct Rangecoder_SH1x : coroutine {
  enum { SCALElog=15, SCALE=1<<SCALElog };
  enum { NUM=4, sTOP=0x01000000U, Thres=0xFF000000U };
  uint f_decode; // 0=encode, 1=decode;
  uint range, Cache, FFNum;
  union {
    struct { uint low; uint Carry; };
    qword lowc;
    uint  code; 
  };
  uint rc_BProcess( uint freq, uint bit ) { 
    uint rnew = (range>>SCALElog)*freq;
    if( f_decode ) bit = (code>=rnew);
    range = ((range-rnew-rnew)&(-bit)) + rnew;
    rnew &= -bit;
    if( f_decode ) code-=rnew; else lowc+=rnew;
    if( f_decode ) while( range<sTOP ) range<<=8, (code<<=8)+=get<1>();
    else while( range<sTOP ) range<<=8, ShiftLow();
    return bit;
  }
  void ShiftLow( void ) {
    if( low<Thres || Carry ) {
      put<1>( Cache+Carry );
      for(; FFNum!=0; FFNum-- ) put<1>( Carry-1 );
      Cache=low>>24; Carry=0;
    } else FFNum++;
    low<<=8;
  }
  void rc_Init( int DECODE ) {
    f_decode=DECODE; range=-1; lowc=FFNum=Cache=0;
    if( f_decode ) for(int _=0; _<NUM+0; _++) (code<<=8)+=get<1>(); 
  }
};

struct Model : Rangecoder_SH1x {
  uint DECODE, f_quit;
  enum{ lCNUM=8, CNUM=1<<lCNUM, ROWSIZE=80 };
  uint count[2*CNUM];
  enum{ inpbufsize=1<<16, outbufsize=1<<16 };
  byte inpbuf[inpbufsize], outbuf[outbufsize];

  void Init( const char* s ) {
    uint i,j;
    uint (&p)[CNUM] = (uint(&)[CNUM])count[CNUM];
    for( i=0; i<2*CNUM; i++) count[i]=0;
    for( i=0; s[i]; i++ ) p[byte(s[i])]++;
    for( j=0; j<lCNUM; j++ ) for( i=(CNUM>>j); i<((CNUM+CNUM)>>j); i++ ) count[i>>1] += count[i];
  }

  INCLUDE_PROCESS_TEMPLATE
  void do_process( void ) {
    uint i,j;
    rc_Init(1-DECODE);
    for( i=0; !f_quit; i++ ) {
      uint c=0, ctx=1;
      if( DECODE ) do c=get<1>(); while( c==10 );
      for( j=lCNUM-1; j!=-1; j-- ) {
        uint freq = count[ctx*2+0]*SCALE/(count[ctx*2+0]+count[ctx*2+1]);
        ctx += ctx + ((freq==0) ? 1 : (freq==SCALE) ? 0 : rc_BProcess(freq,(c>>j)&1));
      }
      if( !DECODE ) put<1>(ctx), (((i+1)%ROWSIZE==0)?put<1>(10),0:0);
    }
    yield(0);
  }

  void ProcessFile( uint Mode, FILE* f, FILE* g ) {
    volatile uint r; volatile qword g_len=0; uint f_len=0;
    DECODE=Mode; f_quit=0;
    if( DECODE ) addout( (byte*)&g_len, sizeof(f_len)+1 ), r=1;
    else f_len=filelength(fileno(f)), addinp( (byte*)&f_len, sizeof(f_len) ),addout(0,0), r=2;
    do {
      if( r==1 ) {
        uint l = fread( inpbuf, 1, inpbufsize, f );
        if( l>0 ) {
          addinp( inpbuf, l );
        } else {
          if( inpbeg==inpbuf+1 ) f_quit=1;
          memset( inpbuf, 0x80, inpbufsize );
          addinp( inpbuf+1, 5 ); 
        }
      } else if( r==2 ) {
        if( outbeg==outbuf ) fwrite( outbuf, 1, outptr-outbeg, g ); else g_len>>=8;
        addout( outbuf, outbufsize );
      }
      r = coro_process(); 
    } while(r);
    fwrite( outbuf, 1,outptr-outbuf, g ); // flush
    if( DECODE==0 ) fputc( 10, g ); else fflush(g), chsize( fileno(g), g_len );
  }

} M;

int main( int argc, char** argv ) {
  if( argc<4 ) return 1;
  int DECODE = argv[1][0]=='d';
  FILE* f = fopen( argv[2], "rb" ); if( f==0 ) return 2;
  FILE* g = fopen( argv[3], "wb" ); if( g==0 ) return 3;
  M.Init( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );
  M.ProcessFile( DECODE, f, g );
}

原文

Here's an example of a program, where coroutines really help to simplify
the algorithm - imho its hardly possible to implement otherwise.
I also tried to choose a useful task for the demo - this utility converts
a binary file to a sequence of A-Z symbols (and back), without any significant
redundancy, and it has an ability to work with any specified alphabet
(see M.Init line). Basically its something like generalized base64.
The code is tested and worked with MSC,IntelC and gcc/mingw.

Update: The algorithm is based on precise arithmetic coding, so its not a one-liner by default.
It may be possible to cut it in half by using putc/getc file i/o
(thus only a modified rangecoder class and do_process() would remain),
but then it would be very limited (eg. won't be applicable to decode a
memory block or network stream).
Actually coroutines are used as a speed optimization here, and its
the point of this post.
Unfortunately I don't have any simpler application to properly demonstrate this -
I could write a context modelling compressor instead, but that would be like 100
lines more.

Questions:
1) How to replace INCLUDE_PROCESS_TEMPLATE macro with proper C++ code?
2) Is there a way to implement this without coroutines?
(but still with in-memory encoding and buffered file i/o)
3) Any fixes/improvements?

#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <setjmp.h>
#define NOINLINE __declspec(noinline)

typedef unsigned int   uint;
typedef unsigned char  byte;
typedef unsigned long long int qword;

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile int   state;
  volatile byte* inpptr;
  volatile byte* inpbeg;
  volatile byte* inpend;
  volatile byte* outptr;
  volatile byte* outbeg;
  volatile byte* outend;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  void chkinp( void ) { if( inpptr>=inpend ) yield(1), inpptr=*&inpptr; }
  void chkout( void ) { if( outptr>=outend ) yield(2); }
  template <int f> byte get( void ) { if( f ) chkinp(); return *inpptr++; }
  template <int f> void put( uint c ) { *outptr++ = c; if( f ) chkout(); }
  void coro_init( void ) { inpptr=inpbeg=inpend=0; outptr=outbeg=outend=0; state=0; }
  void addinp( byte* inp,uint inplen ) { inpbeg=inpptr=inp; inpend=&inp[inplen]; }
  void addout( byte* out,uint outlen ) { outbeg=outptr=out; outend=&out[outlen]; }
};

#define INCLUDE_PROCESS_TEMPLATE \
NOINLINE void call_do_process() { byte stktmp[STKPAD]; state=ptrdiff_t(stktmp); do_process(); } \
int coro_process( void ) { if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process(); return state; } 

struct Rangecoder_SH1x : coroutine {
  enum { SCALElog=15, SCALE=1<<SCALElog };
  enum { NUM=4, sTOP=0x01000000U, Thres=0xFF000000U };
  uint f_decode; // 0=encode, 1=decode;
  uint range, Cache, FFNum;
  union {
    struct { uint low; uint Carry; };
    qword lowc;
    uint  code; 
  };
  uint rc_BProcess( uint freq, uint bit ) { 
    uint rnew = (range>>SCALElog)*freq;
    if( f_decode ) bit = (code>=rnew);
    range = ((range-rnew-rnew)&(-bit)) + rnew;
    rnew &= -bit;
    if( f_decode ) code-=rnew; else lowc+=rnew;
    if( f_decode ) while( range<sTOP ) range<<=8, (code<<=8)+=get<1>();
    else while( range<sTOP ) range<<=8, ShiftLow();
    return bit;
  }
  void ShiftLow( void ) {
    if( low<Thres || Carry ) {
      put<1>( Cache+Carry );
      for(; FFNum!=0; FFNum-- ) put<1>( Carry-1 );
      Cache=low>>24; Carry=0;
    } else FFNum++;
    low<<=8;
  }
  void rc_Init( int DECODE ) {
    f_decode=DECODE; range=-1; lowc=FFNum=Cache=0;
    if( f_decode ) for(int _=0; _<NUM+0; _++) (code<<=8)+=get<1>(); 
  }
};

struct Model : Rangecoder_SH1x {
  uint DECODE, f_quit;
  enum{ lCNUM=8, CNUM=1<<lCNUM, ROWSIZE=80 };
  uint count[2*CNUM];
  enum{ inpbufsize=1<<16, outbufsize=1<<16 };
  byte inpbuf[inpbufsize], outbuf[outbufsize];

  void Init( const char* s ) {
    uint i,j;
    uint (&p)[CNUM] = (uint(&)[CNUM])count[CNUM];
    for( i=0; i<2*CNUM; i++) count[i]=0;
    for( i=0; s[i]; i++ ) p[byte(s[i])]++;
    for( j=0; j<lCNUM; j++ ) for( i=(CNUM>>j); i<((CNUM+CNUM)>>j); i++ ) count[i>>1] += count[i];
  }

  INCLUDE_PROCESS_TEMPLATE
  void do_process( void ) {
    uint i,j;
    rc_Init(1-DECODE);
    for( i=0; !f_quit; i++ ) {
      uint c=0, ctx=1;
      if( DECODE ) do c=get<1>(); while( c==10 );
      for( j=lCNUM-1; j!=-1; j-- ) {
        uint freq = count[ctx*2+0]*SCALE/(count[ctx*2+0]+count[ctx*2+1]);
        ctx += ctx + ((freq==0) ? 1 : (freq==SCALE) ? 0 : rc_BProcess(freq,(c>>j)&1));
      }
      if( !DECODE ) put<1>(ctx), (((i+1)%ROWSIZE==0)?put<1>(10),0:0);
    }
    yield(0);
  }

  void ProcessFile( uint Mode, FILE* f, FILE* g ) {
    volatile uint r; volatile qword g_len=0; uint f_len=0;
    DECODE=Mode; f_quit=0;
    if( DECODE ) addout( (byte*)&g_len, sizeof(f_len)+1 ), r=1;
    else f_len=filelength(fileno(f)), addinp( (byte*)&f_len, sizeof(f_len) ),addout(0,0), r=2;
    do {
      if( r==1 ) {
        uint l = fread( inpbuf, 1, inpbufsize, f );
        if( l>0 ) {
          addinp( inpbuf, l );
        } else {
          if( inpbeg==inpbuf+1 ) f_quit=1;
          memset( inpbuf, 0x80, inpbufsize );
          addinp( inpbuf+1, 5 ); 
        }
      } else if( r==2 ) {
        if( outbeg==outbuf ) fwrite( outbuf, 1, outptr-outbeg, g ); else g_len>>=8;
        addout( outbuf, outbufsize );
      }
      r = coro_process(); 
    } while(r);
    fwrite( outbuf, 1,outptr-outbuf, g ); // flush
    if( DECODE==0 ) fputc( 10, g ); else fflush(g), chsize( fileno(g), g_len );
  }

} M;

int main( int argc, char** argv ) {
  if( argc<4 ) return 1;
  int DECODE = argv[1][0]=='d';
  FILE* f = fopen( argv[2], "rb" ); if( f==0 ) return 2;
  FILE* g = fopen( argv[3], "wb" ); if( g==0 ) return 3;
  M.Init( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );
  M.ProcessFile( DECODE, f, g );
}

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

秋千易 2024-09-18 11:55:41

只是为了笑，这里是我如何处理仅对任意字母进行编码/解码的部分的粗略想法。正如所承诺的，实际的编码/解码大约是十几行代码。整体大小较大，很大程度上是因为我自始至终都使用了模板，因此数字可以是任意整数类型，字符可以是任意字符类型，并且它都使用迭代器，因此它可以读取/写入任意集合（流、字符串流、向量等）

编辑：修改代码以从文件读取输入并将输出写入文件（并修复一两个小错误）：

#include <iterator>
#include <iostream>
#include <string>
#include <limits>
#include <vector>
#include <fstream>
#include <time.h>
#include <math.h>
#include <stdlib.h>

template <class intT>
intT log2(intT input) { 
    return intT(log10((double)input) / log10(2.0));
}

template <class intT>
class coder { 
    std::string alphabet;   
    size_t range;
    unsigned ratio;
public:
    coder(std::string const &alpha) : alphabet(alpha), range(alpha.size()) {
        ratio = ceil(double(log2(std::numeric_limits<intT>::max())/log2(range)));
    }

    template <class inIt, class outIt>
    void encode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            intT val = *begin++;
            for (int i=0; i<ratio; i++) {
                *out++ = alphabet[val % range];
                val /= range;
            }
        }
    }

    template <class inIt, class outIt>
    void decode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            int temp = 0;
            for (int i=0; i<ratio; i++)
                temp += alphabet.find(*begin++) * pow((double)range, i);
            *out++ = temp;
        }
    }
};

int main(int argc, char **argv) {
    if (argc != 3) {
        std::cerr << "Usage: encode <infile> <outfile>\n";
        return EXIT_FAILURE;
    }

    coder<unsigned> enc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    std::ifstream in(argv[1], std::ios::binary);
    std::ofstream out(argv[2]);
    clock_t start = clock();
    enc.encode(std::istream_iterator<char>(in), 
        std::istream_iterator<char>(), 
        std::ostream_iterator<char>(out, ""));
    clock_t stop = clock();
    std::cerr << "Processing time: " << double(stop-start)/CLOCKS_PER_SEC << "\n";
    return 0;
}

至少目前，我忽略了算术编码部分，但它应该（至少在我看来）遵循类似的结构，这样你就可以很容易地将东西或多或少任意地串在一起。

就比较速度和大小而言，请记住，这（根本）没有进行任何压缩，而只是进行了 baseX 编码——在这种情况下，尝试与进行压缩的东西进行比较没有任何实际意义（除了，例如，了解压缩的效果如何——但如果它确实有效，它显然会产生更小的输出）。

就可执行文件的大小而言，我只能说 gcc 生成大型可执行文件从来没有让我感到惊讶。使用 MS VC++，我获得了上述代码的 9,728 字节的可执行文件。

Just for grins, here's a rough idea of how I'd handle the part that just encodes to/decodes from an arbitrary alphabet. As promised, the actual encoding/decoding is around a dozen lines of code. The overall size is larger, largely because I've used templates throughout, so the numbers can be an arbitrary integer type, and the characters can be an arbitrary character type, and it uses iterators for both, so it can read from/write to arbitrary collections (streams, stringstreams, vectors, etc.)

Edit: modified code to read input from a file and write output to a file (and fixed a minor error or two):

#include <iterator>
#include <iostream>
#include <string>
#include <limits>
#include <vector>
#include <fstream>
#include <time.h>
#include <math.h>
#include <stdlib.h>

template <class intT>
intT log2(intT input) { 
    return intT(log10((double)input) / log10(2.0));
}

template <class intT>
class coder { 
    std::string alphabet;   
    size_t range;
    unsigned ratio;
public:
    coder(std::string const &alpha) : alphabet(alpha), range(alpha.size()) {
        ratio = ceil(double(log2(std::numeric_limits<intT>::max())/log2(range)));
    }

    template <class inIt, class outIt>
    void encode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            intT val = *begin++;
            for (int i=0; i<ratio; i++) {
                *out++ = alphabet[val % range];
                val /= range;
            }
        }
    }

    template <class inIt, class outIt>
    void decode(inIt begin, inIt end, outIt out) { 
        while (begin != end) {
            int temp = 0;
            for (int i=0; i<ratio; i++)
                temp += alphabet.find(*begin++) * pow((double)range, i);
            *out++ = temp;
        }
    }
};

int main(int argc, char **argv) {
    if (argc != 3) {
        std::cerr << "Usage: encode <infile> <outfile>\n";
        return EXIT_FAILURE;
    }

    coder<unsigned> enc("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    std::ifstream in(argv[1], std::ios::binary);
    std::ofstream out(argv[2]);
    clock_t start = clock();
    enc.encode(std::istream_iterator<char>(in), 
        std::istream_iterator<char>(), 
        std::ostream_iterator<char>(out, ""));
    clock_t stop = clock();
    std::cerr << "Processing time: " << double(stop-start)/CLOCKS_PER_SEC << "\n";
    return 0;
}

At least for the moment, I've ignored the arithmetic encoding part, but it should (at least IMO) follow a similar structure so you could pretty easily string things together more or less arbitrarily.

As far as comparing speed and size goes, keep in mind that this isn't doing any compression (at all) just the baseX encoding -- that being the case, attempting to compare to something that does compression makes no real sense (except, for example, to get an idea of how effective the compression is -- but if it's effective at all, it'll obviously produce smaller output).

As far as executable size goes, about all I can say is that gcc producing large executables never surprises me. Using MS VC++, I get an executable of 9,728 bytes for the code above.

回复收藏 0 原文

生生不灭 2024-09-18 11:55:41

可移植地实现协程是一项艰巨的任务。请考虑使用 Boost.coroutine 候选者。此处是库的更新。

我在 OS X 和 Linux 上经常将它与 boost::asio 一起使用，并且它们已被证明是非常稳健的实现，并且是非常有用的线程抽象，具有顺序程序的确定性行为

我不知道为什么它尚未添加到主要的 boost 发行版中。我的猜测是，这一事实背后有一些伪装成技术争论的政治争论，尽管我们鼓励您对我的偏执持保留态度

编辑：在 boost 库中有一个新的 boost 候选者，称为 Boost.Context，它的一部分一个更大的库，称为 Boost.Fiber。它还没有网页，所以我不会在这里链接它。貌似支持比较好

回复收藏 0 原文

浪荡不羁 2024-09-18 11:55:41

好吧，这就是我实际问的问题（参见[1]）——静态调用的技巧
子类中的函数。 tl;dr 显然是一个强大的力量，所以这是你的
这次是可读的标准协程斐波那契生成器。
但有一个小小的区别 - 我们实际上并不需要协程来生成
这些数字，但确实很难（如果可能的话）更快地实施
我的第一个没有协程的程序。

#include <stdio.h>
#include <stddef.h>
#include <setjmp.h>

// without noinline some compilers tend to allocate the array before setjmp()
#ifdef __GNUC__
 #define NOINLINE __attribute__((noinline))
#else
 #define NOINLINE __declspec(noinline)
#endif

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile unsigned state;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  template <typename T> NOINLINE void call_do_process() {
    char stktmp[STKPAD]; state=ptrdiff_t(stktmp); ((T*)this)->do_process();
  }
  template <typename T> unsigned coro_process( T* ) {
    if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process<T>();
    return state;
  }
};

struct fibonacci : coroutine {

  void do_process( void ) {
    unsigned a=0,b=1;
    while(1) {
      yield( b );
      b = b + a;
      a = b - a;
    }
  }

  unsigned get( void ) {
    return coro_process(this); 
  }

} F;

int main( int argc, char** argv ) {

  for( int i=0; i<20; i++ ) {
    printf( "%i ", F.get() );
  } printf( "\n" );

  return 0;
}

由于 Jerry Coffin 的替代版本仍然未能产生合理的结果，
这里有一些更简单的流基准测试。很遗憾，正如我所期望的那样
使用迭代器会更慢。

事实上，我已经用算术编码器测试了各种方法 - 简单的 getc/putc，
虚方法、普通函数指针、类似迭代器的类，很明显
有很大的不同。目前，协程被证明是最好的方法 -
没有封装到字节 I/O 调用中的复杂逻辑（与迭代器不同），并且
处理不必关心 I/O 细节。当然，还有更进一步
优化，但我实际上只是试图展示协程的好处
接近这里...

#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_DISABLE_PERFCRIT_LOCKS

#include <stdio.h>
#include <time.h>
#include <fstream>

int main( int argc, char** argv ) {
  if( argc<3 ) return 1;

  { 
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      int c = getc(f);
      if( c<0 ) break;
      putc(c,g);
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "            File copy via stdio getc/putc - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      static char buf[1<<16];
      int l = fread( buf, 1,sizeof(buf), f ); if( l<=0 ) break;
      fwrite( buf, 1,l, g ); if( l<sizeof(buf) ) break;
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "     File copy via stdio 64k fread/fwrite - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    std::ifstream f(argv[1],std::ios::in|std::ios::binary); if( !f.is_open() ) return 2;
    std::ofstream g(argv[2],std::ios::out|std::ios::binary); if( !g.is_open() ) return 3;
    while(1) {
      int c = f.get();
      if( c<0 ) break;
      g.put(c);
    }
    f.close();
    g.close();
    clock_t stop = clock();
    printf( "File copy via ifstream::get/ofstream::put - %.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

}

----- 100,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   0.546s
     File copy via stdio 64k fread/fwrite -   0.188s
File copy via ifstream::get/ofstream::put -  10.578s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   0.500s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  14.656s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   0.609s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  19.063s

----- 1,000,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   7.468s
     File copy via stdio 64k fread/fwrite -   1.828s
File copy via ifstream::get/ofstream::put - 109.891s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   6.718s
     File copy via stdio 64k fread/fwrite -   1.672s
File copy via ifstream::get/ofstream::put - 145.500s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   6.453s
     File copy via stdio 64k fread/fwrite -   1.609s
File copy via ifstream::get/ofstream::put - 191.031s

Ok, here's what I actually asked about (see [1]) - the trick to statically call
a function from child class. tl;dr is apparently a mighty power, so here's your
readable standard coroutine fibonacci generator this time.
There's a small difference though - we don't really need coroutines to generate
these numbers, but its really hard (if possible) to make a faster implementation
of my first program without coroutines.

#include <stdio.h>
#include <stddef.h>
#include <setjmp.h>

// without noinline some compilers tend to allocate the array before setjmp()
#ifdef __GNUC__
 #define NOINLINE __attribute__((noinline))
#else
 #define NOINLINE __declspec(noinline)
#endif

enum{ STKPAD=1<<16 };
struct coroutine {
  volatile unsigned state;
  jmp_buf PointA, PointB;
  void yield( int value ) { if( setjmp(PointB)==0 ) { state=value; longjmp(PointA,value); } }
  template <typename T> NOINLINE void call_do_process() {
    char stktmp[STKPAD]; state=ptrdiff_t(stktmp); ((T*)this)->do_process();
  }
  template <typename T> unsigned coro_process( T* ) {
    if( setjmp(PointA)==0 ) if( state ) longjmp(PointB,3); else call_do_process<T>();
    return state;
  }
};

struct fibonacci : coroutine {

  void do_process( void ) {
    unsigned a=0,b=1;
    while(1) {
      yield( b );
      b = b + a;
      a = b - a;
    }
  }

  unsigned get( void ) {
    return coro_process(this); 
  }

} F;

int main( int argc, char** argv ) {

  for( int i=0; i<20; i++ ) {
    printf( "%i ", F.get() );
  } printf( "\n" );

  return 0;
}

And since Jerry Coffin's alternative version still fails to produce sensible results,
here're some simpler stream benchmarks. Its a pity, as I'd expect it to be even
slower with iterators.

In fact I've tested all kinds of approaches with arithmetic coders - plain getc/putc,
virtual methods, plain functions pointers, iterator-like classes, and its clear that
there's a large difference. For now, coroutines proved to be the best way for this -
there's no complex logic encapsulated into byte i/o calls (unlike iterators), and
the processing doesn't have to care about i/o details. Sure, there're even further
optimizations, but I really only tried to demonstrate the benefits of coroutine
approach here...

#define _CRT_SECURE_NO_DEPRECATE
#define _CRT_DISABLE_PERFCRIT_LOCKS

#include <stdio.h>
#include <time.h>
#include <fstream>

int main( int argc, char** argv ) {
  if( argc<3 ) return 1;

  { 
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      int c = getc(f);
      if( c<0 ) break;
      putc(c,g);
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "            File copy via stdio getc/putc - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    FILE* f = fopen( argv[1], "rb" ); if( f==0 ) return 2;
    FILE* g = fopen( argv[2], "wb" ); if( g==0 ) return 3;
    while(1) {
      static char buf[1<<16];
      int l = fread( buf, 1,sizeof(buf), f ); if( l<=0 ) break;
      fwrite( buf, 1,l, g ); if( l<sizeof(buf) ) break;
    }
    fclose(f);
    fclose(g);
    clock_t stop = clock();
    printf( "     File copy via stdio 64k fread/fwrite - %7.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

  {
    clock_t start = clock();
    std::ifstream f(argv[1],std::ios::in|std::ios::binary); if( !f.is_open() ) return 2;
    std::ofstream g(argv[2],std::ios::out|std::ios::binary); if( !g.is_open() ) return 3;
    while(1) {
      int c = f.get();
      if( c<0 ) break;
      g.put(c);
    }
    f.close();
    g.close();
    clock_t stop = clock();
    printf( "File copy via ifstream::get/ofstream::put - %.3fs\n", float(stop-start)/CLOCKS_PER_SEC );
  }

}

----- 100,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   0.546s
     File copy via stdio 64k fread/fwrite -   0.188s
File copy via ifstream::get/ofstream::put -  10.578s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   0.500s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  14.656s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   0.609s
     File copy via stdio 64k fread/fwrite -   0.156s
File copy via ifstream::get/ofstream::put -  19.063s

----- 1,000,000,000 byte file ----- 
[ GCC 4.5 ]
            File copy via stdio getc/putc -   7.468s
     File copy via stdio 64k fread/fwrite -   1.828s
File copy via ifstream::get/ofstream::put - 109.891s

[ IntelC 11.1 / VS 2005 ]
            File copy via stdio getc/putc -   6.718s
     File copy via stdio 64k fread/fwrite -   1.672s
File copy via ifstream::get/ofstream::put - 145.500s

[ MSC 14.0 / VS 2005 ]
            File copy via stdio getc/putc -   6.453s
     File copy via stdio 64k fread/fwrite -   1.609s
File copy via ifstream::get/ofstream::put - 191.031s

回复收藏 0 原文

~没有更多了~