Improving mmap/munmap performance on Mac OS X


Running the following C code (a bunch of mmaps and munmaps on a 2GB file) on a MacOS X machine seems to be dramatically slower than on a Linux one.

#define BUFSZ 2000000000
static u_char buf[BUFSZ];
....

// Time 10000 mmaps and munmaps from random offsets for various
// sizes of mapped chunk. femmap/femunmap appear to be wrappers
// around mmap/munmap that die() with the given message on failure.
for (msize = 4096; msize <= 1048576; msize *= 16) {
  fd = open("io_benchmark.dat", O_RDONLY);
  if (fd < 0) die("can't open io_benchmark.dat for reading");
  for (i = 0; i < 10000; i++) {
    // Make sure the block to be mapped doesn't start in the
    // last meg.
    offset = (size_t) random() % (BUFSZ - 1048576);
    mblock = femmap(fd, (off_t) offset, (size_t) msize, PROT_READ,
                    "test block");
    total = 0;
    for (j = 0; j < msize; j++) {
      total += mblock[j];
    }
    femunmap(mblock, (size_t) msize, "test block");
  }
  printf("Elapsed time to mmap and munmap 10000 blocks of %d kB: %.4f sec\n",
         msize/1024, (time = time_since_last_call()));

  rslt = close(fd);
  if (rslt < 0) die("can't close io_benchmark.dat after reading");
}

Specifically, comparing two machines:

        Linux box                                Mac OS X box
CPU     Xeon E3113 dual core @ 3.00GHz           Core 2 Duo @ 2.4GHz dual core
RAM     8GB                                      4GB
Kernel  2.6.18-92.el5PAE SMP i686                MacOS 10.6.4 Snow Leopard
Disk    WD 250GB SATA 16MB cache 7200 RPM EXT3   Hitachi 250GB SATA 5400 RPM, journaled HFS+

gives the following results (times in seconds):

                             Linux    MacOS X
Time for 10000 4kB mmaps     0.0165   682.87
Time for 10000 64kB mmaps    0.0170   657.79
Time for 10000 1MB mmaps     0.0217   633.38

Even accounting for the smaller amount of memory, this seems unusual given that the file is only half the size of physical memory. Can anyone point to a change to the code or a configuration change that might improve the performance?

We've tried using reads instead of mmaps, and it does make a substantial difference, but doing that would require a substantial change to the existing code base (and mmap is a lot faster than read on Linux).
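For reference, here is a minimal sketch of the read-based variant, assuming pread() at the same random offsets into a scratch buffer (the readbuf name and the now() timing helper are illustrative, not part of the original harness):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

#define BUFSZ 2000000000

static unsigned char readbuf[1048576];   // big enough for the largest chunk

static double now(void) {                // wall-clock seconds
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1e6;
}

int main(void) {
    int fd = open("io_benchmark.dat", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    for (size_t msize = 4096; msize <= 1048576; msize *= 16) {
        double start = now();
        long total = 0;
        for (int i = 0; i < 10000; i++) {
            size_t offset = (size_t) random() % (BUFSZ - 1048576);
            ssize_t n = pread(fd, readbuf, msize, (off_t) offset);
            if (n < 0) { perror("pread"); return 1; }
            for (ssize_t j = 0; j < n; j++)
                total += readbuf[j];
        }
        // Printing total keeps the compiler from optimizing the sum away.
        printf("10000 preads of %zu kB: %.4f sec (checksum %ld)\n",
               msize / 1024, now() - start, total);
    }
    close(fd);
    return 0;
}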


3 Answers

此刻的回忆 2024-10-08 04:06:53


I think you just don't measure the right thing. I checked the inner part of your test, and my version of gcc is able to optimize the loop away completely.

This changes, e.g., when I declare the mblock pointer to be a pointer to volatile data. The compiler is then obliged to perform all side effects of the statements in the loop, in particular to actually load each byte from memory.

So the only conclusions you may draw from your test are:

  • your compiler on MacOS X is not very smart
  • always check the assembly that a benchmark produces

So if you could redo your test for real, I would be interested in seeing the real difference between the two systems with respect to that feature.
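For what it's worth, a sketch of the suggested change against the loop in the question (only the declaration of mblock changes; femmap/femunmap as in the original):

// Declaring the target of mblock volatile forces the compiler to
// perform every load in the summing loop instead of optimizing it away.
volatile u_char *mblock;
....
mblock = femmap(fd, (off_t) offset, (size_t) msize, PROT_READ,
                "test block");
total = 0;
for (j = 0; j < msize; j++) {
  total += mblock[j];          // now a real memory access on every iteration
}
femunmap((u_char *) mblock, (size_t) msize, "test block");

Alternatively, using total after the loop (e.g., printing it) also prevents the compiler from discarding the summation.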

深海蓝天 2024-10-08 04:06:53


Looks like this is "by design". According to this https://developer.apple.com/library/content/documentation/FileManagement/Conceptual/FileSystemAdvancedPT/MappingFilesIntoMemory/MappingFilesIntoMemory.html#//apple_ref/doc/uid/TP40010765-CH2-SW1:

You should not use file mapping in the following situations:

  • You want to read a file sequentially from start to finish only once.

  • The file is several hundred megabytes or more in size. (Mapping large files fills virtual memory space quickly. In addition, your program may not have the available space if it has been running for a while or its memory space is fragmented.)

For large sequential read operations, you are better off disabling disk caching and reading the file into a small memory buffer. See “Cache Files Selectively” for more information.
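The "Cache Files Selectively" advice corresponds, as far as I know, to the F_NOCACHE fcntl(2) flag on Mac OS X. A minimal sketch of a sequential read with caching disabled for the descriptor (file name and buffer size are illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void) {
    int fd = open("io_benchmark.dat", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    // Mac OS X specific: turn off the unified buffer cache for this
    // descriptor so a one-pass sequential read doesn't evict other data.
    if (fcntl(fd, F_NOCACHE, 1) < 0)
        perror("fcntl(F_NOCACHE)");   // non-fatal: falls back to cached I/O

    char chunk[65536];   // small reusable buffer, per Apple's advice
    ssize_t n;
    long total = 0;
    while ((n = read(fd, chunk, sizeof chunk)) > 0)
        for (ssize_t j = 0; j < n; j++)
            total += (unsigned char) chunk[j];

    printf("checksum %ld\n", total);
    close(fd);
    return 0;
}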


Here is a code snippet that demonstrates the problem (C++, POSIX):

#include <cstdio>
#include <cstring>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

off_t       file_target = off_t(3) << 29; //  1.5GB
const char* path = "B9361194.data";

//  Touch the output file
{
    FILE*       fp = fopen( path, "a");
    fclose(fp);
}

//  Open the output file
FILE*       fp = fopen( path, "rb+");
int         fd = fileno(fp);
off_t       file_physical = 0;
off_t       file_logical = 0;
ftruncate( fd, file_physical );

//  Declare the mapping
off_t       map_start = 0;
off_t       map_count = 0;
char*       map_address = 0;

//  Set up the input buffer.
//  We are just going to write this out until we have written total bytes.
const size_t requested = 1024;
char         input[requested];
for ( size_t i = 0; i < requested; ++i ) input[i] = 1;

//  Write the buffer, resizing and mapping as we go.
while ( file_logical < file_target ) {
    //  Figure out how much to write.
    size_t limit = requested;
    if ( ( file_target - file_logical ) < (off_t)limit ) 
        limit = size_t( file_target - file_logical );

    //  If we can't fit the buffer inside the allocation
    //  unmap and grow everything
    if ( file_logical + (off_t)limit > file_physical ) {
        //  Unmap
        if ( map_address ) munmap( map_address, map_count );

        //  Grow the file by 64K
        off_t   new_physical = off_t(1) << 16;  //  64K allocation
        if ( new_physical < (off_t)limit ) new_physical = limit;
        file_physical += new_physical;
        ftruncate( fd, file_physical );

        //  Map the end
        map_count = off_t(1) << 23;    //  8MB
        if ( map_count > file_physical ) map_count = file_physical;
        map_start = file_physical - map_count;
        void* address = mmap( 0, map_count, ( PROT_WRITE | PROT_READ ), MAP_SHARED, fd, map_start );
        // int err = errno;
        // if ( address == MAP_FAILED ) CPPUNIT_ASSERT_EQUAL_MESSAGE( strerror(err), 0, errno );
        map_address = reinterpret_cast<char*>( address );
    }

    //  Copy the buffer in
    size_t  offset = size_t(file_logical - map_start);
    memcpy( map_address + offset, input, limit );
    file_logical += limit;
}

//  Clean up
if ( map_address ) munmap( map_address, map_count );
ftruncate( fd, file_logical );
fclose( fp );

No idea if and when they will fix it.

故人爱我别走 2024-10-08 04:06:53


My conclusion to the OS X mmap problem is to map the entire file and keep it mapped. If you need to expand the file, map more bytes than you need, so that you only have to remap occasionally.

You may need to use 64-bit OS X to make this work.
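A minimal sketch of that approach, assuming read-only access and taking the file size from fstat() (one mmap() up front, plain pointer arithmetic afterwards):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void) {
    int fd = open("io_benchmark.dat", O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) < 0) { perror("fstat"); return 1; }

    // Map the whole file once; a multi-GB file needs the 64-bit
    // address space mentioned above.
    unsigned char *base = mmap(NULL, st.st_size, PROT_READ,
                               MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }
    close(fd);   // the mapping remains valid after close

    // Each "block" is now just an offset into the one mapping; no
    // per-access mmap/munmap calls.
    long total = 0;
    for (off_t off = 0; off < st.st_size; off += 4096)
        total += base[off];
    printf("touched %lld pages, checksum %ld\n",
           (long long)(st.st_size / 4096), total);

    munmap(base, st.st_size);
    return 0;
}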
