如何最好地将 std::vector<std::string> 容器写入 HDF5 数据集?

发布于 2024-07-14 06:18:51 字数 1777 浏览 8 评论 0 原文

给定一个字符串向量,将它们写入 HDF5 数据集的最佳方法是什么? 目前我正在做类似以下的事情:

  // Width in bytes of every stored string field, including the terminating NUL.
  const unsigned int MaxStrLength = 512;

  // One fixed-width record of the compound dataset written by writeVector.
  struct TempContainer {
    char string[MaxStrLength];
  };

  // Writes the strings of `v` into a new dataset called "files" inside `group`.
  // Every string occupies a fixed-width field of MaxStrLength bytes; strings
  // longer than MaxStrLength - 1 characters are truncated.
  void writeVector (hid_t group, std::vector<std::string> const & v)
  {
    //
    // Firstly copy the contents of the vector into a temporary container
    std::vector<TempContainer> tc;
    tc.reserve (v.size ());
    for (std::vector<std::string>::const_iterator i = v.begin ()
                                              , end = v.end ()
      ; i != end
      ; ++i)
    {
      // Zero-fill the record and copy at most MaxStrLength - 1 characters so
      // that the field is always NUL-terminated (strncpy does not terminate
      // the destination when the source fills it completely).
      TempContainer t = {};
      strncpy (t.string, i->c_str (), MaxStrLength - 1);
      tc.push_back (t);
    }


    //
    // Write the temporary container to a dataset
    hsize_t     dims[] = { tc.size () } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                               , dims
                               , NULL);

    hid_t strtype = H5Tcopy (H5T_C_S1);
    H5Tset_size (strtype, MaxStrLength);

    hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempContainer));
    H5Tinsert (datatype
      , "string"
      , HOFFSET(TempContainer, string)
      , strtype);

    hid_t dataset = H5Dcreate1 (group
                          , "files"
                          , datatype
                          , dataspace
                          , H5P_DEFAULT);

    // &tc[0] is undefined for an empty vector, so only write when there is
    // at least one record.
    if (!tc.empty ())
    {
      H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &tc[0] );
    }

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Tclose (strtype);
    H5Tclose (datatype);
  }

至少,我真的很想改变上面的内容,以便:

  1. 它使用可变长度字符串
  2. 我不需要有一个临时容器

我对数据的存储方式没有限制,例如,如果有更好的方法来做到这一点,它不必是 COMPOUND 数据类型。

编辑:为了缩小问题范围,我相对熟悉在 C++ 端处理数据,我最需要帮助的是 HDF5 这一端。

感谢您的帮助。

Given a vector of strings, what is the best way to write them out to a HDF5 dataset? At the moment I'm doing something like the following:

  // Width in bytes of every stored string field, including the terminating NUL.
  const unsigned int MaxStrLength = 512;

  // One fixed-width record of the compound dataset written by writeVector.
  struct TempContainer {
    char string[MaxStrLength];
  };

  // Writes the strings of `v` into a new dataset called "files" inside `group`.
  // Every string occupies a fixed-width field of MaxStrLength bytes; strings
  // longer than MaxStrLength - 1 characters are truncated.
  void writeVector (hid_t group, std::vector<std::string> const & v)
  {
    //
    // Firstly copy the contents of the vector into a temporary container
    std::vector<TempContainer> tc;
    tc.reserve (v.size ());
    for (std::vector<std::string>::const_iterator i = v.begin ()
                                              , end = v.end ()
      ; i != end
      ; ++i)
    {
      // Zero-fill the record and copy at most MaxStrLength - 1 characters so
      // that the field is always NUL-terminated (strncpy does not terminate
      // the destination when the source fills it completely).
      TempContainer t = {};
      strncpy (t.string, i->c_str (), MaxStrLength - 1);
      tc.push_back (t);
    }


    //
    // Write the temporary container to a dataset
    hsize_t     dims[] = { tc.size () } ;
    hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                               , dims
                               , NULL);

    hid_t strtype = H5Tcopy (H5T_C_S1);
    H5Tset_size (strtype, MaxStrLength);

    hid_t datatype = H5Tcreate (H5T_COMPOUND, sizeof (TempContainer));
    H5Tinsert (datatype
      , "string"
      , HOFFSET(TempContainer, string)
      , strtype);

    hid_t dataset = H5Dcreate1 (group
                          , "files"
                          , datatype
                          , dataspace
                          , H5P_DEFAULT);

    // &tc[0] is undefined for an empty vector, so only write when there is
    // at least one record.
    if (!tc.empty ())
    {
      H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &tc[0] );
    }

    H5Dclose (dataset);
    H5Sclose (dataspace);
    H5Tclose (strtype);
    H5Tclose (datatype);
  }

At a minimum, I would really like to change the above so that:

  1. It uses variable length strings
  2. I don't need to have a temporary container

I have no restrictions over how I store the data so for example, it doesn't have to be a COMPOUND datatype if there is a better way to do this.

EDIT: Just to narrow the problem down, I'm relatively familiar with playing with the data on the C++ side, it's the HDF5 side where I need most of the help.

Thanks for your help.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(9)

情绪失控 2024-07-21 06:18:51

[非常感谢 dirkgently 帮助回答这个问题。]

要在 HDF5 中写入可变长度字符串,请使用以下代码:

// The string datatype: a copy of H5T_C_S1 whose size is set to H5T_VARIABLE.
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);

// H5Dwrite is handed the address of the character pointer,
// not the character pointer itself.
const char * text = v.c_str ();
H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &text);

一种编写容器的解决方案是单独写入每个元素。 这可以使用 hyperslab 来实现。

例如:

class WriteString
{
public:
  WriteString (hid_t dataset, hid_t datatype
      , hid_t dataspace, hid_t memspace)
    : m_dataset (dataset), m_datatype (datatype)
    , m_dataspace (dataspace), m_memspace (memspace)
    , m_pos () {}

private:
  hid_t m_dataset;
  hid_t m_datatype;
  hid_t m_dataspace;
  hid_t m_memspace;
  int m_pos;

//...

public:
  void operator ()(std::vector<std::string>::value_type const & v)
  {
    // Select the file position, 1 record at position 'pos'
    hsize_t count[] = { 1 } ;
    hsize_t offset[] = { m_pos++ } ;
    H5Sselect_hyperslab( m_dataspace
      , H5S_SELECT_SET
      , offset
      , NULL
      , count
      , NULL );

    const char * s = v.c_str ();
    H5Dwrite (m_dataset
      , m_datatype
      , m_memspace
      , m_dataspace
      , H5P_DEFAULT
      , &s );
    }    
};

// ...

// Writes every string of `v` into a new one-dimensional dataset of
// variable-length strings named "files" inside `group`. The strings are
// written one element at a time by the WriteString function object.
void writeVector (hid_t group, std::vector<std::string> const & v)
{
  // File dataspace: one element per string in `v`.
  hsize_t     dims[] = { v.size ()  } ;
  hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  // Memory dataspace: a single element, reused for every write.
  dims[0] = 1;
  hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  hid_t datatype = H5Tcopy (H5T_C_S1);
  H5Tset_size (datatype, H5T_VARIABLE);

  hid_t dataset = H5Dcreate1 (group, "files", datatype
                             , dataspace, H5P_DEFAULT);

  //
  // Select the "memory" to be written out - just 1 record.
  hsize_t offset[] = { 0 } ;
  hsize_t count[] = { 1 } ;
  H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
                     , NULL, count, NULL );

  std::for_each (v.begin ()
      , v.end ()
      , WriteString (dataset, datatype, dataspace, memspace));

  H5Dclose (dataset);
  H5Sclose (dataspace);
  H5Sclose (memspace);
  H5Tclose (datatype);
}

[Many thanks to dirkgently for his help in answering this.]

To write a variable length string in HDF5 use the following:

// The string datatype: a copy of H5T_C_S1 whose size is set to H5T_VARIABLE.
hid_t datatype = H5Tcopy (H5T_C_S1);
H5Tset_size (datatype, H5T_VARIABLE);

// H5Dwrite is handed the address of the character pointer,
// not the character pointer itself.
const char * text = v.c_str ();
H5Dwrite (dataset, datatype, H5S_ALL, H5S_ALL, H5P_DEFAULT, &text);

One solution for writing a container is to write each element individually. This can be achieved using hyperslabs.

For example:

class WriteString
{
public:
  WriteString (hid_t dataset, hid_t datatype
      , hid_t dataspace, hid_t memspace)
    : m_dataset (dataset), m_datatype (datatype)
    , m_dataspace (dataspace), m_memspace (memspace)
    , m_pos () {}

private:
  hid_t m_dataset;
  hid_t m_datatype;
  hid_t m_dataspace;
  hid_t m_memspace;
  int m_pos;

//...

public:
  void operator ()(std::vector<std::string>::value_type const & v)
  {
    // Select the file position, 1 record at position 'pos'
    hsize_t count[] = { 1 } ;
    hsize_t offset[] = { m_pos++ } ;
    H5Sselect_hyperslab( m_dataspace
      , H5S_SELECT_SET
      , offset
      , NULL
      , count
      , NULL );

    const char * s = v.c_str ();
    H5Dwrite (m_dataset
      , m_datatype
      , m_memspace
      , m_dataspace
      , H5P_DEFAULT
      , &s );
    }    
};

// ...

// Writes every string of `v` into a new one-dimensional dataset of
// variable-length strings named "files" inside `group`. The strings are
// written one element at a time by the WriteString function object.
void writeVector (hid_t group, std::vector<std::string> const & v)
{
  // File dataspace: one element per string in `v`.
  hsize_t     dims[] = { v.size ()  } ;
  hid_t dataspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  // Memory dataspace: a single element, reused for every write.
  dims[0] = 1;
  hid_t memspace = H5Screate_simple(sizeof(dims)/sizeof(*dims)
                                    , dims, NULL);

  hid_t datatype = H5Tcopy (H5T_C_S1);
  H5Tset_size (datatype, H5T_VARIABLE);

  hid_t dataset = H5Dcreate1 (group, "files", datatype
                             , dataspace, H5P_DEFAULT);

  //
  // Select the "memory" to be written out - just 1 record.
  hsize_t offset[] = { 0 } ;
  hsize_t count[] = { 1 } ;
  H5Sselect_hyperslab( memspace, H5S_SELECT_SET, offset
                     , NULL, count, NULL );

  std::for_each (v.begin ()
      , v.end ()
      , WriteString (dataset, datatype, dataspace, memspace));

  H5Dclose (dataset);
  H5Sclose (dataspace);
  H5Sclose (memspace);
  H5Tclose (datatype);
}
戏剧牡丹亭 2024-07-21 06:18:51

下面是一些使用 HDF5 c++ API 编写可变长度字符串向量的工作代码。

我在其他帖子中纳入了一些建议:

  1. 使用 H5T_C_S1 和 H5T_VARIABLE
  2. 使用 string::c_str() 获取指向字符串的指针
  3. 将指针放入 char* 的 vector 中,并传递给 HDF5 API

没有必要创建字符串的昂贵副本(例如使用 strdup())。 c_str() 返回指向基础字符串的 null 终止数据的指针。 这正是该函数的目的。 当然,嵌入空值的字符串将无法使用此...

std::vector 保证具有连续的底层存储,因此使用 vector 和 vector::data() 与使用原始数组相同,但当然比笨重、老式的 C 处理方式更简洁、更安全。

#include "H5Cpp.h"
// Writes `strings` to a new one-dimensional dataset of variable-length
// strings named `data_set_name` in `file`.
// Throws std::runtime_error carrying the HDF5 function name and detail
// message when the HDF5 C++ API raises an H5::Exception.
void write_hdf5(H5::H5File file, const std::string& data_set_name,
                const std::vector<std::string>& strings )
{
    H5::Exception::dontPrint();

    try
    {
        // HDF5 only understands vector of char* :-(
        // The pointers refer to the buffers owned by `strings`; nothing is copied.
        std::vector<const char*> arr_c_str;
        arr_c_str.reserve(strings.size());
        for (const std::string& s : strings)
            arr_c_str.push_back(s.c_str());

        //
        //  one dimension
        //
        hsize_t     str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace   dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE);
        H5::DataSet str_dataset = file.createDataSet(data_set_name, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
    }
    catch (const H5::Exception& err)
    {
        throw std::runtime_error(std::string("HDF5 Error in " )
                                    + err.getFuncName()
                                    + ": "
                                    + err.getDetailMsg());
    }
}

Here is some working code for writing a vector of variable length strings using the HDF5 c++ API.

I incorporate some of the suggestions in the other posts:

  1. use H5T_C_S1 and H5T_VARIABLE
  2. use string::c_str() to obtain pointers to the strings
  3. place the pointers into a vector of char* and pass to the HDF5 API

It is not necessary to create expensive copies of the string (e.g. with strdup()). c_str() returns a pointer to the null terminated data of the underlying string. This is precisely what the function is intended for. Of course, strings with embedded nulls will not work with this...

std::vector is guaranteed to have contiguous underlying storage, so using vector and vector::data() is the same as using raw arrays but is of course much neater and safer than the clunky, old-fashioned c way of doing things.

#include "H5Cpp.h"
// Writes `strings` to a new one-dimensional dataset of variable-length
// strings named `data_set_name` in `file`.
// Throws std::runtime_error carrying the HDF5 function name and detail
// message when the HDF5 C++ API raises an H5::Exception.
void write_hdf5(H5::H5File file, const std::string& data_set_name,
                const std::vector<std::string>& strings )
{
    H5::Exception::dontPrint();

    try
    {
        // HDF5 only understands vector of char* :-(
        // The pointers refer to the buffers owned by `strings`; nothing is copied.
        std::vector<const char*> arr_c_str;
        arr_c_str.reserve(strings.size());
        for (const std::string& s : strings)
            arr_c_str.push_back(s.c_str());

        //
        //  one dimension
        //
        hsize_t     str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace   dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE);
        H5::DataSet str_dataset = file.createDataSet(data_set_name, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
    }
    catch (const H5::Exception& err)
    {
        throw std::runtime_error(std::string("HDF5 Error in " )
                                    + err.getFuncName()
                                    + ": "
                                    + err.getDetailMsg());
    }
}
不奢求什么 2024-07-21 06:18:51

如果您正在寻找更清晰的代码:我建议您创建一个函子,它将接受一个字符串并将其保存到 HDF5 容器(以所需的模式)。 Richard,我使用了错误的算法,请重新检查!

// NOTE(review): `write_hdf5` is presumably an instance of the `hdf5`
// functor sketched below - confirm.
std::for_each(v.begin(), v.end(), write_hdf5);

// Sketch of a functor that is called once per string and appends it to
// an HDF5 dataset. The `...` parts are placeholders to be filled in.
struct hdf5 : public std::unary_function<std::string, void> {
    hdf5() : _dataset(...) {} // initialize the HDF5 db
    ~hdf5() { /* ... */ } // close the HDF5 db (a destructor cannot have an initializer list)
    void operator()(const std::string& s) {
            // append
            // use s.c_str() ?
    }
};

这有助于入门吗?

If you are looking at cleaner code: I suggest you create a functor that'll take a string and save it to the HDF5 Container (in a desired mode). Richard, I used the wrong algorithm, please re-check!

// NOTE(review): `write_hdf5` is presumably an instance of the `hdf5`
// functor sketched below - confirm.
std::for_each(v.begin(), v.end(), write_hdf5);

// Sketch of a functor that is called once per string and appends it to
// an HDF5 dataset. The `...` parts are placeholders to be filled in.
struct hdf5 : public std::unary_function<std::string, void> {
    hdf5() : _dataset(...) {} // initialize the HDF5 db
    ~hdf5() { /* ... */ } // close the HDF5 db (a destructor cannot have an initializer list)
    void operator()(const std::string& s) {
            // append
            // use s.c_str() ?
    }
};

Does that help get started?

爱,才寂寞 2024-07-21 06:18:51

我遇到了类似的问题,但需要注意的是,我想要将字符串向量存储为属性。 属性的棘手之处在于我们不能使用像 hyperslab 这样的花哨的数据空间功能(至少对于 C++ API 来说)。

但无论哪种情况,将字符串向量输入到数据集中的单个条目中可能会很有用(例如,如果您总是希望一起阅读它们)。 在这种情况下,所有的魔力都来自于类型,而不是数据空间本身。

基本上有 4 个步骤:

  1. 创建一个指向各字符串的 vector<const char*>。
  2. 创建一个指向该向量并包含其长度的 hvl_t 结构。
  3. 创建数据类型。 这是一个包装了(可变长度)H5::StrType 的 H5::VarLenType。
  4. 将 hvl_t 类型写入数据集。

此方法真正好的部分是您将整个条目填充到 HDF5 认为的标量值中。 这意味着将其设为属性(而不是数据集)很简单。

无论您选择此解决方案还是选择每个字符串都在其自己的数据集条目中的解决方案,也可能与所需的性能有关:如果您正在寻找对特定字符串的随机访问,最好将字符串写在数据集中,以便它们可以被索引。 如果您总是要一起阅读它们,那么此解决方案可能也同样有效。

以下是如何使用 C++ API 和简单标量数据集执行此操作的简短示例:

#include <vector>
#include <string>
#include "H5Cpp.h"

int main(int argc, char* argv[]) {
  // Part 0: make up some data
  std::vector<std::string> strings;
  for (int iii = 0; iii < 10; iii++) {
    strings.push_back("this is " + std::to_string(iii));
  }

  // Part 1: grab pointers to the chars
  std::vector<const char*> chars;
  for (const auto& str: strings) {
    chars.push_back(str.data());
  }

  // Part 2: create the variable length type
  hvl_t hdf_buffer;
  hdf_buffer.p = chars.data();
  hdf_buffer.len = chars.size();

  // Part 3: create the type
  auto s_type = H5::StrType(H5::PredType::C_S1, H5T_VARIABLE);
  s_type.setCset(H5T_CSET_UTF8); // just for fun, you don't need this
  auto svec_type = H5::VarLenType(&s_type);

  // Part 4: write the output to a scalar dataset
  H5::H5File out_file("vtest.h5", H5F_ACC_EXCL);
  H5::DataSet dataset(
    out_file.createDataSet("the_ds", svec_type, H5S_SCALAR));
  dataset.write(&hdf_buffer, svec_type);

  return 0;
}

I had a similar issue, with the caveat that I wanted a vector of strings stored as an attribute. The tricky thing with attributes is that we can't use fancy dataspace features like hyperslabs (at least with the C++ API).

But in either case, it may be useful to enter a vector of strings into a single entry in a dataset (if, for example, you always expect to read them together). In this case all the magic comes with the type, not with the dataspace itself.

There are basically 4 steps:

  1. Make a vector<const char*> which points to the strings.
  2. Create a hvl_t structure that points to the vector and contains its length.
  3. Create the datatype. This is a H5::VarLenType wrapping a (variable length) H5::StrType.
  4. Write the hvl_t type to a dataset.

The really nice part of this method is that you're stuffing the whole entry into what HDF5 considers a scalar value. This means that making it an attribute (rather than a dataset) is trivial.

Whether you choose this solution or the one with each string in its own dataset entry is probably also a matter of the desired performance: if you're looking for random access to specific strings, it's probably better to write the strings out in a dataset so they can be indexed. If you're always going to read them all out together this solution may work just as well.

Here's a short example of how to do this, using the C++ API and a simple scalar dataset:

#include <vector>
#include <string>
#include "H5Cpp.h"

int main(int argc, char* argv[]) {
  // Part 0: make up some data
  std::vector<std::string> strings;
  for (int iii = 0; iii < 10; iii++) {
    strings.push_back("this is " + std::to_string(iii));
  }

  // Part 1: grab pointers to the chars
  std::vector<const char*> chars;
  for (const auto& str: strings) {
    chars.push_back(str.data());
  }

  // Part 2: create the variable length type
  hvl_t hdf_buffer;
  hdf_buffer.p = chars.data();
  hdf_buffer.len = chars.size();

  // Part 3: create the type
  auto s_type = H5::StrType(H5::PredType::C_S1, H5T_VARIABLE);
  s_type.setCset(H5T_CSET_UTF8); // just for fun, you don't need this
  auto svec_type = H5::VarLenType(&s_type);

  // Part 4: write the output to a scalar dataset
  H5::H5File out_file("vtest.h5", H5F_ACC_EXCL);
  H5::DataSet dataset(
    out_file.createDataSet("the_ds", svec_type, H5S_SCALAR));
  dataset.write(&hdf_buffer, svec_type);

  return 0;
}
橙味迷妹 2024-07-21 06:18:51

我来晚了,但我根据有关段错误的评论修改了 Leo Goodstadt 的答案。 我用的是 Linux,但没有遇到这样的问题。 我编写了 2 个函数,一个将 std::string 向量写入已打开的 H5File 中给定名称的数据集,另一个将生成的数据集读回 std::string 向量。 请注意,类型之间可能存在几次不必要的复制,这可以进一步优化。 下面是用于写入和读取的可用代码:

void write_varnames( const std::string& dsetname, const std::vector<std::string>& strings, H5::H5File& f)
  {
    H5::Exception::dontPrint();

    try
      {
        // HDF5 only understands vector of char* :-(
        std::vector<const char*> arr_c_str;
        for (size_t ii = 0; ii < strings.size(); ++ii)
      {
        arr_c_str.push_back(strings[ii].c_str());
      }

        //
        //  one dimension
        // 
        hsize_t     str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace   dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE); 
        H5::DataSet str_dataset = f.createDataSet(dsetname, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
      }
    catch (H5::Exception& err)
      {
        throw std::runtime_error(std::string("HDF5 Error in ")  
                 + err.getFuncName()
                 + ": "
                 + err.getDetailMsg());


      }
  }

以及读取:

std::vector<std::string> read_string_dset( const std::string& dsname, H5::H5File& f )
  {
    H5::DataSet cdataset = f.openDataSet( dsname );


    H5::DataSpace space = cdataset.getSpace();

    int rank = space.getSimpleExtentNdims();

    hsize_t dims_out[1];

    int ndims = space.getSimpleExtentDims( dims_out, NULL);

    size_t length = dims_out[0];

    std::vector<const char*> tmpvect( length, NULL );

    fprintf(stdout, "In read STRING dataset, got number of strings: [%ld]\n", length );

    std::vector<std::string> strs(length);
    H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE); 
    cdataset.read( tmpvect.data(), datatype);

    for(size_t x=0; x<tmpvect.size(); ++x)
      {
        fprintf(stdout, "GOT STRING [%s]\n", tmpvect[x] );
        strs[x] = tmpvect[x];
      }

    return strs;
  }

I am late to the party but I've modified Leo Goodstadt's answer based on the comments regarding segfaults. I am on Linux, but I don't have such problems. I wrote 2 functions, one to write a vector of std::string to a dataset of a given name in an open H5File, and another to read back the resulting dataset into a vector of std::string. Note that there may be unnecessary copying between types a few times, which could be optimised further. Here is working code for writing and reading:

void write_varnames( const std::string& dsetname, const std::vector<std::string>& strings, H5::H5File& f)
  {
    H5::Exception::dontPrint();

    try
      {
        // HDF5 only understands vector of char* :-(
        std::vector<const char*> arr_c_str;
        for (size_t ii = 0; ii < strings.size(); ++ii)
      {
        arr_c_str.push_back(strings[ii].c_str());
      }

        //
        //  one dimension
        // 
        hsize_t     str_dimsf[1] {arr_c_str.size()};
        H5::DataSpace   dataspace(1, str_dimsf);

        // Variable length string
        H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE); 
        H5::DataSet str_dataset = f.createDataSet(dsetname, datatype, dataspace);

        str_dataset.write(arr_c_str.data(), datatype);
      }
    catch (H5::Exception& err)
      {
        throw std::runtime_error(std::string("HDF5 Error in ")  
                 + err.getFuncName()
                 + ": "
                 + err.getDetailMsg());


      }
  }

And to read:

std::vector<std::string> read_string_dset( const std::string& dsname, H5::H5File& f )
  {
    H5::DataSet cdataset = f.openDataSet( dsname );


    H5::DataSpace space = cdataset.getSpace();

    int rank = space.getSimpleExtentNdims();

    hsize_t dims_out[1];

    int ndims = space.getSimpleExtentDims( dims_out, NULL);

    size_t length = dims_out[0];

    std::vector<const char*> tmpvect( length, NULL );

    fprintf(stdout, "In read STRING dataset, got number of strings: [%ld]\n", length );

    std::vector<std::string> strs(length);
    H5::StrType datatype(H5::PredType::C_S1, H5T_VARIABLE); 
    cdataset.read( tmpvect.data(), datatype);

    for(size_t x=0; x<tmpvect.size(); ++x)
      {
        fprintf(stdout, "GOT STRING [%s]\n", tmpvect[x] );
        strs[x] = tmpvect[x];
      }

    return strs;
  }
淡水深流 2024-07-21 06:18:51

如你所知,HDF5 文件只接受 char* 格式的数据,即一个地址。
所以最自然的方式就是动态分配一块连续的内存(大小已给定),并将向量的值复制到其中。

char* strs = NULL;
strs = (char*)malloc(date.size() * (date[0].size() + 1) * (char)sizeof(char));

for (int i = 0; i < date.size(); i++) {
    string s = date[i];
    strcpy(strs + i * (date[0].size() + 1), date[i].c_str());
}

完整代码如下所示,

 // Writes `date` as a one-dimensional dataset of fixed-width strings named
 // `dateSetName` in `file_id`. The field width is the length of the longest
 // string plus one byte for the terminating NUL; shorter strings are
 // NUL-padded. Returns true only if every HDF5 call succeeded.
 bool writeString(hid_t file_id, vector<string>& date, string dateSetName) {
    // Width of one slot. Sizing it from date[0] alone would overflow the
    // buffer for any longer string and is undefined for an empty vector.
    size_t size = 1;
    for (size_t i = 0; i < date.size(); i++) {
        if (date[i].size() + 1 > size)
            size = date[i].size() + 1;
    }

    hsize_t dims[1] = { date.size() };
    hid_t dataspace_id = H5Screate_simple(1, dims, NULL);
    hid_t dtype = H5Tcopy(H5T_C_S1);
    hid_t dataset_id = -1;
    char* strs = NULL;

    bool ok = dataspace_id >= 0 && dtype >= 0 && H5Tset_size(dtype, size) >= 0;

    if (ok && !date.empty()) {
        // calloc zero-fills the buffer, which provides the NUL padding.
        strs = static_cast<char*>(calloc(date.size(), size));
        ok = (strs != NULL);
        for (size_t i = 0; ok && i < date.size(); i++) {
            date[i].copy(strs + i * size, date[i].size());
        }
    }

    if (ok) {
        dataset_id = H5Dcreate(file_id, dateSetName.c_str(), dtype, dataspace_id, H5P_DEFAULT,
            H5P_DEFAULT, H5P_DEFAULT);
        ok = dataset_id >= 0;
    }

    // Nothing to write for an empty vector; the empty dataset still exists.
    if (ok && strs != NULL) {
        ok = H5Dwrite(dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs) >= 0;
    }

    // Release whatever was successfully opened, even after a failure.
    if (dataset_id >= 0 && H5Dclose(dataset_id) < 0) ok = false;
    if (dataspace_id >= 0 && H5Sclose(dataspace_id) < 0) ok = false;
    if (dtype >= 0 && H5Tclose(dtype) < 0) ok = false;
    free(strs);
    return ok;
}

不要忘记释放指针。

As you know, an HDF5 file only accepts data in the form of a char*, which is
an address. So the most natural way is to dynamically allocate a contiguous block of memory (of the given size) and copy the values of the vector into it.

char* strs = NULL;
strs = (char*)malloc(date.size() * (date[0].size() + 1) * (char)sizeof(char));

for (int i = 0; i < date.size(); i++) {
    string s = date[i];
    strcpy(strs + i * (date[0].size() + 1), date[i].c_str());
}

The complete code is shown as below,

 // Writes `date` as a one-dimensional dataset of fixed-width strings named
 // `dateSetName` in `file_id`. The field width is the length of the longest
 // string plus one byte for the terminating NUL; shorter strings are
 // NUL-padded. Returns true only if every HDF5 call succeeded.
 bool writeString(hid_t file_id, vector<string>& date, string dateSetName) {
    // Width of one slot. Sizing it from date[0] alone would overflow the
    // buffer for any longer string and is undefined for an empty vector.
    size_t size = 1;
    for (size_t i = 0; i < date.size(); i++) {
        if (date[i].size() + 1 > size)
            size = date[i].size() + 1;
    }

    hsize_t dims[1] = { date.size() };
    hid_t dataspace_id = H5Screate_simple(1, dims, NULL);
    hid_t dtype = H5Tcopy(H5T_C_S1);
    hid_t dataset_id = -1;
    char* strs = NULL;

    bool ok = dataspace_id >= 0 && dtype >= 0 && H5Tset_size(dtype, size) >= 0;

    if (ok && !date.empty()) {
        // calloc zero-fills the buffer, which provides the NUL padding.
        strs = static_cast<char*>(calloc(date.size(), size));
        ok = (strs != NULL);
        for (size_t i = 0; ok && i < date.size(); i++) {
            date[i].copy(strs + i * size, date[i].size());
        }
    }

    if (ok) {
        dataset_id = H5Dcreate(file_id, dateSetName.c_str(), dtype, dataspace_id, H5P_DEFAULT,
            H5P_DEFAULT, H5P_DEFAULT);
        ok = dataset_id >= 0;
    }

    // Nothing to write for an empty vector; the empty dataset still exists.
    if (ok && strs != NULL) {
        ok = H5Dwrite(dataset_id, dtype, H5S_ALL, H5S_ALL, H5P_DEFAULT, strs) >= 0;
    }

    // Release whatever was successfully opened, even after a failure.
    if (dataset_id >= 0 && H5Dclose(dataset_id) < 0) ok = false;
    if (dataspace_id >= 0 && H5Sclose(dataspace_id) < 0) ok = false;
    if (dtype >= 0 && H5Tclose(dtype) < 0) ok = false;
    free(strs);
    return ok;
}

Don't forget to free the pointer.

离笑几人歌 2024-07-21 06:18:51

您可以使用简单的 std::vector 来代替 TempContainer(您也可以将其模板化以匹配 T -> basic_string)。
像这样的东西:

#include <algorithm>
#include <vector>
#include <string>
#include <functional>

// Function object that converts a std::string into a NUL-terminated
// std::vector<char>.
//
// The nested typedefs replace the former std::unary_function base, whose
// template arguments were in the wrong order (argument type comes first)
// and which no longer exists in C++17.
class StringToVector {
public:
  typedef std::string argument_type;
  typedef std::vector<char> result_type;

  // Returns the characters of `s` up to, but not including, the first
  // embedded NUL (or all of `s` if there is none), followed by a single
  // terminating NUL.
  std::vector<char> operator()(const std::string &s) const {
    // s.size() != strlen(s.c_str()) when `s` contains an embedded NUL,
    // so look for the first NUL explicitly.
    const std::string::size_type nul = s.find('\0');
    const std::string::size_type len =
        (nul == std::string::npos) ? s.size() : nul;
    std::vector<char> buf(s.begin(), s.begin() + len);
    buf.push_back('\0');
    return buf;
  }
};

// Converts every string of `vi` into a NUL-terminated character vector and
// stores the results in `vo`, which is resized to hold exactly one entry
// per input string.
void conv(const std::vector<std::string> &vi,
          std::vector<std::vector<char> > &vo)
{
  // std::transform writes through vo.begin(), so `vo` must already have
  // vi.size() elements; make that true instead of relying on the caller.
  vo.resize(vi.size());
  std::transform(vi.begin(), vi.end(),
                 vo.begin(),
                 StringToVector());
}

Instead of a TempContainer, you can use a simple std::vector (you could also templatize it to match T -> basic_string).
Something like this:

#include <algorithm>
#include <vector>
#include <string>
#include <functional>

// Function object that converts a std::string into a NUL-terminated
// std::vector<char>.
//
// The nested typedefs replace the former std::unary_function base, whose
// template arguments were in the wrong order (argument type comes first)
// and which no longer exists in C++17.
class StringToVector {
public:
  typedef std::string argument_type;
  typedef std::vector<char> result_type;

  // Returns the characters of `s` up to, but not including, the first
  // embedded NUL (or all of `s` if there is none), followed by a single
  // terminating NUL.
  std::vector<char> operator()(const std::string &s) const {
    // s.size() != strlen(s.c_str()) when `s` contains an embedded NUL,
    // so look for the first NUL explicitly.
    const std::string::size_type nul = s.find('\0');
    const std::string::size_type len =
        (nul == std::string::npos) ? s.size() : nul;
    std::vector<char> buf(s.begin(), s.begin() + len);
    buf.push_back('\0');
    return buf;
  }
};

// Converts every string of `vi` into a NUL-terminated character vector and
// stores the results in `vo`, which is resized to hold exactly one entry
// per input string.
void conv(const std::vector<std::string> &vi,
          std::vector<std::vector<char> > &vo)
{
  // std::transform writes through vo.begin(), so `vo` must already have
  // vi.size() elements; make that true instead of relying on the caller.
  vo.resize(vi.size());
  std::transform(vi.begin(), vi.end(),
                 vo.begin(),
                 StringToVector());
}
我是男神闪亮亮 2024-07-21 06:18:51

为了能够读取 std::vector<std::string>,我根据 Leo 的提示在这里发布了我的解决方案 https://stackoverflow.com/a/15220532/364818

我混合了 C 和 C++ API。 请随意编辑它并使其更简单。

请注意,当您调用 read 时,HDF5 API 返回一个 char* 指针列表。 这些char*指针在使用后必须释放,否则会出现内存泄漏。

用法示例

H5::Attribute Foo = file.openAttribute("Foo");
std::vector<std::string> foos
Foo >> foos;

这是代码

  // Reads a one-dimensional attribute of variable-length strings into `array`
  // (resized to the attribute's length) and returns `attr0` so that reads
  // can be chained.
  // Throws PBException if the attribute is not one-dimensional or its
  // element size is not that of a pointer, and std::runtime_error if the
  // HDF5 C++ API raises an H5::Exception.
  const H5::Attribute& operator>>(const H5::Attribute& attr0, std::vector<std::string>& array)
  {
      H5::Exception::dontPrint();

      try
      {
          // Use the C++ wrapper objects rather than raw hid_t handles from
          // H5Aget_space/H5Aget_type: those handles were never closed, and
          // would also leak whenever one of the checks below throws.
          const H5::DataSpace aspace = attr0.getSpace();
          const int rank = aspace.getSimpleExtentNdims();
          if (rank != 1) throw PBException("Attribute " + attr0.getName() + " is not a string array");

          hsize_t sdim[1];
          aspace.getSimpleExtentDims(sdim, NULL);

          const H5::DataType atype = attr0.getDataType();
          if (atype.getSize() != sizeof(void*))
          {
              throw PBException("Internal inconsistency. Expected pointer size element");
          }

          // HDF5 only understands vector of char* :-(
          std::vector<char*> arr_c_str(sdim[0], NULL);

          H5::StrType stringType(H5::PredType::C_S1, H5T_VARIABLE);
          if (!arr_c_str.empty())
          {
              attr0.read(stringType, arr_c_str.data());
          }
          array.resize(arr_c_str.size());
          for (size_t i = 0; i < arr_c_str.size(); i++)
          {
              // A null pointer must not be assigned to std::string; treat
              // it as an empty string.
              array[i] = (arr_c_str[i] != NULL) ? arr_c_str[i] : "";
              // The buffers filled in by read() have to be released by the
              // caller, otherwise they leak.
              free(arr_c_str[i]);
          }

      }
      catch (const H5::Exception& err)
      {
          throw std::runtime_error(std::string("HDF5 Error in " )
                                    + err.getFuncName()
                                    + ": "
                                    + err.getDetailMsg());
      }

      return attr0;
  }

In the interest of having the ability to read std::vector<std::string> I'm posting my solution, based on the hints from Leo here https://stackoverflow.com/a/15220532/364818.

I've mixed C and C++ APIs. Please feel free to edit this and make it simpler.

Note that the HDF5 API returns a list of char* pointers when you call read. These char* pointers must be freed after use, otherwise there is a memory leak.

Usage example

H5::Attribute Foo = file.openAttribute("Foo");
std::vector<std::string> foos
Foo >> foos;

Here's the code

  // Reads a one-dimensional attribute of variable-length strings into `array`
  // (resized to the attribute's length) and returns `attr0` so that reads
  // can be chained.
  // Throws PBException if the attribute is not one-dimensional or its
  // element size is not that of a pointer, and std::runtime_error if the
  // HDF5 C++ API raises an H5::Exception.
  const H5::Attribute& operator>>(const H5::Attribute& attr0, std::vector<std::string>& array)
  {
      H5::Exception::dontPrint();

      try
      {
          // Use the C++ wrapper objects rather than raw hid_t handles from
          // H5Aget_space/H5Aget_type: those handles were never closed, and
          // would also leak whenever one of the checks below throws.
          const H5::DataSpace aspace = attr0.getSpace();
          const int rank = aspace.getSimpleExtentNdims();
          if (rank != 1) throw PBException("Attribute " + attr0.getName() + " is not a string array");

          hsize_t sdim[1];
          aspace.getSimpleExtentDims(sdim, NULL);

          const H5::DataType atype = attr0.getDataType();
          if (atype.getSize() != sizeof(void*))
          {
              throw PBException("Internal inconsistency. Expected pointer size element");
          }

          // HDF5 only understands vector of char* :-(
          std::vector<char*> arr_c_str(sdim[0], NULL);

          H5::StrType stringType(H5::PredType::C_S1, H5T_VARIABLE);
          if (!arr_c_str.empty())
          {
              attr0.read(stringType, arr_c_str.data());
          }
          array.resize(arr_c_str.size());
          for (size_t i = 0; i < arr_c_str.size(); i++)
          {
              // A null pointer must not be assigned to std::string; treat
              // it as an empty string.
              array[i] = (arr_c_str[i] != NULL) ? arr_c_str[i] : "";
              // The buffers filled in by read() have to be released by the
              // caller, otherwise they leak.
              free(arr_c_str[i]);
          }

      }
      catch (const H5::Exception& err)
      {
          throw std::runtime_error(std::string("HDF5 Error in " )
                                    + err.getFuncName()
                                    + ": "
                                    + err.getDetailMsg());
      }

      return attr0;
  }
青衫负雪 2024-07-21 06:18:51

我不知道 HDF5,但您可以使用

// Record holding a single pointer to a C string instead of a fixed-size
// character array.
struct TempContainer {
    char* string;
};

这种方式然后复制字符串:

// Wrap a strdup() duplicate of the current string in a record and store it.
TempContainer t = { strdup(i->c_str()) };
tc.push_back (t);

这将分配一个大小恰好合适的字符串,并且在向容器插入或从容器读取时也有很大改进(在您的示例中复制的是一个数组,而在这种情况下只复制一个指针)。 您还可以使用 std::vector<char *>:

// Each stored pointer comes from strdup() and has to be released with
// free() once it is no longer needed.
std::vector<char *> tc;
...
tc.push_back(strdup(i->c_str()));

I don't know about HDF5, but you can use

// Record holding a single pointer to a C string instead of a fixed-size
// character array.
struct TempContainer {
    char* string;
};

and then copy the strings this way:

// Wrap a strdup() duplicate of the current string in a record and store it.
TempContainer t = { strdup(i->c_str()) };
tc.push_back (t);

This will allocate a string with the exact size, and is also a big improvement when inserting into or reading from the container (in your example a whole array is copied; in this case only a pointer). You can also use std::vector<char *>:

// Each stored pointer comes from strdup() and has to be released with
// free() once it is no longer needed.
std::vector<char *> tc;
...
tc.push_back(strdup(i->c_str()));
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文