CURLOPT_WRITEFUNCTION 指向成员函数的指针

发布于 2024-10-03 15:24:11 字数 5786 浏览 2 评论 0原文

我试图把这个 curl 函数放进我的类里,但在 CURLOPT_WRITEFUNCTION 上遇到了问题。根据编译错误信息也没有找到解决办法。我还尝试了 stackoverflow 上的一些做法,但都没有成功。

这是我的尝试(替换此代码中的“writer”)

node::writer
&node::writer
std::bind1st(std::mem_fun(&node::writer), this);

这是我的代码:

#ifndef NODE_H_
#define NODE_H_

// Write callback handed to libcurl through CURLOPT_WRITEFUNCTION; appends the
// received bytes to *buffer. Defined at the bottom of this header.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

/*
 * A single crawled page: downloads the page source with libcurl and scrapes
 * its anchor and heading tags out of that source with regular expressions.
 */
class node {
 /*
  * holds one scraped html element (its text and its properties);
  * defined after the class
  */
 struct tag;

 /*
  * the url the page was loaded from and the downloaded source of the page
  */
 std::string url;
 std::string source;

 /*
  * the scraped elements, filled by get_headings() / get_anchors()
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * download `url` with curl and return the page source
  * (an empty string when the download fails)
  */
 std::string curlHttpget(const std::string &url);

 /*
  * append a tag built from (text, properties) to the matching vector
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors: the default one leaves the node empty; the one taking a
  * seed url loads the page and scrapes its anchors and headings
  */
 node(){}
 node(std::string);

 /*
  * destructor
  */
 ~node(){}

 /*
  * crawl page
  */
 void load(std::string seed);// stores the seed as url and downloads its source

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags out of source
 void display_anchors();// print the stored anchors to std::cout

 /*
  * heading tags
  */
 void get_headings();// scrape the heading tags out of source
 void display_headings();// print the stored headings to std::cout
};
/*
 * One stored html element: the element's text together with the
 * properties string that was scraped alongside it.
 */
struct node::tag {
 std::string text;
 std::string properties;

 tag(std::string elementText, std::string elementProperties)
  : text(elementText),
    properties(elementProperties) {}
};

/*
 * Construct a node from a seed url: download the page first, then scrape
 * its anchor tags and its heading tags from the downloaded source.
 */
node::node(std::string seed) {
 this->load(seed);
 this->get_anchors();
 this->get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

/*
 * Use regex to find the anchors in `source` and store each of them through
 * add_anchor(text, link).
 *
 * The href value is turned into the stored link as follows:
 *   - "/somewhere"                  -> url + href
 *   - "http://www.somewhere.com"    -> href unchanged (also https, ftp, sftp)
 *   - "somewhere.html"              -> url + "/" + href
 *   - anything else                 -> reported on std::cout, href stored as is
 */
void node::get_anchors() {
 // Alternatives must be written as groups: a bracket expression such as
 // [href|HREF] matches only ONE character out of that set, not the word.
 static const regex expression("<[aA].*?(?:href|HREF)\\s*=[\"'](?<url>.*?)[\"'].*?>(?<name>.*?)</[aA]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");

 // for every match the iterator yields sub-match 1 (href value) and then
 // sub-match 2 (link text)
 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text, properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   // second token of the pair: the anchor is complete, store it
   text = m;
   add_anchor(text, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
    // keep the raw href so this anchor is not stored with the link of
    // the previous anchor
    properties = m;
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

// Write callback required by libcurl: appends the size * nmemb bytes found at
// `data` to *buffer and returns that byte count. Returns 0 without touching
// anything when no buffer was supplied.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 if (buffer == NULL) {
  return 0;
 }

 const std::size_t byteCount = size * nmemb;
 buffer->append(data, byteCount);
 return byteCount;
}

#endif /* NODE_H_ */

寻找一种解决方案,使函数“int writer”成为“int node::writer”。当我调用 CURLOPT_WRITEFUNCTION 时,问题出现在 std::string node::curlHttpget 中。

&node::writer 编译但出现段错误 =/

谢谢

I'm trying to include this curl function in my class, but I'm having trouble with CURLOPT_WRITEFUNCTION. Following the compiler errors didn't lead me to a solution. I also tried some things based on Stack Overflow answers, to no avail.

Here's my attempt (replacing 'writer' in this code)

node::writer
&node::writer
std::bind1st(std::mem_fun(&node::writer), this);

Here's my code:

#ifndef NODE_H_
#define NODE_H_

// Write callback handed to libcurl through CURLOPT_WRITEFUNCTION; appends the
// received bytes to *buffer. Defined at the bottom of this header.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

/*
 * A single crawled page: downloads the page source with libcurl and scrapes
 * its anchor and heading tags out of that source with regular expressions.
 */
class node {
 /*
  * holds one scraped html element (its text and its properties);
  * defined after the class
  */
 struct tag;

 /*
  * the url the page was loaded from and the downloaded source of the page
  */
 std::string url;
 std::string source;

 /*
  * the scraped elements, filled by get_headings() / get_anchors()
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * download `url` with curl and return the page source
  * (an empty string when the download fails)
  */
 std::string curlHttpget(const std::string &url);

 /*
  * append a tag built from (text, properties) to the matching vector
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors: the default one leaves the node empty; the one taking a
  * seed url loads the page and scrapes its anchors and headings
  */
 node(){}
 node(std::string);

 /*
  * destructor
  */
 ~node(){}

 /*
  * crawl page
  */
 void load(std::string seed);// stores the seed as url and downloads its source

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags out of source
 void display_anchors();// print the stored anchors to std::cout

 /*
  * heading tags
  */
 void get_headings();// scrape the heading tags out of source
 void display_headings();// print the stored headings to std::cout
};
/*
 * One stored html element: the element's text together with the
 * properties string that was scraped alongside it.
 */
struct node::tag {
 std::string text;
 std::string properties;

 tag(std::string elementText, std::string elementProperties)
  : text(elementText),
    properties(elementProperties) {}
};

/*
 * Construct a node from a seed url: download the page first, then scrape
 * its anchor tags and its heading tags from the downloaded source.
 */
node::node(std::string seed) {
 this->load(seed);
 this->get_anchors();
 this->get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

/*
 * Use regex to find the anchors in `source` and store each of them through
 * add_anchor(text, link).
 *
 * The href value is turned into the stored link as follows:
 *   - "/somewhere"                  -> url + href
 *   - "http://www.somewhere.com"    -> href unchanged (also https, ftp, sftp)
 *   - "somewhere.html"              -> url + "/" + href
 *   - anything else                 -> reported on std::cout, href stored as is
 */
void node::get_anchors() {
 // Alternatives must be written as groups: a bracket expression such as
 // [href|HREF] matches only ONE character out of that set, not the word.
 static const regex expression("<[aA].*?(?:href|HREF)\\s*=[\"'](?<url>.*?)[\"'].*?>(?<name>.*?)</[aA]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");

 // for every match the iterator yields sub-match 1 (href value) and then
 // sub-match 2 (link text)
 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text, properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   // second token of the pair: the anchor is complete, store it
   text = m;
   add_anchor(text, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
    // keep the raw href so this anchor is not stored with the link of
    // the previous anchor
    properties = m;
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

// Write callback required by libcurl: appends the size * nmemb bytes found at
// `data` to *buffer and returns that byte count. Returns 0 without touching
// anything when no buffer was supplied.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 if (buffer == NULL) {
  return 0;
 }

 const std::size_t byteCount = size * nmemb;
 buffer->append(data, byteCount);
 return byteCount;
}

#endif /* NODE_H_ */

looking for a solution to get the function 'int writer' to be "int node::writer". the problem occurs in std::string node::curlHttpget, when I call CURLOPT_WRITEFUNCTION.

&node::writer compiles but gives a seg fault =/

thanks

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

风情万种。 2024-10-10 15:24:11

不要使用 std::string* 使用 node* 作为参数,或者使用另一个类似 HttpGet 的类,该类具有 std::string 和返回节点的指针,以便它可以写入字符串并在每次调用时访问您的节点。

boost::bind 不适用于 C-API 回调。

它之所以能编译,是因为 curl_easy_setopt 使用了可变参数(...),因此完全不是类型安全的。你可以把任何类型传给它,它都能编译。但它很可能无法正常运行,正如你吃了苦头才发现的那样。

为了获得额外的类型安全性,我会让你的函数具有与 curl_write_callback 完全相同的签名,即第四个参数为 void*,并在函数实现中进行类型转换。

Instead of using std::string* use node* as the parameter or another class like HttpGet that has a std::string and a pointer back to your node so it can write to the string and access your node on each call.

boost::bind won't work for C-API callbacks.

It compiles because curl_easy_setopt uses ... so is totally not typesafe. You can pass it any type you want under the sun and it will compile. It probably won't run though, as you found to your cost.

I would go for the extra type-safety of making your function have exactly the same signature as curl_write_callback, i.e. void* as the 4th parameter, and do the casting in the function implementation.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文