CURLOPT_WRITEFUNCTION 指向成员函数的指针

发布于 2024-10-03 15:24:11 字数 5786 浏览 4 评论 0原文

我试图把这个 curl 函数放进我的类中，但在 CURLOPT_WRITEFUNCTION 上遇到了问题。根据编译错误信息没能找到解决办法，也尝试了 Stack Overflow 上的一些方法，但都没有成功。

这是我的尝试（替换此代码中的“writer”）

node::writer
&node::writer
std::bind1st(std::mem_fun(&node::writer), this);

这是我的代码：

#ifndef NODE_H_
#define NODE_H_

// Write callback handed to libcurl through CURLOPT_WRITEFUNCTION: appends the
// size * nmemb bytes at `data` to `*buffer` and returns the number of bytes
// consumed (0 when `buffer` is NULL). Defined at the bottom of this header.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

/*
 * Holds one fetched page: its url, the raw source returned by
 * curlHttpget(), and the heading / anchor tags scraped from that source.
 */
class node {
 /*
  * general struct to hold html element properties (defined after the class)
  */
 struct tag;

 /*
  * the url and source of the page
  */
 std::string url;
 std::string source;

 /*
  *  vectors of structures that store the scraped tag elements
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * grab source with curl; returns the page body, or "" when the fetch fails
  */
 std::string curlHttpget(const std::string &url);

 /*
  * append a tag struct built from (text, properties) to the matching vector
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors: the default one leaves the node empty; the string one
  * loads the given url and scrapes its anchors and headings
  */
 node(){}
 node(std::string);

 /*
  * destructor
  */
 ~node(){}

 /*
  * crawl page: stores seed as the url and downloads its source
  */
 void load(std::string seed);// crawls the page

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags into `anchor`
 void display_anchors();// print every stored anchor to stdout

 /*
  * heading tags
  */
 void get_headings();// scrape heading tags into `heading`
 void display_headings();// print every stored heading to stdout
};
/*
 * One scraped html element: the text found between its opening and closing
 * tags, plus the `properties` string stored with it (the attribute text for
 * headings, the link for anchors).
 */
struct node::tag {
 std::string text;
 std::string properties;

 // Copies both strings into the new element.
 tag(std::string t, std::string p)
  : text(t),
    properties(p)
 {
 }
};

/*
 * constructors
 *
 * Builds a node from a seed url: downloads the page, then scrapes its
 * anchor and heading tags. load() runs first because both scrapers read
 * the `source` member that it fills in.
 */
node::node(std::string seed) {
 load(seed);
 get_anchors();
 get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page: remember the seed as this node's url and store whatever
// curlHttpget() returns for it (an empty string when the fetch fails)
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

//use regex to find anchors in source
//
// Scans `source` for <a ... href="...">text</a> elements and stores each one
// through add_anchor(text, link). The link is derived from the href value:
//   starts with "/"                          -> url + href
//   starts with http://, https://, ftp://,
//   sftp:// (lower or upper case)            -> href unchanged
//   starts with another valid url character  -> url + "/" + href
//   anything else                            -> reported on stdout, href stored unchanged
// NOTE(review): relative links are appended to the whole page url rather than
// to its origin/directory - confirm that this is the intended resolution.
void node::get_anchors() {
 // "href" and the protocol names must be alternations, not character
 // classes: "[href|HREF]" matches any ONE of those letters, which made
 // attributes such as name="..." pass for an href and made links like
 // "help.html" pass for absolute urls. The tag name is also required to be
 // exactly "a" (followed by whitespace) and attribute scanning stays inside
 // the opening tag ([^>]) so a match cannot run across other elements.
 static const regex expression("<[aA]\\s(?:[^>]*?\\s)?(?:href|HREF)\\s*=\\s*[\"'](?<url>.*?)[\"'][^>]*>(?<name>.*?)</[aA]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");

 // sub-match 1 is the href value, sub-match 2 the link text; the iterator
 // yields them alternately for every match
 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text, properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   // odd token: the link text completes the pair started by the href
   text = m;
   add_anchor(text, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
    // keep the raw href; otherwise this anchor would be stored with the
    // url left over from the previous link
    properties = m;
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

//required by libcurl
//
// Appends the size * nmemb bytes starting at `data` to `*buffer` and returns
// that byte count. Returns 0 without touching anything when `buffer` is NULL.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 if (buffer == NULL) {
  return 0;
 }

 const std::size_t byteCount = size * nmemb;
 buffer->append(data, byteCount);
 return static_cast<int>(byteCount);
}

#endif /* NODE_H_ */

寻找一种解决方案，使函数“int writer”成为“int node::writer”。当我调用 CURLOPT_WRITEFUNCTION 时，问题出现在 std::string node::curlHttpget 中。

&node::writer 编译但出现段错误 =/

谢谢

原文

I'm trying to include this curl function in my class but am having trouble with CURLOPT_WRITEFUNCTION. Following the compiler errors didn't lead me to a solution. I also tried some things based on Stack Overflow, to no avail.

Here's my attempt (replacing 'writer' in this code)

node::writer
&node::writer
std::bind1st(std::mem_fun(&node::writer), this);

Here's my code:

#ifndef NODE_H_
#define NODE_H_

// Write callback handed to libcurl through CURLOPT_WRITEFUNCTION: appends the
// size * nmemb bytes at `data` to `*buffer` and returns the number of bytes
// consumed (0 when `buffer` is NULL). Defined at the bottom of this header.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer);

/*
 * function prototypes
 */

/*
 * Holds one fetched page: its url, the raw source returned by
 * curlHttpget(), and the heading / anchor tags scraped from that source.
 */
class node {
 /*
  * general struct to hold html element properties (defined after the class)
  */
 struct tag;

 /*
  * the url and source of the page
  */
 std::string url;
 std::string source;

 /*
  *  vectors of structures that store the scraped tag elements
  */
 std::vector<tag> heading;
 std::vector<tag> anchor;

 /*
  * grab source with curl; returns the page body, or "" when the fetch fails
  */
 std::string curlHttpget(const std::string &url);

 /*
  * append a tag struct built from (text, properties) to the matching vector
  * @see std::vector<tag> heading
  * @see std::vector<tag> anchor
  */
 void add_heading(std::string, std::string);
 void add_anchor(std::string, std::string);

public:
 /*
  * constructors: the default one leaves the node empty; the string one
  * loads the given url and scrapes its anchors and headings
  */
 node(){}
 node(std::string);

 /*
  * destructor
  */
 ~node(){}

 /*
  * crawl page: stores seed as the url and downloads its source
  */
 void load(std::string seed);// crawls the page

 /*
  * anchor tags
  */
 void get_anchors();// scrape the anchor tags into `anchor`
 void display_anchors();// print every stored anchor to stdout

 /*
  * heading tags
  */
 void get_headings();// scrape heading tags into `heading`
 void display_headings();// print every stored heading to stdout
};
/*
 * One scraped html element: the text found between its opening and closing
 * tags, plus the `properties` string stored with it (the attribute text for
 * headings, the link for anchors).
 */
struct node::tag {
 std::string text;
 std::string properties;

 // Copies both strings into the new element.
 tag(std::string t, std::string p)
  : text(t),
    properties(p)
 {
 }
};

/*
 * constructors
 *
 * Builds a node from a seed url: downloads the page, then scrapes its
 * anchor and heading tags. load() runs first because both scrapers read
 * the `source` member that it fills in.
 */
node::node(std::string seed) {
 load(seed);
 get_anchors();
 get_headings();
}
/*
 * araneus::subroutines
 */

// crawl the page: remember the seed as this node's url and store whatever
// curlHttpget() returns for it (an empty string when the fetch fails)
void node::load(std::string seed) {
 url = seed;
 source = curlHttpget(url);
}


//scrape html source
std::string node::curlHttpget(const std::string &url) {
 std::string buffer;

 CURL *curl;
 CURLcode result;

 curl = curl_easy_init();

 if (curl) {
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HEADER, 0);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);

  result = curl_easy_perform(curl);//http get performed

  curl_easy_cleanup(curl);//must cleanup

  //error codes: http://curl.haxx.se/libcurl/c/libcurl-errors.html
  if (result == CURLE_OK) {
   return buffer;
  }
  //curl_easy_strerror was added in libcurl 7.12.0
  std::cerr << "error: " << result << " " << curl_easy_strerror(result) << std::endl;
  return "";
 }

 std::cerr << "error: could not initalize curl" << std::endl;
 return "";
}

void node::get_headings() {
 static const regex expression("<[hH][1-6]\\s*(?<properties>.*?)\\s*>(?<name>.*?)</\\s*[hH][1-6]\\s*>");

 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text;
 string properties;

 int count = 0;
 for (;p != end; count++, ++p)
 {
  string m(p->first, p->second);

  if(count % 2) {
   text = m;
   add_heading(text, properties);
  }
  else {
   properties = m;
  }
 }
}

//use regex to find anchors in source
//
// Scans `source` for <a ... href="...">text</a> elements and stores each one
// through add_anchor(text, link). The link is derived from the href value:
//   starts with "/"                          -> url + href
//   starts with http://, https://, ftp://,
//   sftp:// (lower or upper case)            -> href unchanged
//   starts with another valid url character  -> url + "/" + href
//   anything else                            -> reported on stdout, href stored unchanged
// NOTE(review): relative links are appended to the whole page url rather than
// to its origin/directory - confirm that this is the intended resolution.
void node::get_anchors() {
 // "href" and the protocol names must be alternations, not character
 // classes: "[href|HREF]" matches any ONE of those letters, which made
 // attributes such as name="..." pass for an href and made links like
 // "help.html" pass for absolute urls. The tag name is also required to be
 // exactly "a" (followed by whitespace) and attribute scanning stays inside
 // the opening tag ([^>]) so a match cannot run across other elements.
 static const regex expression("<[aA]\\s(?:[^>]*?\\s)?(?:href|HREF)\\s*=\\s*[\"'](?<url>.*?)[\"'][^>]*>(?<name>.*?)</[aA]>");
 static const regex relative("^\\/");
 static const regex firstChar("^[A-Za-z0-9\\-_\\$\\.\\+!\\*'\\(\\)#]"); // valid url characters
 static const regex protocol("^(?:http|HTTP|https|HTTPS|ftp|FTP|sftp|SFTP):\\/\\/");

 // sub-match 1 is the href value, sub-match 2 the link text; the iterator
 // yields them alternately for every match
 int const subMatches[] = { 1, 2 };

 sregex_token_iterator p(source.begin(), source.end(), expression, subMatches);
 sregex_token_iterator end;

 string text, properties;

 int count = 0;
 for (; p != end; count++, ++p) {
  std::string m(p->first, p->second);

  if(count % 2) {
   // odd token: the link text completes the pair started by the href
   text = m;
   add_anchor(text, properties);
  }
  else {
   if(regex_search(m, relative)) { //if link is in "/somewhere" format
    properties = url + m;
   }
   else if(regex_search(m, protocol)) { //if link is absolute "http://www.somewhere.com"
    properties = m;
   }
   else if(regex_search(m, firstChar)) { //if link starts with a valid url char "somewhere.html"
    properties = url + "/" + m;
   }
   else {
    std::cout << "link of unknown protocol: " << m << std::endl;
    // keep the raw href; otherwise this anchor would be stored with the
    // url left over from the previous link
    properties = m;
   }
  }
 }
}

void node::add_heading(std::string text, std::string properties) {
 heading.push_back(tag(text, properties));
}

void node::display_headings() {
 for(int i = 0; i < (int)heading.size(); i++) {
  std::cout<< "[h]: " << heading[i].text << endl;
  std::cout<< "[h.properties]: " << heading[i].properties << endl;
 }
 cout << "found " << (int)heading.size() << " <h[1-6]> tags" << endl;
}

void node::add_anchor(std::string text, std::string properties) {
 anchor.push_back(tag(text, properties));
}

void node::display_anchors() {
 for(int i = 0; i < (int)anchor.size(); i++) {
  std::cout<< "[a]: " << anchor[i].text << endl;
  std::cout<< "[a.properties]: " << anchor[i].properties << endl;
 }
 cout << "found " << (int)anchor.size() << " <a> tags" << endl;
}

//required by libcurl
//
// Appends the size * nmemb bytes starting at `data` to `*buffer` and returns
// that byte count. Returns 0 without touching anything when `buffer` is NULL.
int writer(char *data, std::size_t size, std::size_t nmemb, std::string *buffer) {
 if (buffer == NULL) {
  return 0;
 }

 const std::size_t byteCount = size * nmemb;
 buffer->append(data, byteCount);
 return static_cast<int>(byteCount);
}

#endif /* NODE_H_ */

looking for a solution to get the function 'int writer' to be "int node::writer". the problem occurs in std::string node::curlHttpget, when I call CURLOPT_WRITEFUNCTION.

&node::writer compiles but gives a seg fault =/

thanks

分享到QQ

分享到微博