在 Boost::Spirit::Qi 中解析不完整的文本

发布于 2025-02-05 15:35:54 字数 4279 浏览 1 评论 0 原文

我想解析通过 ifstream::read 读取的文本。我遇到的问题是:解析器在读到不完整的文本时总是抛出期望失败(expectation failure)。下面是我的解析器代码。

// Tokenising grammar for XML-like input, written with Boost.Spirit Qi.
// Every rule is declared with a std::string attribute and the start rule is
// xml_parser. print_action is used by the semantic actions below but is not
// defined anywhere in this excerpt.
template <typename It, typename Skipper= qi::space_type>
struct xmlparser: qi::grammar<It, std::string(), Skipper>{
    xmlparser(): xmlparser::base_type(xml_parser){
        using qi::lit;
        using qi::lexeme;
        using ascii::char_;
        using boost::phoenix::ref;
        using qi::debug;
        using boost::spirit::ascii::space;
        
        // Matches a single tab, CR, LF or space character.
        skipper= qi::char_("\t\r\n "); //qi::skip(skipper.alias())
        // NOTE: '>>' binds tighter than '|', so this reads as
        //   (!lit('<') >> +(char_ - '<'))  |  lit('\'')  |  lit('"')
        // i.e. either a run of characters up to the next '<', or one quote.
        text = !lit('<') >> +(qi::char_ - qi::char_("<")) | lit('\'') | lit('\"');
        // "<?" followed by one or more characters other than '?', then "?>".
        prolog = "<?" >> +(qi::char_ - '?') >> "?>";
        // First character: letter, ':' or '_'; then any of "-a-zA-Z0-9:_".
        // lexeme[] keeps the skipper from running between the characters.
        name = lexeme[qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_")];
        // Double- or single-quoted value with at least one character.
        // '>' is Qi's expectation operator: once the opening quote has matched,
        // a missing body or closing quote raises qi::expectation_failure
        // instead of letting the rule fail and backtrack.
        attribute_value =
                    '"' > +(char_ - char_("<&\"")) > '"'
                    | '\'' > +(char_ - char_("<&'")) > '\''
                ;
        // name '=' value, again joined by expectation points.
        attribute = name[print_action("ATT")] >  '=' >  attribute_value[print_action("ATT VALUE")];
        // '<' name attributes* '>' where the tag neither starts nor ends with '/'.
        start_tag %= '<' >> !lit('/') >> name >> *(attribute) >> !lit('/')>> '>';
        // "</" name '>'
        end_tag = "</" >> name >> '>';
        // '<' name attributes* "/>"
        empty_tag =  '<' >> name >> *(attribute) >> "/>";
        // Zero or more tokens, tried in the order listed. The counter
        // increments are commented out, so only print_action fires.
        xml_parser = 
            *(text/*[print_action("TEXT")]*/ 
                |start_tag[/*++ref(open_tag_count)*/print_action("OPEN")] 
                | end_tag[/*++ref(end_tag_count)*/print_action("END")] 
                | empty_tag[/*++ref(empty_tag_count)*/print_action("EMPTY")] 
                | prolog
                | skipper
            );
        }

        // Returns the empty-tag counter.
        int get_empty_tag_count(){
            return empty_tag_count;
        }

        // Returns the start-tag counter.
        int get_open_tag_count(){
            return open_tag_count;
        }

        // Returns the end-tag counter.
        int get_end_tag_count(){
            return end_tag_count;
        }

        private:
            // Token counters; nothing in this excerpt modifies them because the
            // increments in the grammar are commented out.
            int open_tag_count= 0;
            int end_tag_count= 0;
            int empty_tag_count= 0;
            int text_count=0;

            // Rules declared without the Skipper argument do not skip input
            // themselves; the others use the grammar's Skipper.
            qi::rule<It> skipper;
            qi::rule<It, std::string()> text;
            qi::rule<It, std::string()> prolog;
            qi::rule<It, std::string(),Skipper> name;
            qi::rule<It, std::string()> attribute_value;
            qi::rule<It, std::string(),Skipper> attribute;
            qi::rule<It, std::string(),Skipper> start_tag;
            qi::rule<It, std::string(),Skipper> end_tag;
            qi::rule<It, std::string(),Skipper> empty_tag;
            qi::rule<It, std::string(),Skipper> xml_parser;
};

使用 ifstream::getline 读取时没有任何问题,因为送入解析器的文本可以视为完整的。但使用 ifstream::read 读取时,如果 char[bufsize] 缓冲区恰好在某个 XML 属性的中间结束,解析器就会抛出期望失败。

未完成文本的示例

</description>
<shipping>Will ship only within country, See description for charges</shipping>
<incategory category="category317"/>
<incategory category="categ

读取字符的函数,

char * buffer= new char[bufsize];
input_file.read(buffer,bufsize);
std::string bufstring(buffer);
if (extra != ""){
   bufstring = extra + bufstring;
   extra= "";
}

我想知道能否取回解析失败的那部分文本,并把它拼接到下一次读取的内容之前,因为下一次读取包含这段不完整文本的后续部分。我尝试过用 try/catch 把未能解析的文本留给下一次读取,但似乎不起作用。

    // Prepend whatever was left unparsed from the previous chunk.
    if (extra != ""){
        bufstring = extra + bufstring;
        extra= "";
    }
    // std::cout << bufstring << std::endl << std::endl;
    std::string::const_iterator iter= bufstring.begin();
    std::string::const_iterator end= bufstring.end();
    try{
        bool r= qi::phrase_parse(iter,end,xml_parser,qi::space);
        if (!r){
            std::cout << "Error found" << std::endl;
            extra = std::string(iter,end);
            std::cout << extra << std::endl;
            // NOTE(review): buffer is released only on this path in the
            // excerpt; confirm the other exits release it as well.
            delete[] buffer;
            return;
        }
        if (iter!=end){
            // Carry the unconsumed tail over to the next chunk.
            extra = std::string(iter,end);
            // std::cout << extra << std::endl;
        }
    // The exception is parameterised on the iterator type handed to
    // phrase_parse. The input here is iterated with std::string::const_iterator,
    // so a handler for expectation_failure<char const*> would never match.
    } catch (qi::expectation_failure<std::string::const_iterator> const& e){
        // NOTE(review): how far iter has advanced when the exception propagates
        // depends on Spirit internals; confirm it is a valid resume point.
        std::cout<< std::string(iter,end) << std::endl;
        extra = std::string(iter,end);
    }

I want to parse text that is read using ifstream::read. The problem I'm facing is that the parser always throws an expectation failure when it reads unfinished text. Here is my code for the parser.

// Tokenising grammar for XML-like input, written with Boost.Spirit Qi.
// Every rule is declared with a std::string attribute and the start rule is
// xml_parser. print_action is used by the semantic actions below but is not
// defined anywhere in this excerpt.
template <typename It, typename Skipper= qi::space_type>
struct xmlparser: qi::grammar<It, std::string(), Skipper>{
    xmlparser(): xmlparser::base_type(xml_parser){
        using qi::lit;
        using qi::lexeme;
        using ascii::char_;
        using boost::phoenix::ref;
        using qi::debug;
        using boost::spirit::ascii::space;
        
        // Matches a single tab, CR, LF or space character.
        skipper= qi::char_("\t\r\n "); //qi::skip(skipper.alias())
        // NOTE: '>>' binds tighter than '|', so this reads as
        //   (!lit('<') >> +(char_ - '<'))  |  lit('\'')  |  lit('"')
        // i.e. either a run of characters up to the next '<', or one quote.
        text = !lit('<') >> +(qi::char_ - qi::char_("<")) | lit('\'') | lit('\"');
        // "<?" followed by one or more characters other than '?', then "?>".
        prolog = "<?" >> +(qi::char_ - '?') >> "?>";
        // First character: letter, ':' or '_'; then any of "-a-zA-Z0-9:_".
        // lexeme[] keeps the skipper from running between the characters.
        name = lexeme[qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_")];
        // Double- or single-quoted value with at least one character.
        // '>' is Qi's expectation operator: once the opening quote has matched,
        // a missing body or closing quote raises qi::expectation_failure
        // instead of letting the rule fail and backtrack.
        attribute_value =
                    '"' > +(char_ - char_("<&\"")) > '"'
                    | '\'' > +(char_ - char_("<&'")) > '\''
                ;
        // name '=' value, again joined by expectation points.
        attribute = name[print_action("ATT")] >  '=' >  attribute_value[print_action("ATT VALUE")];
        // '<' name attributes* '>' where the tag neither starts nor ends with '/'.
        start_tag %= '<' >> !lit('/') >> name >> *(attribute) >> !lit('/')>> '>';
        // "</" name '>'
        end_tag = "</" >> name >> '>';
        // '<' name attributes* "/>"
        empty_tag =  '<' >> name >> *(attribute) >> "/>";
        // Zero or more tokens, tried in the order listed. The counter
        // increments are commented out, so only print_action fires.
        xml_parser = 
            *(text/*[print_action("TEXT")]*/ 
                |start_tag[/*++ref(open_tag_count)*/print_action("OPEN")] 
                | end_tag[/*++ref(end_tag_count)*/print_action("END")] 
                | empty_tag[/*++ref(empty_tag_count)*/print_action("EMPTY")] 
                | prolog
                | skipper
            );
        }

        // Returns the empty-tag counter.
        int get_empty_tag_count(){
            return empty_tag_count;
        }

        // Returns the start-tag counter.
        int get_open_tag_count(){
            return open_tag_count;
        }

        // Returns the end-tag counter.
        int get_end_tag_count(){
            return end_tag_count;
        }

        private:
            // Token counters; nothing in this excerpt modifies them because the
            // increments in the grammar are commented out.
            int open_tag_count= 0;
            int end_tag_count= 0;
            int empty_tag_count= 0;
            int text_count=0;

            // Rules declared without the Skipper argument do not skip input
            // themselves; the others use the grammar's Skipper.
            qi::rule<It> skipper;
            qi::rule<It, std::string()> text;
            qi::rule<It, std::string()> prolog;
            qi::rule<It, std::string(),Skipper> name;
            qi::rule<It, std::string()> attribute_value;
            qi::rule<It, std::string(),Skipper> attribute;
            qi::rule<It, std::string(),Skipper> start_tag;
            qi::rule<It, std::string(),Skipper> end_tag;
            qi::rule<It, std::string(),Skipper> empty_tag;
            qi::rule<It, std::string(),Skipper> xml_parser;
};

I do not have any issues when I read the text using ifstream::getline, since the text fed into the parser can be considered complete. However, when I read the text using ifstream::read and the char[bufsize] buffer happens to end in the middle of an XML attribute, the parser throws an expectation failure.

Example of unfinished text

</description>
<shipping>Will ship only within country, See description for charges</shipping>
<incategory category="category317"/>
<incategory category="categ

The function to read characters

char * buffer= new char[bufsize];
input_file.read(buffer,bufsize);
std::string bufstring(buffer);
if (extra != ""){
   bufstring = extra + bufstring;
   extra= "";
}

I would like to know whether it is possible to get back the text that failed to parse and prepend it to the next read from the buffer, since the next read contains the continuation of the unfinished text. I have tried using try and catch to carry the unparsed text over to the next read, but it doesn't seem to work.

    // Prepend whatever was left unparsed from the previous chunk.
    if (extra != ""){
        bufstring = extra + bufstring;
        extra= "";
    }
    // std::cout << bufstring << std::endl << std::endl;
    std::string::const_iterator iter= bufstring.begin();
    std::string::const_iterator end= bufstring.end();
    try{
        bool r= qi::phrase_parse(iter,end,xml_parser,qi::space);
        if (!r){
            std::cout << "Error found" << std::endl;
            extra = std::string(iter,end);
            std::cout << extra << std::endl;
            // NOTE(review): buffer is released only on this path in the
            // excerpt; confirm the other exits release it as well.
            delete[] buffer;
            return;
        }
        if (iter!=end){
            // Carry the unconsumed tail over to the next chunk.
            extra = std::string(iter,end);
            // std::cout << extra << std::endl;
        }
    // The exception is parameterised on the iterator type handed to
    // phrase_parse. The input here is iterated with std::string::const_iterator,
    // so a handler for expectation_failure<char const*> would never match.
    } catch (qi::expectation_failure<std::string::const_iterator> const& e){
        // NOTE(review): how far iter has advanced when the exception propagates
        // depends on Spirit internals; confirm it is a valid resume point.
        std::cout<< std::string(iter,end) << std::endl;
        extra = std::string(iter,end);
    }

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

不乱于心 2025-02-12 15:35:54

不要自己编写 XML 解析器。XML 库本来就提供用于这种场景的流式解析器,它们能替你完成大量工作。如果你感兴趣,我可以去找找我的 xpathreader 实现。

话虽如此,也许你只是想学习 Spirit Qi。那我们就来深入看看。

很多小观察:

  • skipper 是个非常糟糕的名字,因为已经有 Skipper(Qi 的功能)了

  • qi::char_("\t\r\n ") 与 qi::space 非常接近——即便不是完全相同。

  • 既然接下来无论如何都要匹配 name(它本来就不能以 '/' 开头),断言 !lit('/') 就是多余的。

  • char_ - char_(set) 其实就是 ~char_(set)

  • 你的期望点(expectation point)似乎太多了,其中一些没什么意义。参见例如 boost::spirit::qi Expectation Parser and parser grouping unexpected behaviour


  • 属性值可以为空

  • name 是 lexeme,那为什么还要为它声明 skipper?(参见 Boost spirit skipper issues)
  • 你没有定义 print_action。也许它只是

     boost::phoenix::function print_action = [](auto&& arg) { std::cout << arg << "\n"; };
     

    或者功能稍强一些的

     using namespace qi::labels;
     auto print_action = [](auto caption) {
         return std::cout << boost::phoenix::val(caption) << "[" << _1 << "] ";
     };
    

    它之所以能用,主要是因为所有规则……都公开了字符串属性。

  • 说到这一点。

     text = +~qi::char_("<") | '\'' | '\"';
    

    其净效果是:要么接受直到 XML 标签之前的所有文本,要么接受单个引号字符。即使你实际想写的是:

     text  = +(~qi::char_("<'\"") | '\'' | '"');
    

    它也只是接受直到 XML 标签之前的所有文本,却把 ' 和 " 吞掉——出于……什么原因呢?

  • 所有规则都公开字符串属性?!你在解析什么?解析是为了什么?

    这个示例

     </description>
     <shipping>Will ship only within country, See description for charges</shipping>
     <incategory category="category317"/>
     <incategory category="categ
    

    表明你只是在做词法切分(tokenizing)。如果是这样,你可能希望公开输入序列(qi::raw),而且不应该像现在这样丢掉标点之类的词法元素。

  • 从可前向遍历的容器(而不是输入迭代器)进行解析,可以让你完全避免复制源序列(改用 std::string_view 或 boost::iterator_range)。

  • 与其使用定制的 print_action,不如考虑使用内置的语法调试功能:

     BOOST_SPIRIT_DEBUG_NODES((skipper)(text)(prolog)(attribute)(start_tag)(
         end_tag)(empty_tag)(xml_parser)(name)(attribute_value))
     
  • 关于 PEG 语法中规则的顺序,有一个概念性的问题。PEG 是贪婪的、从左到右匹配的,这意味着你可以把例如

     xml_parser =*(//
         text      [ print_action("TEXT")                            ] | //
         start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
         end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
         empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
         prolog | skipper);
    

    重新排序成

     xml_parser =*(//
                   prolog |
         end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
         empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
         start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
         text      [ print_action("TEXT")                            ] | //
         skipper);
    

    (其中 skipper 没有意义,除非你让它能识别 XML 注释)

    现在你还可以把 text 开头的 !lit('<') 去掉。

  • 语义动作本身存在问题(Boost Spirit: "Semantic actions are evil"?),对容器属性还有副作用(Boost::Spirit doubles character when followed by a default value)。问题主要出在语义动作上。另外,我建议通常把“空”标签(它在很多意义上并不空)当作开/闭标签的组合来解析,这样可以避免所有属性被解析两次

  • 在语义动作中更新解析器的成员也让人意外——这违反了 .parse(...) const 的 const 约定,因为语义动作中嵌入了对成员数据的可变引用。


  • 就合法 XML 而言,存在很多正确性问题。你的语法甚至没有尝试验证开/闭标签是否成对匹配。prolog 实际上是一条处理指令(processing instruction)。我没有核实过 ? 是否不能在 ?> 之前单独出现,你核实过吗?

    我认为合法的 XML 元素名不能以 : 开头。这里也没有对命名空间、实体引用、CDATA、PCDATA 的支持。至于 XSD 解析/验证这个大麻烦,我们就更不去碰了。这一条就归到前面说过的“不要自己写 XML 解析器”之下吧。公平地说,现实中许多小型 XML 库也有类似的局限。

  • 期望点。你需要决定希望发生什么。你只想解析合法的 XML 吗?那就需要严格的期望,因为在文本/字符串上下文之外一旦遇到 <,XML 就要求后面是元素标签。

    然而这与解析不完整的输入相矛盾。也许你可以选择“吞掉”所有期望失败:

     qi::on_error(start,
                  boost::phoenix::val(qi::error_handler_result::fail));
    

    或者,你也可以只在到达输入末尾时有选择地处理它们:

     qi::on_error<qi::fail>(
         attribute,
         boost::phoenix::if_(_3 != _2) // at end of input?
             [boost::phoenix::throw_(
                  std::runtime_error("Expectation failure"))]
                 .else_[std::cout << val("[EOF] Expecting ") << _4
                                  << std::endl]);
     

把它们整合成一个演示

在 Coliru 上在线运行

//#define BOOST_SPIRIT_DEBUG
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <iomanip>
namespace qi = boost::spirit::qi;

/// Tokenising grammar for XML-like input, built on Boost.Spirit Qi.
///
/// The start rule accepts any sequence of processing instructions, end tags,
/// empty-element tags, start tags and text runs. Each recognised token is
/// echoed to std::cout by a semantic action and the three tag kinds are
/// counted. An expectation failure inside `attribute` that occurs exactly at
/// the end of the input is reported and downgraded to an ordinary rule
/// failure, so input that stops in the middle of an attribute does not throw.
///
/// @tparam It      iterator type of the input being parsed
/// @tparam Skipper skipper used by the tag-level rules (defaults to qi::space)
template <typename It, typename Skipper = qi::space_type>
struct xmlparser : qi::grammar<It, std::string(), Skipper> {
    xmlparser() : xmlparser::base_type(start) {
        using boost::phoenix::ref;
        using boost::phoenix::val;
        using qi::lit;

        // Placeholder alternative that never matches (eps(false) always
        // fails); whitespace is handled by the Skipper template argument.
        skipper         = qi::eps(false); // qi::char_("\t\r\n ");
        // A text run: one or more characters up to the next '<'.
        text            = +~qi::char_("<");
        // "<?" followed by one or more characters other than '?', then "?>".
        prolog          = "<?" >> +(qi::char_ - '?') >> "?>";
        // First character: letter, ':' or '_'; then any of "-a-zA-Z0-9:_".
        // The rule is declared without a skipper, so no lexeme[] is needed.
        name            = qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_");
        // Quoted value, possibly empty. '>' is an expectation point: once the
        // opening quote has matched, a missing closing quote is an
        // expectation failure rather than an ordinary mismatch.
        attribute_value = //
            '"' > *~qi::char_("<&\"") > '"' |
            '\'' > *~qi::char_("<&'") > '\'';

        //boost::phoenix::function print_action = [](auto&& arg) { std::cout << arg << "\n"; };
        using namespace qi::labels;
        // Builds a lazy Phoenix expression that prints "caption[attribute] "
        // when the semantic action fires (_1 is the matched attribute).
        auto print_action = [](auto caption) {
            return std::cout << boost::phoenix::val(caption) << "[" << _1 << "] ";
        };

        // INIT selects how the rules below are initialised: plain '=' here.
        // Swapping in the commented-out '%=' forces automatic attribute
        // propagation in the presence of semantic actions.
#define INIT = // %=
        attribute INIT name[print_action("ATT")] > '=' >
            attribute_value[print_action("ATT VALUE")];
        start_tag INIT '<' >> !lit('/') >> name >> *attribute >> !lit('/') >> '>';
        end_tag   INIT "</" >> name >> '>';
        empty_tag INIT '<' >> name >> *attribute >> "/>";
        // Alternatives are tried left to right, so the more specific tag
        // forms come before start_tag and text.
        start INIT *(//
                      prolog |
            end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
            empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
            start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
            text      [ print_action("TEXT")                            ] |
            skipper);

        // In an on_error handler _2 is the end of input, _3 the position of
        // the failed expectation and _4 a description of what was expected.
        // A failure before the end of input is a genuine syntax error and is
        // rethrown; a failure exactly at the end means the input was cut off,
        // so it is only reported and the rule fails quietly (qi::fail).
        qi::on_error<qi::fail>(
            attribute,
            boost::phoenix::if_(_3 != _2) // error position is NOT at end of input
                [boost::phoenix::throw_(
                     std::runtime_error("Expectation failure"))]
                    .else_[std::cout << val("[EOF] Expecting ") << _4
                                     << std::endl]);

        // Registers rule names for tracing; only active when
        // BOOST_SPIRIT_DEBUG is defined before the Spirit headers.
        BOOST_SPIRIT_DEBUG_NODES((skipper)(text)(prolog)(attribute)(start_tag)(
            end_tag)(empty_tag)(start)(name)(attribute_value))

    }

    // Counter accessors. They are const so they can be called on a const
    // grammar object, matching report() below.
    int get_empty_tag_count() const { return empty_tag_count; }
    int get_open_tag_count()  const { return open_tag_count;  }
    int get_end_tag_count()   const { return end_tag_count;   }

    /// Writes all four counters to @p os, one per line.
    void report(std::ostream& os) const {
        os << "open_tag_count  : " << open_tag_count  << std::endl;
        os << "end_tag_count   : " << end_tag_count   << std::endl;
        os << "empty_tag_count : " << empty_tag_count << std::endl;
        os << "text_count      : " << text_count      << std::endl;
    }

  private:

    // mutable: the semantic actions hold references to these counters and
    // update them while parsing through a const grammar object.
    // text_count is reported but never incremented by this grammar.
    int mutable open_tag_count  = 0;
    int mutable end_tag_count   = 0;
    int mutable empty_tag_count = 0;
    int mutable text_count      = 0;

    // Tag-level rules: whitespace between their elements is skipped.
    qi::rule<It, std::string(), Skipper> start, //
        attribute, start_tag, end_tag, empty_tag;

    qi::rule<It, std::string()> // lexemes
        prolog, text, name, attribute_value;
    qi::rule<It> skipper;
};

int main() {
    xmlparser<boost::spirit::istream_iterator> const p;
    std::istringstream iss(R"(
        </description>
        <shipping>Will ship only within country, See description for charges</shipping>
        <incategory category="category317"/>
        <incategory category="categ)");

    std::cout << iss.str() << "\n--------------------\n";

    for (std::string output;
         iss >> std::noskipws >> qi::phrase_match(p, qi::space, output);
         output.clear()) //
    {
        std::cout << "\n -- Output: " << std::quoted(output) << "\n";
    }

    p.report(std::cout << "\n");

    iss.clear();
    std::cout << "\n -- Remaining: " << iss.rdbuf() << std::endl;
}

打印

        </description>
        <shipping>Will ship only within country, See description for charges</shipping>
        <incategory category="category317"/>
        <incategory category="categ
--------------------
END[description] OPEN[shipping] TEXT[Will ship only within country, See description for charges] END[shipping] ATT[category] ATT VALUE[category317] EMPTY[incategory] ATT[category] [EOF] Expecting """
ATT[category] [EOF] Expecting """

 -- Output: ""

open_tag_count  : 1
end_tag_count   : 2
empty_tag_count : 1
text_count      : 0

 -- Remaining: 

另外,如果启用了调试输出,则会得到:

<start>
  <try>\n        </descripti</try>
  <prolog>
    <try></description>\n     </try>
    <fail/>
  </prolog>
  <end_tag>
    <try></description>\n     </try>
    <name>
      <try>description>\n       </try>
      <success>>\n        <shipping></success>
      <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
    </name>
    <success>\n        <shipping>W</success>
    <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
  </end_tag>
  <prolog>
    <try><shipping>Will ship </try>
    <fail/>
  </prolog>
  <end_tag>
    <try><shipping>Will ship </try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><shipping>Will ship </try>
    <name>
      <try>shipping>Will ship o</try>
      <success>>Will ship only with</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <attribute>
      <try>>Will ship only with</try>
      <name>
        <try>>Will ship only with</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <fail/>
  </empty_tag>
  <start_tag>
    <try><shipping>Will ship </try>
    <name>
      <try>shipping>Will ship o</try>
      <success>>Will ship only with</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <attribute>
      <try>>Will ship only with</try>
      <name>
        <try>>Will ship only with</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <success>Will ship only withi</success>
    <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
  </start_tag>
  <prolog>
    <try>Will ship only withi</try>
    <fail/>
  </prolog>
  <end_tag>
    <try>Will ship only withi</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try>Will ship only withi</try>
    <fail/>
  </empty_tag>
  <start_tag>
    <try>Will ship only withi</try>
    <fail/>
  </start_tag>
  <text>
    <try>Will ship only withi</try>
    <success></shipping>\n        </success>
    <attributes>[[W, i, l, l,  , s, h, i, p,  , o, n, l, y,  , w, i, t, h, i, n,  , c, o, u, n, t, r, y, ,,  , S, e, e,  , d, e, s, c, r, i, p, t, i, o, n,  , f, o, r,  , c, h, a, r, g, e, s]]</attributes>
  </text>
  <prolog>
    <try></shipping>\n        </try>
    <fail/>
  </prolog>
  <end_tag>
    <try></shipping>\n        </try>
    <name>
      <try>shipping>\n        <i</try>
      <success>>\n        <incategor</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <success>\n        <incategory</success>
    <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
  </end_tag>
  <prolog>
    <try><incategory category</try>
    <fail/>
  </prolog>
  <end_tag>
    <try><incategory category</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="category3</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="category3</try>
      <name>
        <try>category="category31</try>
        <success>="category317"/>\n   </success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"category317"/>\n    </try>
        <success>/>\n        <incatego</success>
        <attributes>[[c, a, t, e, g, o, r, y, 3, 1, 7]]</attributes>
      </attribute_value>
      <success>/>\n        <incatego</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </attribute>
    <attribute>
      <try>/>\n        <incatego</try>
      <name>
        <try>/>\n        <incatego</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <success>\n        <incategory</success>
    <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
  </empty_tag>
  <prolog>
    <try><incategory category</try>
    <fail/>
  </prolog>
  <end_tag>
    <try><incategory category</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="categ</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="categ</try>
      <name>
        <try>category="categ</try>
        <success>="categ</success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"categ</try>
        <fail/>
      </attribute_value>
      <fail/>
    </attribute>
    <fail/>
  </empty_tag>
  <start_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="categ</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="categ</try>
      <name>
        <try>category="categ</try>
        <success>="categ</success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"categ</try>
        <fail/>
      </attribute_value>
      <fail/>
    </attribute>
    <fail/>
  </start_tag>
  <text>
    <try><incategory category</try>
    <fail/>
  </text>
  <skipper>
    <try><incategory category</try>
    <fail/>
  </skipper>
  <success><incategory category</success>
  <attributes>[[]]</attributes>
</start>
<start>
  <try></try>
  <prolog>
    <try></try>
    <fail/>
  </prolog>
  <end_tag>
    <try></try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try></try>
    <fail/>
  </empty_tag>
  <start_tag>
    <try></try>
    <fail/>
  </start_tag>
  <text>
    <try></try>
    <fail/>
  </text>
  <skipper>
    <try></try>
    <fail/>
  </skipper>
  <success></success>
  <attributes>[[]]</attributes>
</start>

ps

如果你 #define INIT %=,就会看到这个问题出现在每一个公开 std::string() 属性的规则上:

END[description] OPEN[descriptionshippingshipping] TEXT[descriptionshippingshippingWill ship only within country, See description for charges] END[descriptionshippingshippingWill ship only within country, See description for chargesshipping] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategory] ATT VALUE[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] EMPTY[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategory] [EOF] Expecting """
ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategory] [EOF] Expecting """

 -- Output: "descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategorycateg"

Don't roll your own XML parser. XML libraries have stream parsers for this purpose. They do a lot of the work for you. I could go look for my xpathreader implementation if you're interested.

That said, maybe you're just trying to learn Spirit Qi. Let's dig in.

Lots of small observations:

  • skipper seems like a Very Bad name since you already have Skipper (the Qi feature)

  • qi::char_("\t\r\n ") is very close to qi::space - if not identical.

  • asserting !lit('/') is useless when you expect name anyways (which cannot legally start with '/').

  • char_ - char_(set) is just ~char_(set)

  • You do seem to have too many expectation points. Some of them make little sense. See e.g. boost::spirit::qi Expectation Parser and parser grouping unexpected behaviour

  • Attribute values can be empty

  • Names are lexeme, so why declare a skipper? (see Boost spirit skipper issues)

  • You didn't define print_action. Perhaps it is simply

     boost::phoenix::function print_action = [](auto&& arg) { std::cout << arg << "\n"; };
    

    Or a slightly more functional

     using namespace qi::labels;
     auto print_action = [](auto caption) {
         return std::cout << boost::phoenix::val(caption) << "[" << _1 << "] ";
     };
    

    Which mainly works because all rules ... expose a string attribute.

  • About that.

     text = +~qi::char_("<") | '\'' | '\"';
    

    Has the net effect of accepting all text until an xml tag, or a single quote character. Even if you actually meant:

     text  = +(~qi::char_("<'\"") | '\'' | '"');
    

    it would be accepting all text until an xml tag but swallowing ' and " for... reasons?

  • All rules expose a string attribute?! What are you parsing? What are you parsing for?

    The example

     </description>
     <shipping>Will ship only within country, See description for charges</shipping>
     <incategory category="category317"/>
     <incategory category="categ
    

    Suggests that you're merely tokenizing. If so, you probably want to expose the input sequences (qi::raw), and you wouldn't want to drop lexical items such as punctuation the way you're doing now.

  • Parsing from a forward-traversable container instead of input iterator would allow you to avoid copying the source sequences at all (either using a std::string_view or boost::iterator_range instead).

  • Instead of a bespoke print_action consider using the builtin grammar debugging:

     BOOST_SPIRIT_DEBUG_NODES((skipper)(text)(prolog)(attribute)(start_tag)(
         end_tag)(empty_tag)(xml_parser)(name)(attribute_value))
    
  • There's a conceptual point about the ordering of rules in PEG grammars. PEG being greedy and left-to-right means that you can reorder e.g.

     xml_parser =*(//
         text      [ print_action("TEXT")                            ] | //
         start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
         end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
         empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
         prolog | skipper);
    

    To be more like

     xml_parser =*(//
                   prolog |
         end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
         empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
         start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
         text      [ print_action("TEXT")                            ] | //
         skipper);
    

    (Where skipper doesn't make sense unless you were to make it XML comment-aware)

    Now you can drop the !lit('<') from the start of text as well.

  • There's a problem with semantic actions (Boost Spirit: "Semantic actions are evil"?) and side-effects on container attributes (Boost::Spirit doubles character when followed by a default value). The problem mainly being semantic actions. Also, I suggest that you would normally parse an "empty" tag (which isn't empty in many senses) as an open/close combo. This prevents all attributes being parsed doubly

  • Updating parser members from semantic actions is also a surprise - violating the const contract of .parse(...) const because the semantic actions embed mutable references to member data.

  • There are a lot of correctness issues with respect to valid XML. Your grammar doesn't even try to validate matching pairs of open/close tags. prolog is actually a processing instruction. I haven't checked that ? cannot occur on its own before ?>, have you?

    I don't think valid XML elements can start with :. There's no provision for namespaces, entity references, CDATA, PCDATA. And we'll not even open the can of worms that is XSD resolution/validation. This bullet will just be forgotten under the already mentioned "don't write your own XML parser". In fairness, many smaller XML libraries in the wild also have limitations like these.

  • The expectation points. You need to decide what you want to happen. Do you want to parse only valid XML? Then you need strict expectations, as XML requires element tags as soon as < is encountered outside text/string context.

    However, this is at odds with parsing incomplete input. Maybe you can optionally "swallow" all expectations:

     qi::on_error(start,
                  boost::phoenix::val(qi::error_handler_result::fail));
    

    Alternatively you could selectively deal with them when at end of input:

     qi::on_error<qi::fail>(
         attribute,
         boost::phoenix::if_(_3 != _2) // at end of input?
             [boost::phoenix::throw_(
                  std::runtime_error("Expectation failure"))]
                 .else_[std::cout << val("[EOF] Expecting ") << _4
                                  << std::endl]);
    

Putting Together A Demo

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/qi_match.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
#include <iomanip>
namespace qi = boost::spirit::qi;

/// Tokenising grammar for XML-like input, built on Boost.Spirit Qi.
///
/// The start rule accepts any sequence of processing instructions, end tags,
/// empty-element tags, start tags and text runs. Each recognised token is
/// echoed to std::cout by a semantic action and the three tag kinds are
/// counted. An expectation failure inside `attribute` that occurs exactly at
/// the end of the input is reported and downgraded to an ordinary rule
/// failure, so input that stops in the middle of an attribute does not throw.
///
/// @tparam It      iterator type of the input being parsed
/// @tparam Skipper skipper used by the tag-level rules (defaults to qi::space)
template <typename It, typename Skipper = qi::space_type>
struct xmlparser : qi::grammar<It, std::string(), Skipper> {
    xmlparser() : xmlparser::base_type(start) {
        using boost::phoenix::ref;
        using boost::phoenix::val;
        using qi::lit;

        // Placeholder alternative that never matches (eps(false) always
        // fails); whitespace is handled by the Skipper template argument.
        skipper         = qi::eps(false); // qi::char_("\t\r\n ");
        // A text run: one or more characters up to the next '<'.
        text            = +~qi::char_("<");
        // "<?" followed by one or more characters other than '?', then "?>".
        prolog          = "<?" >> +(qi::char_ - '?') >> "?>";
        // First character: letter, ':' or '_'; then any of "-a-zA-Z0-9:_".
        // The rule is declared without a skipper, so no lexeme[] is needed.
        name            = qi::char_("a-zA-Z:_") >> *qi::char_("-a-zA-Z0-9:_");
        // Quoted value, possibly empty. '>' is an expectation point: once the
        // opening quote has matched, a missing closing quote is an
        // expectation failure rather than an ordinary mismatch.
        attribute_value = //
            '"' > *~qi::char_("<&\"") > '"' |
            '\'' > *~qi::char_("<&'") > '\'';

        //boost::phoenix::function print_action = [](auto&& arg) { std::cout << arg << "\n"; };
        using namespace qi::labels;
        // Builds a lazy Phoenix expression that prints "caption[attribute] "
        // when the semantic action fires (_1 is the matched attribute).
        auto print_action = [](auto caption) {
            return std::cout << boost::phoenix::val(caption) << "[" << _1 << "] ";
        };

        // INIT selects how the rules below are initialised: plain '=' here.
        // Swapping in the commented-out '%=' forces automatic attribute
        // propagation in the presence of semantic actions.
#define INIT = // %=
        attribute INIT name[print_action("ATT")] > '=' >
            attribute_value[print_action("ATT VALUE")];
        start_tag INIT '<' >> !lit('/') >> name >> *attribute >> !lit('/') >> '>';
        end_tag   INIT "</" >> name >> '>';
        empty_tag INIT '<' >> name >> *attribute >> "/>";
        // Alternatives are tried left to right, so the more specific tag
        // forms come before start_tag and text.
        start INIT *(//
                      prolog |
            end_tag   [ (++ref(end_tag_count), print_action("END"))     ] |
            empty_tag [ (++ref(empty_tag_count), print_action("EMPTY")) ] |
            start_tag [ (++ref(open_tag_count), print_action("OPEN"))   ] |
            text      [ print_action("TEXT")                            ] |
            skipper);

        // In an on_error handler _2 is the end of input, _3 the position of
        // the failed expectation and _4 a description of what was expected.
        // A failure before the end of input is a genuine syntax error and is
        // rethrown; a failure exactly at the end means the input was cut off,
        // so it is only reported and the rule fails quietly (qi::fail).
        qi::on_error<qi::fail>(
            attribute,
            boost::phoenix::if_(_3 != _2) // error position is NOT at end of input
                [boost::phoenix::throw_(
                     std::runtime_error("Expectation failure"))]
                    .else_[std::cout << val("[EOF] Expecting ") << _4
                                     << std::endl]);

        // Registers rule names for tracing; only active when
        // BOOST_SPIRIT_DEBUG is defined before the Spirit headers.
        BOOST_SPIRIT_DEBUG_NODES((skipper)(text)(prolog)(attribute)(start_tag)(
            end_tag)(empty_tag)(start)(name)(attribute_value))

    }

    // Counter accessors. They are const so they can be called on a const
    // grammar object, matching report() below.
    int get_empty_tag_count() const { return empty_tag_count; }
    int get_open_tag_count()  const { return open_tag_count;  }
    int get_end_tag_count()   const { return end_tag_count;   }

    /// Writes all four counters to @p os, one per line.
    void report(std::ostream& os) const {
        os << "open_tag_count  : " << open_tag_count  << std::endl;
        os << "end_tag_count   : " << end_tag_count   << std::endl;
        os << "empty_tag_count : " << empty_tag_count << std::endl;
        os << "text_count      : " << text_count      << std::endl;
    }

  private:

    // mutable: the semantic actions hold references to these counters and
    // update them while parsing through a const grammar object.
    // text_count is reported but never incremented by this grammar.
    int mutable open_tag_count  = 0;
    int mutable end_tag_count   = 0;
    int mutable empty_tag_count = 0;
    int mutable text_count      = 0;

    // Tag-level rules: whitespace between their elements is skipped.
    qi::rule<It, std::string(), Skipper> start, //
        attribute, start_tag, end_tag, empty_tag;

    qi::rule<It, std::string()> // lexemes
        prolog, text, name, attribute_value;
    qi::rule<It> skipper;
};

int main() {
    xmlparser<boost::spirit::istream_iterator> const p;
    std::istringstream iss(R"(
        </description>
        <shipping>Will ship only within country, See description for charges</shipping>
        <incategory category="category317"/>
        <incategory category="categ)");

    std::cout << iss.str() << "\n--------------------\n";

    for (std::string output;
         iss >> std::noskipws >> qi::phrase_match(p, qi::space, output);
         output.clear()) //
    {
        std::cout << "\n -- Output: " << std::quoted(output) << "\n";
    }

    p.report(std::cout << "\n");

    iss.clear();
    std::cout << "\n -- Remaining: " << iss.rdbuf() << std::endl;
}

Prints

        </description>
        <shipping>Will ship only within country, See description for charges</shipping>
        <incategory category="category317"/>
        <incategory category="categ
--------------------
END[description] OPEN[shipping] TEXT[Will ship only within country, See description for charges] END[shipping] ATT[category] ATT VALUE[category317] EMPTY[incategory] ATT[category] [EOF] Expecting """
ATT[category] [EOF] Expecting """

 -- Output: ""

open_tag_count  : 1
end_tag_count   : 2
empty_tag_count : 1
text_count      : 0

 -- Remaining: 

And, if enabled, the debug output:

<start>
  <try>\n        </descripti</try>
  <prolog>
    <try></description>\n     </try>
    <fail/>
  </prolog>
  <end_tag>
    <try></description>\n     </try>
    <name>
      <try>description>\n       </try>
      <success>>\n        <shipping></success>
      <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
    </name>
    <success>\n        <shipping>W</success>
    <attributes>[[d, e, s, c, r, i, p, t, i, o, n]]</attributes>
  </end_tag>
  <prolog>
    <try><shipping>Will ship </try>
    <fail/>
  </prolog>
  <end_tag>
    <try><shipping>Will ship </try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><shipping>Will ship </try>
    <name>
      <try>shipping>Will ship o</try>
      <success>>Will ship only with</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <attribute>
      <try>>Will ship only with</try>
      <name>
        <try>>Will ship only with</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <fail/>
  </empty_tag>
  <start_tag>
    <try><shipping>Will ship </try>
    <name>
      <try>shipping>Will ship o</try>
      <success>>Will ship only with</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <attribute>
      <try>>Will ship only with</try>
      <name>
        <try>>Will ship only with</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <success>Will ship only withi</success>
    <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
  </start_tag>
  <prolog>
    <try>Will ship only withi</try>
    <fail/>
  </prolog>
  <end_tag>
    <try>Will ship only withi</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try>Will ship only withi</try>
    <fail/>
  </empty_tag>
  <start_tag>
    <try>Will ship only withi</try>
    <fail/>
  </start_tag>
  <text>
    <try>Will ship only withi</try>
    <success></shipping>\n        </success>
    <attributes>[[W, i, l, l,  , s, h, i, p,  , o, n, l, y,  , w, i, t, h, i, n,  , c, o, u, n, t, r, y, ,,  , S, e, e,  , d, e, s, c, r, i, p, t, i, o, n,  , f, o, r,  , c, h, a, r, g, e, s]]</attributes>
  </text>
  <prolog>
    <try></shipping>\n        </try>
    <fail/>
  </prolog>
  <end_tag>
    <try></shipping>\n        </try>
    <name>
      <try>shipping>\n        <i</try>
      <success>>\n        <incategor</success>
      <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
    </name>
    <success>\n        <incategory</success>
    <attributes>[[s, h, i, p, p, i, n, g]]</attributes>
  </end_tag>
  <prolog>
    <try><incategory category</try>
    <fail/>
  </prolog>
  <end_tag>
    <try><incategory category</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="category3</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="category3</try>
      <name>
        <try>category="category31</try>
        <success>="category317"/>\n   </success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"category317"/>\n    </try>
        <success>/>\n        <incatego</success>
        <attributes>[[c, a, t, e, g, o, r, y, 3, 1, 7]]</attributes>
      </attribute_value>
      <success>/>\n        <incatego</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </attribute>
    <attribute>
      <try>/>\n        <incatego</try>
      <name>
        <try>/>\n        <incatego</try>
        <fail/>
      </name>
      <fail/>
    </attribute>
    <success>\n        <incategory</success>
    <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
  </empty_tag>
  <prolog>
    <try><incategory category</try>
    <fail/>
  </prolog>
  <end_tag>
    <try><incategory category</try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="categ</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="categ</try>
      <name>
        <try>category="categ</try>
        <success>="categ</success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"categ</try>
        <fail/>
      </attribute_value>
      <fail/>
    </attribute>
    <fail/>
  </empty_tag>
  <start_tag>
    <try><incategory category</try>
    <name>
      <try>incategory category=</try>
      <success> category="categ</success>
      <attributes>[[i, n, c, a, t, e, g, o, r, y]]</attributes>
    </name>
    <attribute>
      <try> category="categ</try>
      <name>
        <try>category="categ</try>
        <success>="categ</success>
        <attributes>[[c, a, t, e, g, o, r, y]]</attributes>
      </name>
      <attribute_value>
        <try>"categ</try>
        <fail/>
      </attribute_value>
      <fail/>
    </attribute>
    <fail/>
  </start_tag>
  <text>
    <try><incategory category</try>
    <fail/>
  </text>
  <skipper>
    <try><incategory category</try>
    <fail/>
  </skipper>
  <success><incategory category</success>
  <attributes>[[]]</attributes>
</start>
<start>
  <try></try>
  <prolog>
    <try></try>
    <fail/>
  </prolog>
  <end_tag>
    <try></try>
    <fail/>
  </end_tag>
  <empty_tag>
    <try></try>
    <fail/>
  </empty_tag>
  <start_tag>
    <try></try>
    <fail/>
  </start_tag>
  <text>
    <try></try>
    <fail/>
  </text>
  <skipper>
    <try></try>
    <fail/>
  </skipper>
  <success></success>
  <attributes>[[]]</attributes>
</start>

PS

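If you `#define INIT %=` instead, you will see the problem with exposing a `std::string()` attribute on every rule: with automatic attribute propagation each rule appends to the same string, so the text accumulates from one match to the next: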

END[description] OPEN[descriptionshippingshipping] TEXT[descriptionshippingshippingWill ship only within country, See description for charges] END[descriptionshippingshippingWill ship only within country, See description for chargesshipping] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategory] ATT VALUE[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] EMPTY[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317] ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategory] [EOF] Expecting """
ATT[descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategory] [EOF] Expecting """

 -- Output: "descriptionshippingshippingWill ship only within country, See description for chargesshippingincategorycategorycategory317incategorycategorycategincategorycategorycateg"
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文