更新解析器以允许带引号的字符串中包含括号

发布于 2025-01-19 14:47:07 字数 4185 浏览 4 评论 0原文

我需要更新解析器以接受这些新功能，但我无法一次管理所有这些新功能：

命令必须接受不确定数量的参数（> 0）。
参数可以是数字、不带引号的字符串或带引号的字符串。
参数之间用逗号分隔。
在带引号的字符串中，应允许使用左/右括号。

（查看源代码示例更容易理解这些要求）

我当前的代码（包括检查）如下：

Godbolt 链接： https://godbolt.org/z/5d6o53n9h

#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>

// Data model produced by the parser.
namespace script
{
    // One parsed script line: the kind of line plus its arguments as text.
    struct Command
    {
        // Kind of line. NONE is also the default value of `type`.
        enum Type { NONE, WRITE_LOG, INSERT_LABEL, START_PROCESS, END_PROCESS, COMMENT, FAIL };

        Type type{ Type::NONE };
        std::vector<std::string> args;// every argument is stored as a string
    };

    // A parsed script: a sequence of Command values.
    using Commands = std::vector<Command>;
}//namespace script

BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script
{
    namespace qi = boost::spirit::qi;

    // Qi grammar that turns a whole script into a Commands vector.
    // Each line is matched by the first of none | command | comment | fail that accepts it.
    template <typename It>
    class Parser : public qi::grammar<It, Commands()>
    {
    private:
        // Keyword -> Command::Type lookup table, filled in the constructor.
        qi::symbols<char, Command::Type> type;
        // Line-level rules; they run with a blank skipper.
        qi::rule<It, Command(), qi::blank_type> none, command, comment, fail;//By its very nature "fail" must be the last one to be checked
        // Entry rule: declared without a skipper; it installs the blank skipper itself via skip(blank)[...].
        qi::rule<It, Commands()> start;

    public:
        Parser() : Parser::base_type(start)
        {
            using namespace qi;//NOTE: "as_string" is necessary in all args due to std::vector<std::string>
            // Parser that consumes nothing and yields an empty argument list.
            auto empty_args = copy(attr(std::vector<std::string>{}));

            type.add
                ("WriteLog", Command::WRITE_LOG)
                ("InsertLabel", Command::INSERT_LABEL)
                ("StartProcess", Command::START_PROCESS)
                ("EndProcess", Command::END_PROCESS);

            // Blank line: optional blanks followed (without consuming it) by end of line or end of input.
            none = omit[*blank] >> &(eol | eoi)
                >> attr(Command::NONE)
                >> empty_args;//ignore args

            // Keyword(arg, arg, ...). An argument is one or more characters other than
            // '(', ')', ',', CR and LF, so a double quote is just an ordinary character
            // and parentheses are rejected even between quotes.
            command = type >> '('
                >> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';

            // "//" comment: the rest of the line becomes the single argument.
            comment = lit("//")
                >> attr(Command::COMMENT)
                >> as_string[lexeme[*~char_("\r\n")]];

            // Fallback: swallow the rest of the line and tag it FAIL.
            fail = omit[*~char_("\r\n")]
                >> attr(Command::FAIL)
                >> empty_args;//ignore args

            // Lines are separated by eol and the whole input must be consumed.
            start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
        }
    };

    // Reads the whole stream and parses it into Commands.
    // Throws std::runtime_error("command parse error") when the grammar rejects the input.
    Commands parse(std::istream& in)
    {
        using Iterator = boost::spirit::istream_iterator;
        static const Parser<Iterator> grammar;

        // The grammar handles whitespace itself, so the stream must not skip it.
        in >> std::noskipws;
        Iterator cursor(in);
        Iterator const stop;

        Commands result;
        const bool ok = qi::parse(cursor, stop, grammar, result);
        if (ok)
            return result;

        throw std::runtime_error("command parse error");
    }
}//namespace script

std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};

// Parses the sample script and reports whether the number of commands, their
// types and their argument counts match the expected values.
int main()
{
    using namespace script;

    try
    {
        auto const commands = script::parse(ss);

        // Marker for lines whose argument count is irrelevant (FAIL lines).
        constexpr int any_args = -1;
        std::array const args{ 0, 0, 1, 1, any_args, 0, 3, any_args, 0 };
        std::array const types{ Command::COMMENT, Command::NONE, Command::WRITE_LOG, Command::WRITE_LOG, Command::FAIL, Command::NONE, Command::START_PROCESS, Command::FAIL, Command::NONE };

        auto const same_type = [](auto& cmd, auto& type) { return cmd.type == type; };
        // Test the wildcard first so that -1 is never converted to an unsigned size.
        auto const same_arity = [any_args](auto& cmd, auto arg) {
            return arg == any_args || cmd.args.size() == static_cast<std::size_t>(arg);
        };

        std::cout << std::boolalpha << "size correct? " << (commands.size() == types.size()) << '\n';
        std::cout << "types correct? " << std::equal(commands.begin(), commands.end(), types.begin(), types.end(), same_type) << '\n';
        std::cout << "arguments correct? " << std::equal(commands.begin(), commands.end(), args.begin(), args.end(), same_arity) << '\n';
    }
    catch (std::exception const& e)
    {
        std::cout << e.what() << "\n";
    }
}

任何有关此问题的帮助将不胜感激。

原文

I need to update a parser to support these new features, but I am not able to handle all of them at once:

The commands must admit an indeterminate number of parameters (> 0).
Parameters might be numbers, unquoted strings or quoted strings.
Parameters are separated by commas.
Within quoted strings, it shall be permitted to use opening/closing parenthesis.

(It is easier to understand these requirements by looking at the source code example.)

My current code, including checks, is as follows:

Godbolt link: https://godbolt.org/z/5d6o53n9h

#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>

// Data model produced by the parser.
namespace script
{
    // One parsed script line: the kind of line plus its arguments as text.
    struct Command
    {
        // Kind of line. NONE is also the default value of `type`.
        enum Type { NONE, WRITE_LOG, INSERT_LABEL, START_PROCESS, END_PROCESS, COMMENT, FAIL };

        Type type{ Type::NONE };
        std::vector<std::string> args;// every argument is stored as a string
    };

    // A parsed script: a sequence of Command values.
    using Commands = std::vector<Command>;
}//namespace script

BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script
{
    namespace qi = boost::spirit::qi;

    // Qi grammar that turns a whole script into a Commands vector.
    // Each line is matched by the first of none | command | comment | fail that accepts it.
    template <typename It>
    class Parser : public qi::grammar<It, Commands()>
    {
    private:
        // Keyword -> Command::Type lookup table, filled in the constructor.
        qi::symbols<char, Command::Type> type;
        // Line-level rules; they run with a blank skipper.
        qi::rule<It, Command(), qi::blank_type> none, command, comment, fail;//By its very nature "fail" must be the last one to be checked
        // Entry rule: declared without a skipper; it installs the blank skipper itself via skip(blank)[...].
        qi::rule<It, Commands()> start;

    public:
        Parser() : Parser::base_type(start)
        {
            using namespace qi;//NOTE: "as_string" is necessary in all args due to std::vector<std::string>
            // Parser that consumes nothing and yields an empty argument list.
            auto empty_args = copy(attr(std::vector<std::string>{}));

            type.add
                ("WriteLog", Command::WRITE_LOG)
                ("InsertLabel", Command::INSERT_LABEL)
                ("StartProcess", Command::START_PROCESS)
                ("EndProcess", Command::END_PROCESS);

            // Blank line: optional blanks followed (without consuming it) by end of line or end of input.
            none = omit[*blank] >> &(eol | eoi)
                >> attr(Command::NONE)
                >> empty_args;//ignore args

            // Keyword(arg, arg, ...). An argument is one or more characters other than
            // '(', ')', ',', CR and LF, so a double quote is just an ordinary character
            // and parentheses are rejected even between quotes.
            command = type >> '('
                >> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';

            // "//" comment: the rest of the line becomes the single argument.
            comment = lit("//")
                >> attr(Command::COMMENT)
                >> as_string[lexeme[*~char_("\r\n")]];

            // Fallback: swallow the rest of the line and tag it FAIL.
            fail = omit[*~char_("\r\n")]
                >> attr(Command::FAIL)
                >> empty_args;//ignore args

            // Lines are separated by eol and the whole input must be consumed.
            start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
        }
    };

    // Reads the whole stream and parses it into Commands.
    // Throws std::runtime_error("command parse error") when the grammar rejects the input.
    Commands parse(std::istream& in)
    {
        using Iterator = boost::spirit::istream_iterator;
        static const Parser<Iterator> grammar;

        // The grammar handles whitespace itself, so the stream must not skip it.
        in >> std::noskipws;
        Iterator cursor(in);
        Iterator const stop;

        Commands result;
        const bool ok = qi::parse(cursor, stop, grammar, result);
        if (ok)
            return result;

        throw std::runtime_error("command parse error");
    }
}//namespace script

std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};

// Parses the sample script and reports whether the number of commands, their
// types and their argument counts match the expected values.
int main()
{
    using namespace script;

    try
    {
        auto const commands = script::parse(ss);

        // Marker for lines whose argument count is irrelevant (FAIL lines).
        constexpr int any_args = -1;
        std::array const args{ 0, 0, 1, 1, any_args, 0, 3, any_args, 0 };
        std::array const types{ Command::COMMENT, Command::NONE, Command::WRITE_LOG, Command::WRITE_LOG, Command::FAIL, Command::NONE, Command::START_PROCESS, Command::FAIL, Command::NONE };

        auto const same_type = [](auto& cmd, auto& type) { return cmd.type == type; };
        // Test the wildcard first so that -1 is never converted to an unsigned size.
        auto const same_arity = [any_args](auto& cmd, auto arg) {
            return arg == any_args || cmd.args.size() == static_cast<std::size_t>(arg);
        };

        std::cout << std::boolalpha << "size correct? " << (commands.size() == types.size()) << '\n';
        std::cout << "types correct? " << std::equal(commands.begin(), commands.end(), types.begin(), types.end(), same_type) << '\n';
        std::cout << "arguments correct? " << std::equal(commands.begin(), commands.end(), args.begin(), args.end(), same_arity) << '\n';
    }
    catch (std::exception const& e)
    {
        std::cout << e.what() << "\n";
    }
}

Any help with this will be appreciated.

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

笑红尘 2025-01-26 14:47:07

您说您希望在带引号的字符串中允许使用括号。但你甚至不支持带引号的字符串！

所以问题在于你的 argument（参数）规则，而它甚至并不存在。它大致是这一部分：

argument = +~char_("(),\r\n");
command = type >> '(' >> argument % ',' >> ')';

其中 argument 可能被声明为

qi::rule<It, Argument()> argument;

事实上，以有组织的方式重写测试，这就是我们现在得到的：

Live On Compiler Explorer

static const Commands expected{
    {Command::COMMENT, {"just a comment"}},
    {Command::NONE, {}},
    {Command::WRITE_LOG, {"this is a log"}},
    {Command::WRITE_LOG, {"this is also (in another way) a log"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
    {Command::START_PROCESS, {"17", "program.exe", "True"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
};

try {
    auto parsed = script::parse(ss);
    fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
               (parsed == expected), parsed.size(), expected.size());

    for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
        if (expected[i] != parsed[i]) {
            fmt::print("index #{} expected {}\n"
                       "          actual:  {}\n",
                       i, expected[i], parsed[i]);
        } else {
            fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
        }
    }
} catch (std::exception const& e) {
    fmt::print("Exception: {}\n", e.what());
}

打印

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 expected Command(WRITE_LOG, ["this is a log"])
          actual:  Command(WRITE_LOG, ["\"this is a log\""])
index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
          actual:  Command(FAIL, [])
index #4 expected Command(FAIL, [])
          actual:  Command(WRITE_LOG, ["\"but this is just a fail"])
index #5 CORRECT (Command(NONE, []))
index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
          actual:  Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

如您所见，按照我的预期，带引号的字符串同样没有通过。这是因为引号是一种语言构造。在 AST（解析结果）中，您并不关心它在代码中究竟是如何书写的。例如，"hello\ world\041" 可能等价于 "hello world!"，因此两者都应产生参数值 hello world!。

因此，让我们按照我们所说的去做：

argument = quoted_string | number | boolean | raw_string;

我们可以添加一些规则：

// notice these are lexemes (no internal skipping):
qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;

并定义它们：

quoted_string = '"' >> *~char_('"') >> '"';
number        = raw[double_];
boolean       = raw[bool_];
raw_string    = +~char_("(),\r\n");
argument      = quoted_string | number | boolean | raw_string;

如果您想允许转义引号，可以使用类似下面的写法：
 quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';

现在，我想说您可能希望 Argument 类似于 variant<double, std::string, bool>，而不仅仅是 std::string。

仅通过此更改，所有问题都实际上消失了：Live On Compiler Explorer：

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

现在，索引 #7 看起来非常奇怪，但这实际上是 Spirit 中一个众所周知的现象¹。启用 BOOST_SPIRIT_DEBUG 可以演示这一点：

  <argument>
    <try>"this_is_a_fail.exe,</try>
    <quoted_string>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </quoted_string>
    <number>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </number>
    <boolean>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </boolean>
    <raw_string>
      <try>"this_is_a_fail.exe,</try>
      <success>, True)</success>
      <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
    </raw_string>
    <success>, True)</success>
    <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
  </argument>

因此，该字符串被接受为原始字符串，即使它以 " 开头。这很容易修复，但我们甚至不需要。我们可以直接应用qi::hold 以避免重复：

argument = qi::hold[quoted_string] | number | boolean | raw_string;

结果：

actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])

但是，如果您预计它会失败，请修复其他问题：

raw_string    = +~char_("\"(),\r\n"); // note the \"

注意：万一您真的只要求它不以引号开头，可以这样写：
raw_string    = !lit('"') >> +~char_("(),\r\n");
我想现在您已经看到这种“宽松规则”的问题了，所以我不推荐这样做。
不过，您可以用另一种方式表达这一要求，即“如果参数以 '"' 开头，那么它必须是 quoted_string”。
在那里使用一个期望点（expectation point）：
quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';
其效果是：如果无法解析出完整的 quoted_string，
将抛出 expectation_failed 异常。

摘要/清单

这就是我们最终得到的结果：

Live On Compiler Explorer

//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fmt/ranges.h>

// Data model produced by the parser.
namespace script {
    // Arguments are kept as plain text, whatever their syntactic form was.
    using Argument = std::string;
    using Arguments = std::vector<Argument>;

    // One parsed script line: the kind of line plus its arguments.
    struct Command {
        // Kind of line. NONE is also the default value of `type`.
        enum Type {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type      type{Type::NONE};
        Arguments args;

        // Member-wise comparison, so parsed results can be compared with expected ones.
        auto operator<=>(Command const&) const = default;
    };

    // A parsed script: a sequence of Command values.
    using Commands = std::vector<Command>;
} // namespace script

BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script {
    namespace qi = boost::spirit::qi;

    // Qi grammar for the line-oriented script language.
    //
    // Each input line yields exactly one Command:
    //   - blank line              -> NONE, no arguments
    //   - Keyword(arg, arg, ...)  -> the keyword's type plus its arguments
    //   - // text                 -> COMMENT with the text as its only argument
    //   - anything else           -> FAIL, no arguments
    template <typename It> class Parser : public qi::grammar<It, Commands()> {
    public:
        Parser() : Parser::base_type(start) {
            using namespace qi;

            type.add //
                ("WriteLog",     Command::WRITE_LOG)     //
                ("InsertLabel",  Command::INSERT_LABEL)  //
                ("StartProcess", Command::START_PROCESS) //
                ("EndProcess",   Command::END_PROCESS);  //

            // Blank line: only blanks up to (but not consuming) eol/eoi.
            none = omit[*blank] >> &(eol | eoi) //
                >> attr(Command{Command::NONE, {}});

            // Quoted string: the quotes are syntax and are not stored; a
            // backslash keeps the following character verbatim. Line breaks are
            // excluded so an unterminated quote cannot swallow following lines.
            quoted_string = '"' >> *('\\' >> ~char_("\r\n") | ~char_("\"\r\n")) >> '"';
            // raw[] keeps the matched source text instead of the converted value.
            number        = raw[double_];
            boolean       = raw[bool_];
            raw_string    = +~char_("\"(),\r\n");
            // hold[] discards a partially filled attribute when quoted_string
            // fails, so later alternatives start from an empty string.
            argument = qi::hold[quoted_string] | number | boolean | raw_string;

            command = type >> '(' >> argument % ',' >> ')';

            comment = "//"                             //
                >> attr(Command::COMMENT)              //
                >> as_string[lexeme[*~char_("\r\n")]]; //

            // Fallback: swallow the rest of the line and tag it FAIL.
            fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

            line  = none | command | comment | fail; // keep fail last
            start = skip(blank)[line % eol] >> eoi;

            BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
                argument)(none)(quoted_string)(raw_string)(boolean)(number))
        }

    private:
        qi::symbols<char, Command::Type>         type;
        qi::rule<It, Command(), qi::blank_type>  line, none, command, comment, fail;
        // notice these are lexemes (no internal skipping):
        qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
        qi::rule<It, Commands()> start;
    };

    // Reads the whole stream and parses it into Commands.
    // Throws std::runtime_error("command parse error") when the grammar rejects the input.
    Commands parse(std::istream& in)
    {
        using It = boost::spirit::istream_iterator;
        static const Parser<It> parser;

        Commands commands;

        // The grammar handles whitespace itself, so the stream must not skip it.
        It first(in >> std::noskipws);
        It last;
        if (!qi::parse(first, last, parser, commands))
            throw std::runtime_error("command parse error");

        // Returning the local by name lets it be moved/elided; the former
        // `cond ? commands : throw ...` expression was an lvalue and forced a copy.
        return commands;
    }

    struct Formatter {
        static constexpr auto name(script::Command::Type type) {
            return std::array{"NONE",          "WRITE_LOG",   "INSERT_LABEL",
                            "START_PROCESS", "END_PROCESS", "COMMENT",
                            "FAIL"}
                .at(static_cast<int>(type));
        }

        auto parse(auto& ctx) const { return ctx.begin(); }
        auto format(script::Command const& cmd, auto& ctx) const {
            return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
        }
    };
} // namespace script

template <> struct fmt::formatter<script::Command> : script::Formatter {};

std::stringstream ss{
    R"(// just a comment

    WriteLog("this is a log")
    WriteLog("this is also (in another way) a log")
    WriteLog("but this is just a fail)

    StartProcess(17, "program.exe", True)
    StartProcess(17, "this_is_a_fail.exe, True)
    )"};

// Parses the sample script and prints, entry by entry, how the result compares
// with the expected command list.
int main() {
    using script::Command;
    using script::Commands;

    static const Commands expected{
        {Command::COMMENT, {"just a comment"}},
        {Command::NONE, {}},
        {Command::WRITE_LOG, {"this is a log"}},
        {Command::WRITE_LOG, {"this is also (in another way) a log"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
        {Command::START_PROCESS, {"17", "program.exe", "True"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
    };

    try {
        auto const parsed = script::parse(ss);
        fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                (parsed == expected), parsed.size(), expected.size());

        // Only the entries present in both lists are compared one by one.
        auto const common = std::min(expected.size(), parsed.size());
        for (auto i = 0u; i < common; ++i) {
            if (expected[i] == parsed[i]) {
                fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
                continue;
            }
            fmt::print("index #{} expected {}\n"
                    "          actual:  {}\n",
                    i, expected[i], parsed[i]);
        }
    } catch (std::exception const& e) {
        fmt::print("Exception: {}\n", e.what());
    }
}

打印

Parsed all correct? true -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 CORRECT (Command(FAIL, []))
index #8 CORRECT (Command(NONE, []))

¹ 请参见例如 “boost::spirit alternative parsers return duplicates”（其中链接了另外三个同类问题）

You say you want to allow parentheses within quoted strings. But you don't even support quoted strings!

So the problem is your argument rule, which doesn't even exist. It would be roughly this part:

argument = +~char_("(),\r\n");
command = type >> '(' >> argument % ',' >> ')';

Where argument might be declared as

qi::rule<It, Argument()> argument;

In fact, rewriting the tests in an organized fashion, here's what we get right now:

Live On Compiler Explorer

static const Commands expected{
    {Command::COMMENT, {"just a comment"}},
    {Command::NONE, {}},
    {Command::WRITE_LOG, {"this is a log"}},
    {Command::WRITE_LOG, {"this is also (in another way) a log"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
    {Command::START_PROCESS, {"17", "program.exe", "True"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
};

try {
    auto parsed = script::parse(ss);
    fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
               (parsed == expected), parsed.size(), expected.size());

    for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
        if (expected[i] != parsed[i]) {
            fmt::print("index #{} expected {}\n"
                       "          actual:  {}\n",
                       i, expected[i], parsed[i]);
        } else {
            fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
        }
    }
} catch (std::exception const& e) {
    fmt::print("Exception: {}\n", e.what());
}

Prints

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 expected Command(WRITE_LOG, ["this is a log"])
          actual:  Command(WRITE_LOG, ["\"this is a log\""])
index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
          actual:  Command(FAIL, [])
index #4 expected Command(FAIL, [])
          actual:  Command(WRITE_LOG, ["\"but this is just a fail"])
index #5 CORRECT (Command(NONE, []))
index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
          actual:  Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

As you can see, by my expectation the quoted strings fail too. That's because the quoting is a language construct. In the AST (parsed results) you do not care about how exactly it was written in code. E.g. "hello\ world\041" might be equivalent to "hello world!", so both should result in the argument value hello world!.

So, let's do as we say:

argument = quoted_string | number | boolean | raw_string;

We can add a few rules:

// notice these are lexemes (no internal skipping):
qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;

And define them:

quoted_string = '"' >> *~char_('"') >> '"';
number        = raw[double_];
boolean       = raw[bool_];
raw_string    = +~char_("(),\r\n");
argument      = quoted_string | number | boolean | raw_string;

If you want to allow escaped quotes, use something like this:
 quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';

Now, I'd say you probably want Argument to be something like variant<double, std::string, bool>, instead of just std::string.

With only this change, all the problems have practically vanished: Live On Compiler Explorer:

Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 expected Command(FAIL, [])
          actual:  Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))

Now, index #7 looks very funky, but it's actually a well-known phenomenon in Spirit¹. Enabling BOOST_SPIRIT_DEBUG demonstrates it:

  <argument>
    <try>"this_is_a_fail.exe,</try>
    <quoted_string>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </quoted_string>
    <number>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </number>
    <boolean>
      <try>"this_is_a_fail.exe,</try>
      <fail/>
    </boolean>
    <raw_string>
      <try>"this_is_a_fail.exe,</try>
      <success>, True)</success>
      <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
    </raw_string>
    <success>, True)</success>
    <attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,,  , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
  </argument>

So, the string gets accepted as a raw string, even though it started with ". That's easily fixed, but we don't even need to. We could just apply qi::hold to avoid the duplication:

argument = qi::hold[quoted_string] | number | boolean | raw_string;

Result:

actual:  Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])

However, if you expect it to fail, fix that other problem:

raw_string    = +~char_("\"(),\r\n"); // note the \"

Note: In the off-chance you really only require it to not start with
a quote:
raw_string    = !lit('"') >> +~char_("(),\r\n");
I guess by now you see the problem with a "loose rule" like that, so I
don't recommend it.
You could express the requirement another way though, saying "if an
argument starts with '"' then it MUST be a quoted_string". Use
an expectation point there:
quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';
This has the effect that failure to parse a complete quoted_string
will throw an expectation_failed exception.

Summary / Listing

This is what we end up with:

Live On Compiler Explorer

//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fmt/ranges.h>

// Data model produced by the parser.
namespace script {
    // Arguments are kept as plain text, whatever their syntactic form was.
    using Argument = std::string;
    using Arguments = std::vector<Argument>;

    // One parsed script line: the kind of line plus its arguments.
    struct Command {
        // Kind of line. NONE is also the default value of `type`.
        enum Type {
            NONE,
            WRITE_LOG,
            INSERT_LABEL,
            START_PROCESS,
            END_PROCESS,
            COMMENT,
            FAIL
        };

        Type      type{Type::NONE};
        Arguments args;

        // Member-wise comparison, so parsed results can be compared with expected ones.
        auto operator<=>(Command const&) const = default;
    };

    // A parsed script: a sequence of Command values.
    using Commands = std::vector<Command>;
} // namespace script

BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)

namespace script {
    namespace qi = boost::spirit::qi;

    // Qi grammar for the line-oriented script language.
    //
    // Each input line yields exactly one Command:
    //   - blank line              -> NONE, no arguments
    //   - Keyword(arg, arg, ...)  -> the keyword's type plus its arguments
    //   - // text                 -> COMMENT with the text as its only argument
    //   - anything else           -> FAIL, no arguments
    template <typename It> class Parser : public qi::grammar<It, Commands()> {
    public:
        Parser() : Parser::base_type(start) {
            using namespace qi;

            type.add //
                ("WriteLog",     Command::WRITE_LOG)     //
                ("InsertLabel",  Command::INSERT_LABEL)  //
                ("StartProcess", Command::START_PROCESS) //
                ("EndProcess",   Command::END_PROCESS);  //

            // Blank line: only blanks up to (but not consuming) eol/eoi.
            none = omit[*blank] >> &(eol | eoi) //
                >> attr(Command{Command::NONE, {}});

            // Quoted string: the quotes are syntax and are not stored; a
            // backslash keeps the following character verbatim. Line breaks are
            // excluded so an unterminated quote cannot swallow following lines.
            quoted_string = '"' >> *('\\' >> ~char_("\r\n") | ~char_("\"\r\n")) >> '"';
            // raw[] keeps the matched source text instead of the converted value.
            number        = raw[double_];
            boolean       = raw[bool_];
            raw_string    = +~char_("\"(),\r\n");
            // hold[] discards a partially filled attribute when quoted_string
            // fails, so later alternatives start from an empty string.
            argument = qi::hold[quoted_string] | number | boolean | raw_string;

            command = type >> '(' >> argument % ',' >> ')';

            comment = "//"                             //
                >> attr(Command::COMMENT)              //
                >> as_string[lexeme[*~char_("\r\n")]]; //

            // Fallback: swallow the rest of the line and tag it FAIL.
            fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

            line  = none | command | comment | fail; // keep fail last
            start = skip(blank)[line % eol] >> eoi;

            BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
                argument)(none)(quoted_string)(raw_string)(boolean)(number))
        }

    private:
        qi::symbols<char, Command::Type>         type;
        qi::rule<It, Command(), qi::blank_type>  line, none, command, comment, fail;
        // notice these are lexemes (no internal skipping):
        qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
        qi::rule<It, Commands()> start;
    };

    // Reads the whole stream and parses it into Commands.
    // Throws std::runtime_error("command parse error") when the grammar rejects the input.
    Commands parse(std::istream& in)
    {
        using It = boost::spirit::istream_iterator;
        static const Parser<It> parser;

        Commands commands;

        // The grammar handles whitespace itself, so the stream must not skip it.
        It first(in >> std::noskipws);
        It last;
        if (!qi::parse(first, last, parser, commands))
            throw std::runtime_error("command parse error");

        // Returning the local by name lets it be moved/elided; the former
        // `cond ? commands : throw ...` expression was an lvalue and forced a copy.
        return commands;
    }

    struct Formatter {
        static constexpr auto name(script::Command::Type type) {
            return std::array{"NONE",          "WRITE_LOG",   "INSERT_LABEL",
                            "START_PROCESS", "END_PROCESS", "COMMENT",
                            "FAIL"}
                .at(static_cast<int>(type));
        }

        auto parse(auto& ctx) const { return ctx.begin(); }
        auto format(script::Command const& cmd, auto& ctx) const {
            return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
        }
    };
} // namespace script

template <> struct fmt::formatter<script::Command> : script::Formatter {};

std::stringstream ss{
    R"(// just a comment

    WriteLog("this is a log")
    WriteLog("this is also (in another way) a log")
    WriteLog("but this is just a fail)

    StartProcess(17, "program.exe", True)
    StartProcess(17, "this_is_a_fail.exe, True)
    )"};

// Parses the sample script and prints, entry by entry, how the result compares
// with the expected command list.
int main() {
    using script::Command;
    using script::Commands;

    static const Commands expected{
        {Command::COMMENT, {"just a comment"}},
        {Command::NONE, {}},
        {Command::WRITE_LOG, {"this is a log"}},
        {Command::WRITE_LOG, {"this is also (in another way) a log"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
        {Command::START_PROCESS, {"17", "program.exe", "True"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
    };

    try {
        auto const parsed = script::parse(ss);
        fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                (parsed == expected), parsed.size(), expected.size());

        // Only the entries present in both lists are compared one by one.
        auto const common = std::min(expected.size(), parsed.size());
        for (auto i = 0u; i < common; ++i) {
            if (expected[i] == parsed[i]) {
                fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
                continue;
            }
            fmt::print("index #{} expected {}\n"
                    "          actual:  {}\n",
                    i, expected[i], parsed[i]);
        }
    } catch (std::exception const& e) {
        fmt::print("Exception: {}\n", e.what());
    }
}

Prints

Parsed all correct? true -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 CORRECT (Command(FAIL, []))
index #8 CORRECT (Command(NONE, []))

¹ see e.g. boost::spirit alternative parsers return duplicates (which links to three more of the same kind)

回复收藏 0 原文

~没有更多了~