为什么这种ANTLR语法报告错误？

发布于 2025-02-09 00:20:04 字数 7095 浏览 4 评论 0原文

我有一个相当简单的语法，用于解析 URI。它是借助 antlr4-maven-plugin 编译的。编译过程没有产生任何警告或错误。我写了一个简单的测试。

Uri.g4：

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

grammar Uri;

options {
  tokenVocab = Common;
}

@header {
  package com.oliveryasuna.http.antlr;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

ipv6Address
:                                                                            '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
|                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

pathAbsolute
  : '/' (segmentNz ('/' segment)?)?
  ;

pathNoScheme
  : segmentNzNc ('/' segment)?
  ;

pathRootless
  : segmentNz ('/' segment)?
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

relativePart
  : '//' authority pathAbEmpty
  | pathAbEmpty
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

Common.g4：

lexer grammar Common;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '$'  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;

测试方法：

/**
 * Parses the sample URI "https://www.google.com/" with the generated lexer and
 * parser, and asserts that invoking the {@code uri} rule does not throw.
 */
@Test
final void google() {
  final String uri = "https://www.google.com/";

  // Character stream -> lexer -> token stream -> parser.
  final UriLexer lexer = new UriLexer(new ANTLRInputStream(uri));
  final UriParser parser = new UriParser(new CommonTokenStream(lexer));

  // Convert every syntax error delivered to this listener into an exception so
  // that the assertion below fails. The listener is registered on the parser
  // only; no listener is added to the lexer in this test.
  parser.addErrorListener(new BaseErrorListener() {
    @Override
    public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line, final int charPositionInLine, final String msg, final RecognitionException e) {
      throw new IllegalStateException("[" + line + ":" + charPositionInLine + "] Symbol [" + offendingSymbol + "] produced error: " + msg + ".", e);
    }
  });

  // Run the `uri` rule as the entry point under test.
  Assertions.assertDoesNotThrow(parser::uri);
}

当我输入 https://www.google.com/ 时，得到以下错误。

我完全不知道是什么导致了这些解析错误。有人有头绪吗？

输出：

line 1:0 token recognition error at: 'h'
line 1:1 token recognition error at: 't'
line 1:2 token recognition error at: 't'
line 1:3 token recognition error at: 'p'
line 1:4 token recognition error at: 's'
line 1:5 missing '6' at ':'

原文

I have a fairly simple grammar designed to parse URIs. It is compiled with the help of antlr4-maven-plugin. Compiling produces no warnings or errors. I wrote a simple test.

Uri.g4:

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

grammar Uri;

options {
  tokenVocab = Common;
}

@header {
  package com.oliveryasuna.http.antlr;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '
Common.g4:
lexer grammar Common;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '
Test method:
@Test
final void google() {
  final String uri = "https://www.google.com/";

  final UriLexer lexer = new UriLexer(new ANTLRInputStream(uri));
  final UriParser parser = new UriParser(new CommonTokenStream(lexer));

  parser.addErrorListener(new BaseErrorListener() {
    @Override
    public void syntaxError(final Recognizer<?, ?> recognizer, final Object offendingSymbol, final int line, final int charPositionInLine, final String msg, final RecognitionException e) {
      throw new IllegalStateException("[" + line + ":" + charPositionInLine + "] Symbol [" + offendingSymbol + "] produced error: " + msg + ".", e);
    }
  });

  Assertions.assertDoesNotThrow(parser::uri);
}

I get the following errors when I input https://www.google.com/.
I have absolutely no idea what is causing these parsing errors. Does anyone have an idea?
Output:
line 1:0 token recognition error at: 'h'
line 1:1 token recognition error at: 't'
line 1:2 token recognition error at: 't'
line 1:3 token recognition error at: 'p'
line 1:4 token recognition error at: 's'
line 1:5 missing '6' at ':'

 | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

ipv6Address
:                                                                            '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
|                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

pathAbsolute
  : '/' (segmentNz ('/' segment)?)?
  ;

pathNoScheme
  : segmentNzNc ('/' segment)?
  ;

pathRootless
  : segmentNz ('/' segment)?
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

relativePart
  : '//' authority pathAbEmpty
  | pathAbEmpty
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

Common.g4:

Test method:

I get the following errors when I input https://www.google.com/.
I have absolutely no idea what is causing these parsing errors. Does anyone have an idea?
Output:

  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;

Test method:

I get the following errors when I input https://www.google.com/.

I have absolutely no idea what is causing these parsing errors. Does anyone have an idea?

Output:

| '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' ; unreserved : ALPHA | DIGIT | '-' | '.' | '_' | '~' ; uri : scheme ':' hierPart ('?' query)? ('#' fragment_)? ; hierPart : '//' authority pathAbEmpty | pathAbsolute | pathRootless | pathEmpty ; scheme : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')* ; authority : (userinfo '@')? host (':' port)? ; userinfo : (unreserved | pctEncoded | subDelims | ':')* ; host : ipLiteral | ipv4Address | regName ; ipLiteral : '[' (ipv6Address | ipvFuture) ']' ; ipvFuture : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+ ; ipv6Address : '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32 | '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32 | h16? '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32 | ((h16 ':')? h16)? '::' (h16 ':') (h16 ':') (h16 ':') ls32 | ((h16 ':')? (h16 ':')? h16)? '::' (h16 ':') (h16 ':') ls32 | ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' h16 ':' ls32 | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' ls32 | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' h16 | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::' ; ls32 : (h16 ':' h16) | ipv4Address ; h16 : HEXDIG HEXDIG? HEXDIG? HEXDIG? ; ipv4Address : decOctet '.' decOctet '.' decOctet '.' decOctet ; decOctet : DIGIT | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT | '1' DIGIT DIGIT | '2' ('0' | '1' | '2' | '3' | '4') DIGIT | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5') ; regName : (unreserved | pctEncoded | subDelims)* ; port : DIGIT* ; path : pathAbEmpty | pathAbsolute | pathNoScheme | pathRootless | pathEmpty ; pathAbEmpty : ('/' segment)* ; pathAbsolute : '/' (segmentNz ('/' segment)?)? ; pathNoScheme : segmentNzNc ('/' segment)? ; pathRootless : segmentNz ('/' segment)? ; pathEmpty : // TODO: 0<pchar>. 
; segment : pchar* ; segmentNz : pchar+ ; segmentNzNc : (unreserved | pctEncoded | subDelims | '@')+ ; pchar : unreserved | pctEncoded | subDelims | ':' | '@' ; query : (pchar | '/' | '?')* ; fragment_ : (pchar | '/' | '?')* ; uriReference : uri | relativeRef ; relativeRef : relativePart ('?' query)? ('#' fragment_)? ; relativePart : '//' authority pathAbEmpty | pathAbEmpty | pathNoScheme | pathEmpty ; absoluteUri : scheme ':' hierPart ('?' query)? ;

Common.g4:

Test method:

I get the following errors when I input https://www.google.com/.

I have absolutely no idea what is causing these parsing errors. Does anyone have an idea?

Output:

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

韵柒 2025-02-16 00:20:04

ANTLR 将词法分析（分词）与语法分析严格分离。词法分析器独立于解析器工作，并依据两条简单的规则创建词法符号（token）：

1. 尽可能让单条词法规则匹配最多的字符；
2. 当两条或多条词法规则匹配相同的字符时，先定义的规则“获胜”。

如果我们现在来看您的规则：

ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;

很明显，词法规则 HEXDIG 永远不会被匹配，因为 HEXDIG 能匹配的任何字符都会被 ALPHA 或 DIGIT 匹配，而这两条规则都定义在 HEXDIG 之前。调换顺序：

HEXDIG  : [0-9a-fA-F]           ;
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;

也行不通，因为这样一来任何数字都永远不会成为 DIGIT 词法符号，字母 F 也永远不会成为 ALPHA。

请注意，这只是一个示例：您的Lexer语法中有更多这样的情况。

一种解决方案是将一些责任移至解析器而不是lexer：

// One token type per letter; each rule matches both the lower- and upper-case form.
A : [aA];
B : [bB];
C : [cC];
D : [dD];
E : [eE];
F : [fF];
G : [gG];
H : [hH];
I : [iI];
J : [jJ];
K : [kK];
L : [lL];
M : [mM];
N : [nN];
O : [oO];
P : [pP];
Q : [qQ];
R : [rR];
S : [sS];
T : [tT];
U : [uU];
V : [vV];
W : [wW];
X : [xX];
Y : [yY];
Z : [zZ];

// One token type per decimal digit.
D0 : '0';
D1 : '1';
D2 : '2';
D3 : '3';
D4 : '4';
D5 : '5';
D6 : '6';
D7 : '7';
D8 : '8';
D9 : '9';

然后在解析器中，您要做：

// Any single letter token (A through Z).
alpha
 : A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z
 ;

// Any single decimal-digit token (D0 through D9).
digit
 : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
 ;

// A hexadecimal digit: one of the letter tokens A through F, or any digit.
hexdig
 : A | B | C | D | E | F | digit
 ;

另外，请从解析器中删除所有像 '6' 这样的字面量词法符号，改用相应的词法规则（此处为 D6）。每当解析器遇到这种未在词法分析器中定义的字面量词法符号时，它会“神奇地”为其创建一个新的词法符号，从而导致令人费解的错误/警告信息。最好从解析器中删除所有（我是说全部！）这样的字面量词法符号。

ANTLR's lexer has a strict separation between parsing and tokenizing/lexing. The lexer also works independently from the parser and creates tokens based on 2 simple rules:

try to consume as many characters for a single lexer rule
when 2 or more lexer rules match the same characters, let the one defined first "win"

If we now look at your rules:

ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;

it is clear that the lexer rule HEXDIG will never be matched because either ALPHA or DIGIT will match whatever HEXDIG matches and are defined before HEXDIG. Switching the order:

HEXDIG  : [0-9a-fA-F]           ;
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;

will not work because any digit will now never become a DIGIT token, and an F will now also never become an ALPHA.

Note that this is just a single example: there are more of such cases in your lexer grammar.

A solution would be to move some of the responsibility to the parser instead of the lexer:

// One token type per letter; each rule matches both the lower- and upper-case form.
A : [aA];
B : [bB];
C : [cC];
D : [dD];
E : [eE];
F : [fF];
G : [gG];
H : [hH];
I : [iI];
J : [jJ];
K : [kK];
L : [lL];
M : [mM];
N : [nN];
O : [oO];
P : [pP];
Q : [qQ];
R : [rR];
S : [sS];
T : [tT];
U : [uU];
V : [vV];
W : [wW];
X : [xX];
Y : [yY];
Z : [zZ];

// One token type per decimal digit.
D0 : '0';
D1 : '1';
D2 : '2';
D3 : '3';
D4 : '4';
D5 : '5';
D6 : '6';
D7 : '7';
D8 : '8';
D9 : '9';

and then in the parser you do:

// Any single letter token (A through Z).
alpha
 : A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z
 ;

// Any single decimal-digit token (D0 through D9).
digit
 : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
 ;

// A hexadecimal digit: one of the letter tokens A through F, or any digit.
hexdig
 : A | B | C | D | E | F | digit
 ;

Also, remove all the literal tokens like '6' from the parser and use the proper lexer rule instead (D6, in this case). Whenever the parser sees such a literal token, which is not defined in the lexer, it "magically" creates a new token for it, resulting in mysterious error/warning messages. Best to remove all (and I mean all!) such literal tokens from the parser.

回复收藏 0 原文

删除→记忆 2025-02-16 00:20:04

除了 Bart 针对语法给出的回答（全部正确）之外，还要指出：拆分语法不是这样写的！

您必须在 UriParser.g4 中写“parser grammar UriParser;”（将 Uri.g4 重命名为 UriParser.g4），并在 UriLexer.g4 中写“lexer grammar UriLexer;”（将 Common.g4 重命名为 UriLexer.g4）。

如果您尝试为原始的“拆分”语法生成解析器，会得到 ANTLR 工具生成的三个 .tokens 文件，它们的大小和内容各不相同。这表明词法分析器与解析器之间很可能没有协调词法符号类型。这与“token recognition error”无关，因为正如 Bart 所说，词法分析器完全独立于解析器运行。但是，当您开始用其他输入测试语法产生式时，它就会产生影响。

另外，您绝不应该在语法中包含 @header { package ...; }。您应该改用 -package 选项。使用 @header 会使语法完全无法移植到其他目标语言，而且当一个目录中有多个语法、其中一些带 @header 而另一些不带时，还会引发问题。

如果修复了这些问题，代码就能解析您的输入——但需要注意，您的词法规则仍不正确（参见 Bart 的回答）。

目前尚不清楚您一开始为什么要拆分语法。

UriParser.g4：

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

parser grammar UriParser;

options {
  tokenVocab = UriLexer;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '
UriLexer.g4：
lexer grammar UriLexer;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '
 | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

ipv6Address
:                                                                            '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
|                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

pathAbsolute
  : '/' (segmentNz ('/' segment)?)?
  ;

pathNoScheme
  : segmentNzNc ('/' segment)?
  ;

pathRootless
  : segmentNz ('/' segment)?
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

relativePart
  : '//' authority pathAbEmpty
  | pathAbEmpty
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

UriLexer.g4：

  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;

UriLexer.g4：

In addition to the answer Bart made on the grammar--all correct--this is not how to write a split grammar!

You must have "parser grammar UriParser;" in UriParser.g4 (rename Uri.g4 to UriParser.g4), and "lexer grammar UriLexer;" in UriLexer.g4 (rename Common.g4 to UriLexer.g4).

If you try to generate the parser for your original "split" grammar, you get three .tokens files generated by the Antlr tool, all different in size and contents. That indicates there is likely no coordination of the token types between the lexer and parser. That doesn't have anything to do with the "token recognition error" because as Bart says, the lexer operates completely independently from the parser. But, it will have an impact when you start testing the grammar productions with other input.

Also, you should never include @header { package ...; } in the grammar. You need to use the -package option instead. Using the @header makes the grammar completely unportable to other targets, and creates a problem if you have multiple grammars in one directory, some with the @header and some without.

If you fix these problems, the code parses your input--with the caveat that your lexer rules are not correct (see Bart's answer).

It's not clear why you split the grammar to begin with.

UriParser.g4:

/**
 * Uniform Resource Identifier (RFC 3986).
 *
 * @author Oliver Yasuna
 * @see <a href="https://www.rfc-editor.org/rfc/rfc3986.html">RFC 3986</a>
 * @since 1.0.0
 */

parser grammar UriParser;

options {
  tokenVocab = UriLexer;
}

// Parser
//--------------------------------------------------

pctEncoded
  : '%' HEXDIG HEXDIG
  ;

reserved
  : genDelims | subDelims
  ;

genDelims
  : ':' | '/' | '?' | '#' | '[' | ']' | '@'
  ;

subDelims
  : '!' | '
UriLexer.g4:
lexer grammar UriLexer;

// ASCII
//--------------------------------------------------

BANG                  : '!'  ;
//DOUBLE_QUOTE          : '"'  ;
HASH                  : '#'  ;
DOLLAR                : '
 | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
  ;

unreserved
  : ALPHA | DIGIT | '-' | '.' | '_' | '~'
  ;

uri
  : scheme ':' hierPart ('?' query)? ('#' fragment_)?
  ;

hierPart
  : '//' authority pathAbEmpty
  | pathAbsolute
  | pathRootless
  | pathEmpty
  ;

scheme
  : ALPHA (ALPHA | DIGIT | '+' | '-' | '.')*
  ;

authority
  : (userinfo '@')? host (':' port)?
  ;

userinfo
  : (unreserved | pctEncoded | subDelims | ':')*
  ;

host
  : ipLiteral
  | ipv4Address
  | regName
  ;

ipLiteral
  : '[' (ipv6Address | ipvFuture) ']'
  ;

ipvFuture
  : 'v' HEXDIG+ '.' (unreserved | subDelims | ':')+
  ;

ipv6Address
:                                                                            '::' (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
|                                                                            '::'           (h16 ':') (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                                    h16?  '::'                     (h16 ':') (h16 ':') (h16 ':') (h16 ':') ls32
  |                                                        ((h16 ':')? h16)? '::'                               (h16 ':') (h16 ':') (h16 ':') ls32
  |                                             ((h16 ':')? (h16 ':')? h16)? '::'                                         (h16 ':') (h16 ':') ls32
  |                                  ((h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                    h16 ':'  ls32
  |                       ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             ls32
  |            ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'                                                             h16
  | ((h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? (h16 ':')? h16)? '::'
  ;

ls32
  : (h16 ':' h16)
  | ipv4Address
  ;

h16
  : HEXDIG HEXDIG? HEXDIG? HEXDIG?
  ;

ipv4Address
  : decOctet '.' decOctet '.' decOctet '.' decOctet
  ;

decOctet
  : DIGIT
  | ('1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9') DIGIT
  | '1' DIGIT DIGIT
  | '2' ('0' | '1' | '2' | '3' | '4') DIGIT
  | '2' '5' ('0' | '1' | '2' | '3' | '4' | '5')
  ;

regName
  : (unreserved | pctEncoded | subDelims)*
  ;

port
  : DIGIT*
  ;

path
  : pathAbEmpty
  | pathAbsolute
  | pathNoScheme
  | pathRootless
  | pathEmpty
  ;

pathAbEmpty
  : ('/' segment)*
  ;

pathAbsolute
  : '/' (segmentNz ('/' segment)?)?
  ;

pathNoScheme
  : segmentNzNc ('/' segment)?
  ;

pathRootless
  : segmentNz ('/' segment)?
  ;

pathEmpty
  : // TODO: 0<pchar>.
  ;

segment
  : pchar*
  ;

segmentNz
  : pchar+
  ;

segmentNzNc
  : (unreserved | pctEncoded | subDelims | '@')+
  ;

pchar
  : unreserved | pctEncoded | subDelims | ':' | '@'
  ;

query
  : (pchar | '/' | '?')*
  ;

fragment_
  : (pchar | '/' | '?')*
  ;

uriReference
  : uri
  | relativeRef
  ;

relativeRef
  : relativePart ('?' query)? ('#' fragment_)?
  ;

relativePart
  : '//' authority pathAbEmpty
  | pathAbEmpty
  | pathNoScheme
  | pathEmpty
  ;

absoluteUri
  : scheme ':' hierPart ('?' query)?
  ;

UriLexer.g4:

  ;
PERCENT               : '%'  ;
AND                   : '&'  ;
SINGLE_QUOTE          : '\'' ;
LEFT_PARENTHESES      : '('  ;
RIGHT_PARENTHESES     : ')'  ;
STAR                  : '*'  ;
PLUS                  : '+'  ;
COMMA                 : ','  ;
MINUS                 : '-'  ;
DOT                   : '.'  ;
SLASH                 : '/'  ;
COLON                 : ':'  ;
SEMICOLON             : ';'  ;
LEFT_ANGLE_BRACKET    : '<'  ;
EQUAL                 : '='  ;
RIGHT_ANGLE_BRACKET   : '>'  ;
QUESTION              : '?'  ;
AT                    : '@'  ;
LEFT_SQUARE_BRACKET   : '['  ;
BACKSLASH             : '\\' ;
RIGHT_SQUARE_BRACKET  : ']'  ;
CARROT                : '^'  ;
UNDERSCORE            : '_'  ;
BACKTICK              : '`'  ;
LEFT_CURLY_BRACKET    : '{'  ;
BAR                   : '|'  ;
RIGHT_CURLY_BRACKET   : '}'  ;
TILDE                 : '~'  ;

// Core
//--------------------------------------------------

// Taken from ABNF.
ALPHA   : [a-zA-Z]              ;
DIGIT   : [0-9]                 ;
HEXDIG  : [0-9a-fA-F]           ;
DQUOTE  : '"'                   ;
SP      : ' '                   ;
HTAB    : '\t'                  ;
WSP     : SP | HTAB             ;
//LWSP    : (WSP | CRLF WSP)*     ;
VCHAR   : [\u0021-\u007F]       ;
CHAR    : [\u0001-\u007F]       ;
OCTET   : [\u0000-\u00FF]       ;
CTL     : [\u0000-\u001F\u007F] ;
CR      : '\r'                  ;
LF      : '\n'                  ;
CRLF    : CR LF                 ;
BIT     : '0' | '1'             ;

// Miscellaneous
//--------------------------------------------------

DOUBLE_SLASH  : '//' ;
DOUBLE_COLON  : '::' ;

LOWER_V       : 'v'  ;

ZERO          : '0'  ;
ONE           : '1'  ;
TWO           : '2'  ;
THREE         : '3'  ;
FOUR          : '4'  ;
FIVE          : '5'  ;
SIX           : '6'  ;
SEVEN         : '7'  ;
EIGHT         : '8'  ;
NINE          : '9'  ;

UriLexer.g4:

回复收藏 0 原文

~没有更多了~