如何编写统一 diff 语法的解析器

发布于 2024-09-16 04:15:13 字数 162 浏览 8 评论 0原文

我应该使用 RegexParsers 还是 StandardTokenParsers,或者它们根本就不适合解析这种语法?语法示例可以从此处找到。

Should I use RegexParsers, StandardTokenParsers or are these suitable at all for parsing this kind of syntax? Example of the syntax can be found from here.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(4)

叫嚣ゝ 2024-09-23 04:15:13

我会使用正则表达式。它简化了一些事情,并使其余的事情变得标准。

/** Reads a unified diff line by line and prints a description of every line.
  *
  * Each line is classified by matching it, in order, against the regular
  * expressions below; the first pattern that matches the WHOLE line wins, so
  * the file-header patterns must stay ahead of the added/removed patterns
  * (a `+++ ` header would otherwise be reported as an added line).
  *
  * NOTE(review): the file-header pattern only recognises headers whose
  * timestamp is wrapped in two single quotes on each side; a header written
  * any other way falls through to the added/removed cases.
  *
  * @param src source whose lines are classified; it is consumed but not closed here
  */
def process(src: scala.io.Source): Unit = {
  import scala.util.matching.Regex

  val FilePattern = """(.*) ''(.*)''"""
  val OriginalFile = new Regex("--- "+FilePattern, "path", "timestamp")
  // '+' is a regex quantifier: it has to be escaped to match a literal plus sign.
  val NewFile = new Regex("""\+\+\+ """+FilePattern, "path", "timestamp")
  val Chunk = new Regex("""@@ -(\d+),(\d+) \+(\d+),(\d+) @@""", "orgStarting", "orgSize", "newStarting", "newSize")
  val AddedLine = """\+(.*)""".r
  val RemovedLine = """-(.*)""".r
  val UnchangedLine = """ (.*)""".r

  src.getLines() foreach {
    case OriginalFile(path, timestamp) => println("Original file: "+path)
    case NewFile(path, timestamp) => println("New file: "+path)
    // The captured groups are Strings, so they are formatted with %s rather than %d.
    case Chunk(l1, s1, l2, s2) => println("Modifying %s lines at line %s, to %s lines at %s" format (s1, l1, s2, l2))
    case AddedLine(line) => println("Adding line "+line)
    case RemovedLine(line) => println("Removing line "+line)
    case UnchangedLine(line) => println("Keeping line "+line)
    // Anything else (empty lines, "\ No newline at end of file", tool chatter)
    // is reported instead of aborting the whole run with a MatchError.
    case other => println("Skipping unrecognized line "+other)
  }
}

I'd use regex. It simplifies a few things, and makes the rest standard.

/** Reads a unified diff line by line and prints a description of every line.
  *
  * Each line is classified by matching it, in order, against the regular
  * expressions below; the first pattern that matches the WHOLE line wins, so
  * the file-header patterns must stay ahead of the added/removed patterns
  * (a `+++ ` header would otherwise be reported as an added line).
  *
  * NOTE(review): the file-header pattern only recognises headers whose
  * timestamp is wrapped in two single quotes on each side; a header written
  * any other way falls through to the added/removed cases.
  *
  * @param src source whose lines are classified; it is consumed but not closed here
  */
def process(src: scala.io.Source): Unit = {
  import scala.util.matching.Regex

  val FilePattern = """(.*) ''(.*)''"""
  val OriginalFile = new Regex("--- "+FilePattern, "path", "timestamp")
  // '+' is a regex quantifier: it has to be escaped to match a literal plus sign.
  val NewFile = new Regex("""\+\+\+ """+FilePattern, "path", "timestamp")
  val Chunk = new Regex("""@@ -(\d+),(\d+) \+(\d+),(\d+) @@""", "orgStarting", "orgSize", "newStarting", "newSize")
  val AddedLine = """\+(.*)""".r
  val RemovedLine = """-(.*)""".r
  val UnchangedLine = """ (.*)""".r

  src.getLines() foreach {
    case OriginalFile(path, timestamp) => println("Original file: "+path)
    case NewFile(path, timestamp) => println("New file: "+path)
    // The captured groups are Strings, so they are formatted with %s rather than %d.
    case Chunk(l1, s1, l2, s2) => println("Modifying %s lines at line %s, to %s lines at %s" format (s1, l1, s2, l2))
    case AddedLine(line) => println("Adding line "+line)
    case RemovedLine(line) => println("Removing line "+line)
    case UnchangedLine(line) => println("Keeping line "+line)
    // Anything else (empty lines, "\ No newline at end of file", tool chatter)
    // is reported instead of aborting the whole run with a MatchError.
    case other => println("Skipping unrecognized line "+other)
  }
}
浅浅 2024-09-23 04:15:13

这种格式被设计为易于解析,您完全可以不用任何正则表达式、也不必对输入做词法切分(tokenize)就完成解析。只需逐行查看每行开头的一两个字符即可。文件头和块头需要多花一点心思,但用 split 就足以处理。

当然,如果你想学习如何使用一些解析库,那就去吧。

This format was designed to be easy to parse, you can do it without any regular expressions and without tokenizing your input. Just go line by line and look at the first couple of characters. The file header and chunks headers will require a little more attention, but it's nothing you can't do with split.

Of course, if you want to learn how to use some parsing libraries, then go for it.

夕色琉璃 2024-09-23 04:15:13

这是使用 RegexParsers 的解决方案。

import scala.util.parsing.combinator.RegexParsers

/** Parser-combinator grammar for a unified diff describing ONE file pair.
  *
  * The accepted input is: a `--- ` header, a `+++ ` header (each made of a
  * whitespace-free file name, whitespace, and the rest of the line as the
  * timestamp), followed by one or more hunks. Every line, including the last
  * one, must be terminated by a line feed.
  */
object UnifiedDiffParser extends RegexParsers {

  // case classes representing the data of the diff
  case class UnifiedDiff(oldFile: File, newFile: File, changeChunks: List[ChangeChunk])
  case class File(name: String, timeStamp: String)
  // changeLines keeps every hunk line verbatim, including its leading ' ', '+' or '-'.
  case class ChangeChunk(rangeInformation: RangeInformation, changeLines: List[String])
  case class RangeInformation(oldOffset: Int, oldLength: Int, newOffset: Int, newLength: Int)

  // Leading blanks and line feeds carry meaning in a diff, so the grammar
  // consumes them explicitly instead of letting RegexParsers skip them.
  override def skipWhitespace = false

  /** Entry point of the grammar: both file headers followed by at least one hunk. */
  def unifiedDiff: Parser[UnifiedDiff] = oldFile ~ newFile ~ rep1(changeChunk) ^^ {
    case of ~ nf ~ l => UnifiedDiff(of, nf, l)
  }

  def oldFile: Parser[File] = ("--- " ~> filename) ~ ("""\s+""".r ~> timestamp <~ newline) ^^ {
    case f~t => File(f, t)
  }
  def newFile: Parser[File] = ("+++ " ~> filename) ~ ("""\s+""".r ~> timestamp <~ newline) ^^ {
    case f~t => File(f, t)
  }
  def filename: Parser[String] = """[\S]+""".r
  def timestamp: Parser[String] = """.*""".r

  /** One hunk: the range header, an optional section heading that is
    * discarded, the end of the header line, then the hunk's lines.
    */
  def changeChunk: Parser[ChangeChunk] =
    rangeInformation ~ (opt(sectionHeading) ~> newline ~> rep1(lineChange)) ^^ {
      case ri ~ l => ChangeChunk(ri, l)
    }

  /** Range header of a hunk. A length may be left out by the diff tool when
    * the range is exactly one line long, so a missing length is read as 1.
    */
  def rangeInformation: Parser[RangeInformation] =
    ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
      case a ~ b ~ c ~ d => RangeInformation(a, b.getOrElse(1), c, d.getOrElse(1))
    }

  // Free text some tools append after the closing "@@" (e.g. the enclosing function).
  def sectionHeading: Parser[String] = """ .*""".r

  // A hunk line may be followed by the "\ No newline at end of file" marker,
  // which is accepted and dropped so that such diffs still parse.
  def lineChange: Parser[String] = (contextLine | addedLine | deletedLine) <~ opt(noNewlineMarker)
  def contextLine: Parser[String] = """ .*""".r <~ newline
  def addedLine: Parser[String] = """\+.*""".r <~ newline
  def deletedLine: Parser[String] = """-.*""".r <~ newline
  def noNewlineMarker: Parser[Any] = """\\ .*""".r ~ opt(newline)

  def newline: Parser[String] = """\n""".r
  def number: Parser[Int] = """\d+""".r ^^ {_.toInt}

  /** Parses the file named by the first argument, or standard input when no
    * argument is given, and prints the resulting ParseResult.
    */
  def main(args: Array[String]): Unit = {
    val readsFile = args.length != 0
    val reader: java.io.Reader =
      if (readsFile) new java.io.FileReader(args(0))
      else Console.in // read from stdin
    try {
      println(parseAll(unifiedDiff, reader))
    } finally {
      // Only a reader opened here is closed; stdin is left to the runtime.
      if (readsFile) reader.close()
    }
  }
}

Here is a solution using RegexParsers.

import scala.util.parsing.combinator.RegexParsers

/** Parser-combinator grammar for a unified diff describing ONE file pair.
  *
  * The accepted input is: a `--- ` header, a `+++ ` header (each made of a
  * whitespace-free file name, whitespace, and the rest of the line as the
  * timestamp), followed by one or more hunks. Every line, including the last
  * one, must be terminated by a line feed.
  */
object UnifiedDiffParser extends RegexParsers {

  // case classes representing the data of the diff
  case class UnifiedDiff(oldFile: File, newFile: File, changeChunks: List[ChangeChunk])
  case class File(name: String, timeStamp: String)
  // changeLines keeps every hunk line verbatim, including its leading ' ', '+' or '-'.
  case class ChangeChunk(rangeInformation: RangeInformation, changeLines: List[String])
  case class RangeInformation(oldOffset: Int, oldLength: Int, newOffset: Int, newLength: Int)

  // Leading blanks and line feeds carry meaning in a diff, so the grammar
  // consumes them explicitly instead of letting RegexParsers skip them.
  override def skipWhitespace = false

  /** Entry point of the grammar: both file headers followed by at least one hunk. */
  def unifiedDiff: Parser[UnifiedDiff] = oldFile ~ newFile ~ rep1(changeChunk) ^^ {
    case of ~ nf ~ l => UnifiedDiff(of, nf, l)
  }

  def oldFile: Parser[File] = ("--- " ~> filename) ~ ("""\s+""".r ~> timestamp <~ newline) ^^ {
    case f~t => File(f, t)
  }
  def newFile: Parser[File] = ("+++ " ~> filename) ~ ("""\s+""".r ~> timestamp <~ newline) ^^ {
    case f~t => File(f, t)
  }
  def filename: Parser[String] = """[\S]+""".r
  def timestamp: Parser[String] = """.*""".r

  /** One hunk: the range header, an optional section heading that is
    * discarded, the end of the header line, then the hunk's lines.
    */
  def changeChunk: Parser[ChangeChunk] =
    rangeInformation ~ (opt(sectionHeading) ~> newline ~> rep1(lineChange)) ^^ {
      case ri ~ l => ChangeChunk(ri, l)
    }

  /** Range header of a hunk. A length may be left out by the diff tool when
    * the range is exactly one line long, so a missing length is read as 1.
    */
  def rangeInformation: Parser[RangeInformation] =
    ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
      case a ~ b ~ c ~ d => RangeInformation(a, b.getOrElse(1), c, d.getOrElse(1))
    }

  // Free text some tools append after the closing "@@" (e.g. the enclosing function).
  def sectionHeading: Parser[String] = """ .*""".r

  // A hunk line may be followed by the "\ No newline at end of file" marker,
  // which is accepted and dropped so that such diffs still parse.
  def lineChange: Parser[String] = (contextLine | addedLine | deletedLine) <~ opt(noNewlineMarker)
  def contextLine: Parser[String] = """ .*""".r <~ newline
  def addedLine: Parser[String] = """\+.*""".r <~ newline
  def deletedLine: Parser[String] = """-.*""".r <~ newline
  def noNewlineMarker: Parser[Any] = """\\ .*""".r ~ opt(newline)

  def newline: Parser[String] = """\n""".r
  def number: Parser[Int] = """\d+""".r ^^ {_.toInt}

  /** Parses the file named by the first argument, or standard input when no
    * argument is given, and prints the resulting ParseResult.
    */
  def main(args: Array[String]): Unit = {
    val readsFile = args.length != 0
    val reader: java.io.Reader =
      if (readsFile) new java.io.FileReader(args(0))
      else Console.in // read from stdin
    try {
      println(parseAll(unifiedDiff, reader))
    } finally {
      // Only a reader opened here is closed; stdin is left to the runtime.
      if (readsFile) reader.close()
    }
  }
}
冰雪梦之恋 2024-09-23 04:15:13

在寻找为 git diff 构建 Scala 解析器(通过运行 git diff-tree 生成)时偶然发现了这一点。这与统一差异非常相似,但它确实有一些有趣的变体。

我在很大程度上参考了上面的答案,并最终编写了此处包含的解析器。当然,这并不完全是原提问者想要的,但我认为它对其他人可能有用。

import util.parsing.combinator._

object GitDiff {
  /** Builds a [[GitDiff]] from the two file names of a `diff --git` header.
    *
    * Both names are cut after their first '/', which removes the leading
    * "a/" or "b/" so that the old and the new name can be compared. A name
    * that contains no '/' at all becomes the empty string.
    *
    * @param files  the (old, new) file names as they appear in the header
    * @param op     the file operation recorded for this diff
    * @param chunks the change chunks of this diff
    */
  def apply(files: (String, String), op: FileOperation, chunks: List[ChangeChunk]): GitDiff = {
    def withoutPrefix(name: String): String = {
      val firstSlash = name.indexOf('/')
      if (firstSlash < 0) "" else name.substring(firstSlash + 1)
    }

    val (oldName, newName) = files
    new GitDiff(withoutPrefix(oldName), withoutPrefix(newName), op, chunks)
  }
}

/** The changes made to one file.
  *
  * @param oldFile name of the file before the change
  * @param newFile name of the file after the change
  * @param op      the file operation recorded for this diff
  * @param chunks  the change chunks of this diff
  */
case class GitDiff(
    oldFile: String,
    newFile: String,
    op: FileOperation,
    chunks: List[ChangeChunk]) {

  /** True when the old and the new file name differ. */
  val isRename: Boolean = !(oldFile == newFile)
}

/** What happened to a file as a whole: it was created, deleted or updated. */
sealed trait FileOperation

/** The file was created; `mode` is the number read from the "new file mode" line. */
case class NewFile(mode: Int)
  extends FileOperation

/** The file was removed; `mode` is the number read from the "deleted file mode" line. */
case class DeletedFile(mode: Int)
  extends FileOperation

/** Neither a creation nor a deletion was announced for the file. */
case object UpdatedFile
  extends FileOperation

/** One line of a change chunk; `line` is the text without its one-character marker. */
sealed trait LineChange {
  def line: String
}

/** A line that is present in both versions of the file. */
case class ContextLine(line: String)
  extends LineChange

/** A line that only exists in the old version of the file. */
case class LineRemoved(line: String)
  extends LineChange

/** A line that only exists in the new version of the file. */
case class LineAdded(line: String)
  extends LineChange

/** The four numbers of a chunk header: offset and length on the old side,
  * then offset and length on the new side.
  */
case class RangeInformation(
    oldOffset: Int,
    oldLength: Int,
    newOffset: Int,
    newLength: Int)

/** A chunk of changes: its range header together with its lines, in order. */
case class ChangeChunk(
    rangeInformation: RangeInformation,
    changeLines: List[LineChange])

// Code taken from http://stackoverflow.com/questions/3560073/how-to-write-parser-for-unified-diff-syntax
/** Parser-combinator grammar for the textual output of `git diff`.
  *
  * The accepted input is one or more file diffs. Each file diff consists of
  * a `diff --git` line, an optional "new file mode" or "deleted file mode"
  * line, an `index` line, the `--- ` and `+++ ` lines and at least one chunk.
  * Every header line must be terminated by a line feed.
  */
object GitDiffParser extends RegexParsers {

  // Leading blanks and line feeds carry meaning in a diff, so the grammar
  // consumes them explicitly instead of letting RegexParsers skip them.
  override def skipWhitespace = false

  /** Entry point of the grammar: one or more consecutive file diffs. */
  def allDiffs: Parser[List[GitDiff]] = rep1(gitDiff)

  def gitDiff: Parser[GitDiff] = filesChanged ~ fileOperation ~ diffChunks ^^ {
    case files ~ op ~ chunks => GitDiff(files, op, chunks)
  }

  def filesChanged: Parser[(String, String)] =
    "diff --git " ~> filename ~ (" " ~> filename) <~ newline ^^ { case f1 ~ f2 => (f1,f2) }

  // Without an explicit "new file"/"deleted file" line the diff is an update.
  def fileOperation: Parser[FileOperation] =
    opt(deletedFileMode | newFileMode) <~ index ^^ { _ getOrElse UpdatedFile }

  def index: Parser[Any] = ( "index " ~ hash ~ ".." ~ hash ) ~> opt(" " ~> mode) <~ newline
  def deletedFileMode: Parser[DeletedFile] = "deleted file mode " ~> mode <~ newline ^^ { m => DeletedFile(m) }
  def newFileMode: Parser[NewFile] = "new file mode " ~> mode <~ newline ^^ { m => NewFile(m) }
  // Seven digits is only the shortest abbreviation git prints; longer
  // abbreviations and full-length hashes have to be accepted as well.
  def hash: Parser[String] = """[0-9a-f]{7,}""".r
  def mode: Parser[Int] = """\d{6}""".r ^^ { _.toInt }

  // The names on the "---" and "+++" lines are parsed but not kept; the names
  // used for the result come from the "diff --git" line.
  def diffChunks: Parser[List[ChangeChunk]] = (oldFile ~ newFile) ~> rep1(changeChunk)

  def oldFile: Parser[String] = "--- " ~> filename <~ newline
  def newFile: Parser[String] = "+++ " ~> filename <~ newline
  def filename: Parser[String] = """[\S]+""".r

  /** One chunk: the range header, an optional section heading, the end of the
    * header line, then the chunk's lines. The heading is text appended to the
    * header line and not a line of the file, so it is dropped instead of
    * being reported as a context line.
    */
  def changeChunk: Parser[ChangeChunk] =
    rangeInformation ~ (opt(sectionHeading) ~> newline ~> rep1(lineChange)) ^^ {
      case ri ~ lines => ChangeChunk(ri, lines)
    }

  /** Range header of a chunk. A length is left out when the range is exactly
    * one line long, so a missing length is read as 1.
    */
  def rangeInformation: Parser[RangeInformation] =
    ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
      case a ~ b ~ c ~ d => RangeInformation(a, b getOrElse 1, c, d getOrElse 1)
    }

  // Free text appended after the closing "@@" of a chunk header.
  def sectionHeading: Parser[String] = """ .*""".r

  // A chunk line may be followed by the "\ No newline at end of file" marker,
  // which is accepted and dropped so that such diffs still parse.
  def lineChange: Parser[LineChange] = (contextLine | addedLine | deletedLine) <~ opt(noNewlineMarker)
  def contextLine: Parser[ContextLine] = " " ~> """.*""".r <~ newline ^^ { l => ContextLine(l) }
  def addedLine: Parser[LineAdded] = "+" ~> """.*""".r <~ newline ^^ { l => LineAdded(l) }
  def deletedLine: Parser[LineRemoved] = "-" ~> """.*""".r <~ newline ^^ { l => LineRemoved(l) }
  def noNewlineMarker: Parser[Any] = """\\ .*""".r ~ opt(newline)

  def newline: Parser[String] = """\n""".r
  def number: Parser[Int] = """\d+""".r ^^ { _.toInt }

  /** Parses a complete diff held in memory; the whole string must be consumed. */
  def parse(str: String): ParseResult[List[GitDiff]] = parseAll(allDiffs, str)

  /** Parses the file named by the first argument, or standard input when no
    * argument is given. Prints the parsed diffs, or fails through `sys.error`
    * with the parser's message.
    */
  def main(args: Array[String]): Unit = {
    val readsFile = args.length != 0
    val reader: java.io.Reader =
      if (readsFile) new java.io.FileReader(args(0))
      else Console.in // read from stdin
    try {
      parseAll(allDiffs, reader) match {
        case Success(s,_) => println( s )
        case NoSuccess(msg,_) => sys.error("ERROR: " + msg)
      }
    } finally {
      // Only a reader opened here is closed; stdin is left to the runtime.
      if (readsFile) reader.close()
    }
  }
}

Stumbled onto this while looking to build a Scala parser for a git diff, as generated by running git diff-tree. This is very similar to unified diff, but it does have a few interesting variants.

I heavily relied on an answer above, and ended up writing the parser included here. It's not strictly what the original poster was after of course, but I figured it could be useful to others.

import util.parsing.combinator._

object GitDiff {
  /** Builds a [[GitDiff]] from the two file names of a `diff --git` header.
    *
    * Both names are cut after their first '/', which removes the leading
    * "a/" or "b/" so that the old and the new name can be compared. A name
    * that contains no '/' at all becomes the empty string.
    *
    * @param files  the (old, new) file names as they appear in the header
    * @param op     the file operation recorded for this diff
    * @param chunks the change chunks of this diff
    */
  def apply(files: (String, String), op: FileOperation, chunks: List[ChangeChunk]): GitDiff = {
    def withoutPrefix(name: String): String = {
      val firstSlash = name.indexOf('/')
      if (firstSlash < 0) "" else name.substring(firstSlash + 1)
    }

    val (oldName, newName) = files
    new GitDiff(withoutPrefix(oldName), withoutPrefix(newName), op, chunks)
  }
}

/** The changes made to one file.
  *
  * @param oldFile name of the file before the change
  * @param newFile name of the file after the change
  * @param op      the file operation recorded for this diff
  * @param chunks  the change chunks of this diff
  */
case class GitDiff(
    oldFile: String,
    newFile: String,
    op: FileOperation,
    chunks: List[ChangeChunk]) {

  /** True when the old and the new file name differ. */
  val isRename: Boolean = !(oldFile == newFile)
}

/** What happened to a file as a whole: it was created, deleted or updated. */
sealed trait FileOperation

/** The file was created; `mode` is the number read from the "new file mode" line. */
case class NewFile(mode: Int)
  extends FileOperation

/** The file was removed; `mode` is the number read from the "deleted file mode" line. */
case class DeletedFile(mode: Int)
  extends FileOperation

/** Neither a creation nor a deletion was announced for the file. */
case object UpdatedFile
  extends FileOperation

/** One line of a change chunk; `line` is the text without its one-character marker. */
sealed trait LineChange {
  def line: String
}

/** A line that is present in both versions of the file. */
case class ContextLine(line: String)
  extends LineChange

/** A line that only exists in the old version of the file. */
case class LineRemoved(line: String)
  extends LineChange

/** A line that only exists in the new version of the file. */
case class LineAdded(line: String)
  extends LineChange

/** The four numbers of a chunk header: offset and length on the old side,
  * then offset and length on the new side.
  */
case class RangeInformation(
    oldOffset: Int,
    oldLength: Int,
    newOffset: Int,
    newLength: Int)

/** A chunk of changes: its range header together with its lines, in order. */
case class ChangeChunk(
    rangeInformation: RangeInformation,
    changeLines: List[LineChange])

// Code taken from http://stackoverflow.com/questions/3560073/how-to-write-parser-for-unified-diff-syntax
/** Parser-combinator grammar for the textual output of `git diff`.
  *
  * The accepted input is one or more file diffs. Each file diff consists of
  * a `diff --git` line, an optional "new file mode" or "deleted file mode"
  * line, an `index` line, the `--- ` and `+++ ` lines and at least one chunk.
  * Every header line must be terminated by a line feed.
  */
object GitDiffParser extends RegexParsers {

  // Leading blanks and line feeds carry meaning in a diff, so the grammar
  // consumes them explicitly instead of letting RegexParsers skip them.
  override def skipWhitespace = false

  /** Entry point of the grammar: one or more consecutive file diffs. */
  def allDiffs: Parser[List[GitDiff]] = rep1(gitDiff)

  def gitDiff: Parser[GitDiff] = filesChanged ~ fileOperation ~ diffChunks ^^ {
    case files ~ op ~ chunks => GitDiff(files, op, chunks)
  }

  def filesChanged: Parser[(String, String)] =
    "diff --git " ~> filename ~ (" " ~> filename) <~ newline ^^ { case f1 ~ f2 => (f1,f2) }

  // Without an explicit "new file"/"deleted file" line the diff is an update.
  def fileOperation: Parser[FileOperation] =
    opt(deletedFileMode | newFileMode) <~ index ^^ { _ getOrElse UpdatedFile }

  def index: Parser[Any] = ( "index " ~ hash ~ ".." ~ hash ) ~> opt(" " ~> mode) <~ newline
  def deletedFileMode: Parser[DeletedFile] = "deleted file mode " ~> mode <~ newline ^^ { m => DeletedFile(m) }
  def newFileMode: Parser[NewFile] = "new file mode " ~> mode <~ newline ^^ { m => NewFile(m) }
  // Seven digits is only the shortest abbreviation git prints; longer
  // abbreviations and full-length hashes have to be accepted as well.
  def hash: Parser[String] = """[0-9a-f]{7,}""".r
  def mode: Parser[Int] = """\d{6}""".r ^^ { _.toInt }

  // The names on the "---" and "+++" lines are parsed but not kept; the names
  // used for the result come from the "diff --git" line.
  def diffChunks: Parser[List[ChangeChunk]] = (oldFile ~ newFile) ~> rep1(changeChunk)

  def oldFile: Parser[String] = "--- " ~> filename <~ newline
  def newFile: Parser[String] = "+++ " ~> filename <~ newline
  def filename: Parser[String] = """[\S]+""".r

  /** One chunk: the range header, an optional section heading, the end of the
    * header line, then the chunk's lines. The heading is text appended to the
    * header line and not a line of the file, so it is dropped instead of
    * being reported as a context line.
    */
  def changeChunk: Parser[ChangeChunk] =
    rangeInformation ~ (opt(sectionHeading) ~> newline ~> rep1(lineChange)) ^^ {
      case ri ~ lines => ChangeChunk(ri, lines)
    }

  /** Range header of a chunk. A length is left out when the range is exactly
    * one line long, so a missing length is read as 1.
    */
  def rangeInformation: Parser[RangeInformation] =
    ("@@ " ~> "-" ~> number) ~ opt("," ~> number) ~ (" +" ~> number) ~ opt("," ~> number) <~ " @@" ^^ {
      case a ~ b ~ c ~ d => RangeInformation(a, b getOrElse 1, c, d getOrElse 1)
    }

  // Free text appended after the closing "@@" of a chunk header.
  def sectionHeading: Parser[String] = """ .*""".r

  // A chunk line may be followed by the "\ No newline at end of file" marker,
  // which is accepted and dropped so that such diffs still parse.
  def lineChange: Parser[LineChange] = (contextLine | addedLine | deletedLine) <~ opt(noNewlineMarker)
  def contextLine: Parser[ContextLine] = " " ~> """.*""".r <~ newline ^^ { l => ContextLine(l) }
  def addedLine: Parser[LineAdded] = "+" ~> """.*""".r <~ newline ^^ { l => LineAdded(l) }
  def deletedLine: Parser[LineRemoved] = "-" ~> """.*""".r <~ newline ^^ { l => LineRemoved(l) }
  def noNewlineMarker: Parser[Any] = """\\ .*""".r ~ opt(newline)

  def newline: Parser[String] = """\n""".r
  def number: Parser[Int] = """\d+""".r ^^ { _.toInt }

  /** Parses a complete diff held in memory; the whole string must be consumed. */
  def parse(str: String): ParseResult[List[GitDiff]] = parseAll(allDiffs, str)

  /** Parses the file named by the first argument, or standard input when no
    * argument is given. Prints the parsed diffs, or fails through `sys.error`
    * with the parser's message.
    */
  def main(args: Array[String]): Unit = {
    val readsFile = args.length != 0
    val reader: java.io.Reader =
      if (readsFile) new java.io.FileReader(args(0))
      else Console.in // read from stdin
    try {
      parseAll(allDiffs, reader) match {
        case Success(s,_) => println( s )
        case NoSuccess(msg,_) => sys.error("ERROR: " + msg)
      }
    } finally {
      // Only a reader opened here is closed; stdin is left to the runtime.
      if (readsFile) reader.close()
    }
  }
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文