f# tail-recursion tail-call-optimization

为什么这个 F# 序列函数不是尾递归？

发布于 2024-11-10 12:50:54 字数 964 浏览 10 评论 0原文

披露：这个问题出现在 FsCheck 中，FsCheck 是我维护的一个 F# 随机测试框架。我有一个解决方案，但我不喜欢它。此外，我不明白这个问题 - 它只是被规避了。

（单子，如果我们要使用大词的话）序列的一个相当标准的实现是：

let sequence l = 
    let k m m' = gen { let! x = m
                       let! xs = m'
                       return (x::xs) }
    List.foldBack k l (gen { return [] })

其中 gen 可以被选择的计算构建器替换。不幸的是，该实现会消耗堆栈空间，因此如果列表足够长，最终堆栈会溢出。问题是：为什么？我知道原则上，foldBack 不是尾递归，但 F# 团队的聪明小兔子在 FoldBack 实现中规避了这一点。计算构建器的实现是否存在问题？

如果我将实现更改为以下内容，一切都很好：

let sequence l =
    let rec go gs acc size r0 = 
        match gs with
        | [] -> List.rev acc
        | (Gen g)::gs' ->
            let r1,r2 = split r0
            let y = g size r1
            go gs' (y::acc) size r2
    Gen(fun n r -> go l [] n r)

为了完整起见，可以找到 Gen 类型和计算构建器 FsCheck 源代码

原文

Disclosure: this came up in FsCheck, an F# random testing framework I maintain. I have a solution, but I do not like it. Moreover, I do not understand the problem - it was merely circumvented.

A fairly standard implementation of (monadic, if we're going to use big words) sequence is:

let sequence l = 
    let k m m' = gen { let! x = m
                       let! xs = m'
                       return (x::xs) }
    List.foldBack k l (gen { return [] })

Where gen can be replaced by a computation builder of choice. Unfortunately, that implementation consumes stack space, and so eventually stack overflows if the list is long enough.The question is: why? I know in principle foldBack is not tail recursive, but the clever bunnies of the F# team have circumvented that in the foldBack implementation. Is there a problem in the computation builder implementation?

If I change the implementation to the below, everything is fine:

let sequence l =
    let rec go gs acc size r0 = 
        match gs with
        | [] -> List.rev acc
        | (Gen g)::gs' ->
            let r1,r2 = split r0
            let y = g size r1
            go gs' (y::acc) size r2
    Gen(fun n r -> go l [] n r)

For completeness, the Gen type and computation builder can be found in the FsCheck source

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

下雨或天晴 2024-11-17 12:50:54

在托马斯的答案的基础上，让我们定义两个模块：

module Kurt = 
    type Gen<'a> = Gen of (int -> 'a)

    let unit x = Gen (fun _ -> x)

    let bind k (Gen m) =     
        Gen (fun n ->       
            let (Gen m') = k (m n)       
            m' n)

    type GenBuilder() =
        member x.Return(v) = unit v
        member x.Bind(v,f) = bind f v

    let gen = GenBuilder()


module Tomas =
    type Gen<'a> = Gen of (int -> ('a -> unit) -> unit)

    let unit x = Gen (fun _ f -> f x)

    let bind k (Gen m) =     
        Gen (fun n f ->       
            m n (fun r ->         
                let (Gen m') = k r        
                m' n f))

    type GenBuilder() =
        member x.Return v = unit v
        member x.Bind(v,f) = bind f v

    let gen = GenBuilder()

为了稍微简化一下，让我们将原始序列函数重写为

let rec sequence = function
| [] -> gen { return [] }
| m::ms -> gen {
    let! x = m
    let! xs = sequence ms
    return x::xs }

现在，sequence [for i in 1 .. 100000 ->无论 sequence 是根据 Kurt.gen 还是 Tomas.gen 定义的，unit i] 都会运行完成。问题不在于 sequence 在使用您的定义时导致堆栈溢出，而是从调用 sequence 返回的函数在它被称为。

要了解为什么会这样，让我们根据底层单子操作来扩展 sequence 的定义：

let rec sequence = function
| [] -> unit []
| m::ms ->
    bind (fun x -> bind (fun xs -> unit (x::xs)) (sequence ms)) m

内联 Kurt.unit 和 Kurt.bind > 值并像疯狂一样简化，我们得到

let rec sequence = function
| [] -> Kurt.Gen(fun _ -> [])
| (Kurt.Gen m)::ms ->
    Kurt.Gen(fun n ->
            let (Kurt.Gen ms') = sequence ms
            (m n)::(ms' n))

现在希望清楚为什么调用 let (Kurt.Gen f) =equence [for i in 1 .. 1000000 ->; f 0 中的单元 i] 溢出堆栈：f 需要对结果函数进行非尾递归调用和求值，因此每次递归调用都会有一个堆栈帧。

相反，将 Tomas.unit 和 Tomas.bind 内联到 sequence 的定义中，我们得到以下简化版本：

let rec sequence = function
| [] -> Tomas.Gen (fun _ f -> f [])
| (Tomas.Gen m)::ms ->
    Tomas.Gen(fun n f ->  
        m n (fun r ->
            let (Tomas.Gen ms') = sequence ms
            ms' n (fun rs ->  f (r::rs))))

关于此变体的推理很棘手。您可以凭经验验证，对于某些任意大的输入，它不会破坏堆栈（正如托马斯在他的回答中所示），并且您可以逐步进行评估以说服自己相信这一事实。但是，堆栈消耗取决于传入列表中的 Gen 实例，并且对于本身不是尾递归的输入可能会破坏堆栈：

// ok
let (Tomas.Gen f) = sequence [for i in 1 .. 1000000 -> unit i]
f 0 (fun list -> printfn "%i" list.Length)

// not ok...
let (Tomas.Gen f) = sequence [for i in 1 .. 1000000 -> Gen(fun _ f -> f i; printfn "%i" i)]
f 0 (fun list -> printfn "%i" list.Length)

Building on Tomas's answer, let's define two modules:

module Kurt = 
    type Gen<'a> = Gen of (int -> 'a)

    let unit x = Gen (fun _ -> x)

    let bind k (Gen m) =     
        Gen (fun n ->       
            let (Gen m') = k (m n)       
            m' n)

    type GenBuilder() =
        member x.Return(v) = unit v
        member x.Bind(v,f) = bind f v

    let gen = GenBuilder()


module Tomas =
    type Gen<'a> = Gen of (int -> ('a -> unit) -> unit)

    let unit x = Gen (fun _ f -> f x)

    let bind k (Gen m) =     
        Gen (fun n f ->       
            m n (fun r ->         
                let (Gen m') = k r        
                m' n f))

    type GenBuilder() =
        member x.Return v = unit v
        member x.Bind(v,f) = bind f v

    let gen = GenBuilder()

To simplify things a bit, let's rewrite your original sequence function as

let rec sequence = function
| [] -> gen { return [] }
| m::ms -> gen {
    let! x = m
    let! xs = sequence ms
    return x::xs }

Now, sequence [for i in 1 .. 100000 -> unit i] will run to completion regardless of whether sequence is defined in terms of Kurt.gen or Tomas.gen. The issue is not that sequence causes a stack overflow when using your definitions, it's that the function returned from the call to sequence causes a stack overflow when it is called.

To see why this is so, let's expand the definition of sequence in terms of the underlying monadic operations:

let rec sequence = function
| [] -> unit []
| m::ms ->
    bind (fun x -> bind (fun xs -> unit (x::xs)) (sequence ms)) m

Inlining the Kurt.unit and Kurt.bind values and simplifying like crazy, we get

let rec sequence = function
| [] -> Kurt.Gen(fun _ -> [])
| (Kurt.Gen m)::ms ->
    Kurt.Gen(fun n ->
            let (Kurt.Gen ms') = sequence ms
            (m n)::(ms' n))

Now it's hopefully clear why calling let (Kurt.Gen f) = sequence [for i in 1 .. 1000000 -> unit i] in f 0 overflows the stack: f requires a non-tail-recursive call to sequence and evaluation of the resulting function, so there will be one stack frame for each recursive call.

Inlining Tomas.unit and Tomas.bind into the definition of sequence instead, we get the following simplified version:

let rec sequence = function
| [] -> Tomas.Gen (fun _ f -> f [])
| (Tomas.Gen m)::ms ->
    Tomas.Gen(fun n f ->  
        m n (fun r ->
            let (Tomas.Gen ms') = sequence ms
            ms' n (fun rs ->  f (r::rs))))

Reasoning about this variant is tricky. You can empirically verify that it won't blow the stack for some arbitrarily large inputs (as Tomas shows in his answer), and you can step through the evaluation to convince yourself of this fact. However, the stack consumption depends on the Gen instances in the list that's passed in, and it is possible to blow the stack for inputs that aren't themselves tail recursive:

// ok
let (Tomas.Gen f) = sequence [for i in 1 .. 1000000 -> unit i]
f 0 (fun list -> printfn "%i" list.Length)

// not ok...
let (Tomas.Gen f) = sequence [for i in 1 .. 1000000 -> Gen(fun _ f -> f i; printfn "%i" i)]
f 0 (fun list -> printfn "%i" list.Length)

回复收藏 0 原文

埖埖迣鎅 2024-11-17 12:50:54

你是对的 - 出现堆栈溢出的原因是 monad 的 bind 操作需要尾递归（因为它用于在折叠期间聚合值）。

FsCheck 中使用的 monad 本质上是一个状态 monad（它保留当前生成器和一些数字）。我对其进行了一些简化，得到如下结果：

type Gen<'a> = Gen of (int -> 'a)

let unit x = Gen (fun n -> x)

let bind k (Gen m) = 
    Gen (fun n -> 
      let (Gen m') = k (m n) 
      m' n)

这里，bind 函数不是尾递归的，因为它调用 k，然后执行更多工作。您可以将 monad 更改为延续 monad。它被实现为一个函数，该函数接受状态和一个延续 - 一个以结果作为参数调用的函数。对于这个 monad，您可以使 bind 尾部递归：

type Gen<'a> = Gen of (int -> ('a -> unit) -> unit)

let unit x = Gen (fun n f -> f x)

let bind k (Gen m) = 
    Gen (fun n f -> 
      m n (fun r -> 
        let (Gen m') = k r
        m' n f))

以下示例不会堆栈溢出（原始实现也是如此）：

let sequence l = 
  let k m m' = 
    m |> bind (fun x ->
      m' |> bind (fun xs -> 
        unit (x::xs)))
  List.foldBack k l (unit [])

let (Gen f) = sequence [ for i in 1 .. 100000 -> unit i ]
f 0 (fun list -> printfn "%d" list.Length)

You're correct - the reason why you're getting a stack overflow is that the bind operation of the monad needs to be tail-recursive (because it is used to aggregate values during folding).

The monad used in FsCheck is essentially a state monad (it keeps the current generator and some number). I simplified it a bit and got something like:

type Gen<'a> = Gen of (int -> 'a)

let unit x = Gen (fun n -> x)

let bind k (Gen m) = 
    Gen (fun n -> 
      let (Gen m') = k (m n) 
      m' n)

Here, the bind function is not tail-recursive because it calls k and then does some more work. You can change the monad to be a continuation monad. It is implemented as a function that takes the state and a continuation - a function that is called with the result as an argument. For this monad, you can make bind tail recursive:

type Gen<'a> = Gen of (int -> ('a -> unit) -> unit)

let unit x = Gen (fun n f -> f x)

let bind k (Gen m) = 
    Gen (fun n f -> 
      m n (fun r -> 
        let (Gen m') = k r
        m' n f))

The following example will not stack overflow (and it did with the original implementation):

let sequence l = 
  let k m m' = 
    m |> bind (fun x ->
      m' |> bind (fun xs -> 
        unit (x::xs)))
  List.foldBack k l (unit [])

let (Gen f) = sequence [ for i in 1 .. 100000 -> unit i ]
f 0 (fun list -> printfn "%d" list.Length)

回复收藏 0 原文

~没有更多了~