8.4 map

发布于 2024-10-12 19:16:08 字数 18228 浏览 0 评论 0 收藏 0

哈希表实现。

结构上用一个头结构（hmap）存储基本信息。

最重要的是引用 buckets 数组。这些桶（bmap）存储具体 key/value 数据。

         +--------------+
         | hmap         |
         +--------------+
         |   buckets    |   header
         +--------------+
                |
                |
                v
         +--------------+---//---+------+-------------+
         | bmap         |   ...  | bmap | prealloc... |   bucket array
         +--------------+---//---+------+-------------+
         |   [8]tophash |
         +--------------+
         |   [8]key     |
         +--------------+
         |   [8]value   |
         +--------------+
         |   overflow   |
         +--------------+
                |
                |
                v
         +--------------+
         | bmap         |
         +--------------+

Maximum average load of a bucket that triggers growth is 6.5.

// map.go

// A header for a Go map.
type hmap struct {
    count      int             // 键值对数量（len）。
    flags      uint8           // 状态标记。
    B          uint8           // 桶数量（2^B，可容纳的键值对数量 loadFactor * 2^B）
    noverflow  uint16          // 溢出桶数量（创建溢出桶时累加）。
    hash0      uint32          // 哈希种子。
    
    buckets    unsafe.Pointer  // 桶数组地址。
    oldbuckets unsafe.Pointer  // 旧桶数组地址，用于扩容。
    nevacuate  uintptr         // 顺序迁移进度，低于此值的已安置。
    
    extra      *mapextra       // optional fields
}

type mapextra struct {
    nextOverflow   *bmap       // 预分配桶。
}

每个桶内存储 8 个键值对。
如果 key 和 value 数据长度不超过 128，直接在桶内存储。
否则会在堆分配内存，间接（indirect）存储。
数组 tophash 内存储 key 哈希高位值，用于访问时快速匹配。
只有 tophash 值相等，才继续匹配具体 key。
判断 tophash 可知对应 key/value 是否为空位。
根据键值长度，动态分配键值对存储分配。

// A bucket for a Go map.
const (
    bucketCntBits = 3
    bucketCnt     = 1 << bucketCntBits
)

type bmap struct {
    
    // tophash generally contains the top byte of the hash value      +-----------+
    // for each key in this bucket. If tophash[0] < minTopHash,       | tophash   |
    // tophash[0] is a bucket evacuation state instead.               +-----------+
                                                                      | [8]keys   |
    tophash [bucketCnt]uint8                                          +-----------+
                                                                      | [8]values |
    // Followed by bucketCnt keys and then bucketCnt elems.           +-----------+
    // Followed by an overflow pointer.                               | overflow  |
}                                                                     +-----------+

创建

计算出合理大小，并创建桶数组。

如果 make(..., 100) ，那么计算出来的 B = 4 ，可容纳 6.5 * 2^4 = 104 个键值对。
以 map[string]int 为例，那么每个桶大小是 tophash_8 + key_str_16 * 8 + value_int_8 * 8 + overflow_8 = 208 。
使用 make(..., hint) 预分配空间，有助于减少扩容，提升性能。

// makemap implements Go map creation for make(map[k]v, hint).
// If the compiler has determined that the map or the first bucket
// can be created on the stack, h and/or bucket may be non-nil.
// If h != nil, the map can be created directly in h.
// If h.buckets != nil, bucket pointed to can be used as the first bucket.

func makemap(t *maptype, hint int, h *hmap) *hmap {
 
    // 创建头，初始化。
    if h == nil {
        h = new(hmap)
    }
    
    h.hash0 = fastrand()
    
    // 根据 hint 计算 B 指数。
    B := uint8(0)
    for overLoadFactor(hint, B) {
        B++
    }
    
    h.B = B
    
    if h.B != 0 {
        var nextOverflow *bmap
        
        // 分配内存，创建桶数组。
        h.buckets, nextOverflow = makeBucketArray(t, h.B, nil)   // 参数 nil，分配内存。
   
        // 为减少内存分配操作，分配的桶数量可能超过预期。
        // 将其存放起来，做为后面溢出桶的预分配。
        if nextOverflow != nil {
            h.extra = new(mapextra)
            h.extra.nextOverflow = nextOverflow
        }
    }
    
    return h
}

func makeBucketArray(t *maptype, b uint8, dirtyalloc unsafe.Pointer) (buckets unsafe.Pointer, nextOverflow *bmap) {

    base := bucketShift(b)  // 基本数量。
    nbuckets := base        // 实际分配数量（含溢出桶）。
    
    // 如果超过 2^4，额外分配 2^(B - 4) 个溢出桶。
    if b >= 4 {
        nbuckets += bucketShift(b - 4)
        sz := t.bucket.size * nbuckets
        
        up := roundupsize(sz)
        if up != sz {
            nbuckets = up / t.bucket.size
        }
    }
    
    // 新分配，或清理重用内存。
    if dirtyalloc == nil {
        buckets = newarray(t.bucket, int(nbuckets))
    } else {
        
        // dirtyalloc was previously generated by
        // the above newarray(t.bucket, int(nbuckets))
        // but may not be empty.
        
        buckets = dirtyalloc
        size := t.bucket.size * nbuckets
        
        if t.bucket.ptrdata != 0 {
            memclrHasPointers(buckets, size)
        } else {
            memclrNoHeapPointers(buckets, size)
        }
    }
    
    if base != nbuckets {
        // 溢出位置。
        nextOverflow = (*bmap)(add(buckets, base*uintptr(t.bucketsize)))   
        
        // 在最后一个预分配的溢出桶（bmap.overflow）设定结束标记，表示预分配资源用尽。
        // 当然，在使用（newoverflow）最后一个桶时，需要清除掉该标记（setoverflow nil）。
        last := (*bmap)(add(buckets, (nbuckets-1) * uintptr(t.bucketsize)))
        last.setoverflow(t, (*bmap)(buckets))
    }
    
    return buckets, nextOverflow
}

hmap.newoverflow: 获取预分配溢出桶（nextOverflow），或新建（newobject）。
bmap.overflow, setoverflow: 返回或设置 bmap 最后一个存放溢出桶指针的字段内容。

访问（读）

计算查找 key 哈希值，以此找到对应桶（bmap，bucket）。

1. 先用哈希高位值在 bmap.tophash 数组里匹配。（整数要比直接匹配 key 高效）

2. 如果 tophash 匹配成功，需要进一步检查 key 是否相等。（有 tophash 过滤，就不用每次匹配 key）

3. 根据索引计算 value 地址，并返回。

mapaccess2 对应 v, ok := map[key] 。

// mapaccess1 returns a pointer to h[key].  Never returns nil, instead
// it will return a reference to the zero object for the value type if
// the key is not in the map.

func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 
    // 允许访问 nil map，返回零值。
    if h == nil || h.count == 0 {
        return unsafe.Pointer(&zeroVal[0])
    }
    
    // 计算 key 哈希值，根据公式找到对应的桶。
    hash := t.hasher(key, uintptr(h.hash0))
    m := bucketMask(h.B)
    b := (*bmap)(add(h.buckets, (hash & m) * uintptr(t.bucketsize)))
    
    // 扩容迁移是分多次完成的。因此会有某些数据暂时留在 oldbuckets，先从此查找。
    if c := h.oldbuckets; c != nil {
        
        if !h.sameSizeGrow() {
            // There used to be half as many buckets; mask down one more power of two.
            m >>= 1
        }
        
        oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
        
        // 检查是否已经完成迁移。
        if !evacuated(oldb) {
            b = oldb
        }
    }
    
    // 计算 key 哈值高位值。
    top := tophash(hash)
    
bucketloop:
    
    // 遍历该桶，以及溢出桶链表。
    // 方法 overflow 通过 bmap.overflow 存储的地址找到链表下一个溢出桶。
    for ; b != nil; b = b.overflow(t) {
        
        // 遍历桶内的 tophash 数组，初步匹配。
        for i := uintptr(0); i < bucketCnt; i++ {
            
            // 不相等，检查下一个。
            if b.tophash[i] != top {
                // 后面都是空的。
                if b.tophash[i] == emptyRest {   
                    break bucketloop
                }
                continue
            }
            
            // 取出存储的 key，判断是否相等。
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            if t.indirectkey() {
                k = *((*unsafe.Pointer)(k))
            }
            
            if t.key.equal(key, k) {
                // 返回对应 value 地址。
                e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
                if t.indirectelem() {
                    e = *((*unsafe.Pointer)(e))
                }
                
                return e
            }
        }
    }
    
    return unsafe.Pointer(&zeroVal[0])
}

分配（写）

分插入和更新操作两种。

1. 如果是更新操作，只需遍历找到并返回 value 地址即可。

2. 如果桶有空位，则存入 tophash 和 key，然后返回 value 地址。

3. 如果没有空位，新建溢出桶。然后使用该桶第一个空位。

// Like mapaccess, but allocates a slot for the key if it is not present in the map.

func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
 
    // 不能对 nil map 写。
    if h == nil {
        panic(plainError("assignment to entry in nil map"))
    }
    
    // 计算 key 哈希值。
    hash := t.hasher(key, uintptr(h.hash0))
    
    // 如果没有桶，新建。
    if h.buckets == nil {
        h.buckets = newobject(t.bucket)     // newarray(t.bucket, 1)
    }
    
again:
    
    // 扩容，迁移。
    bucket := hash & bucketMask(h.B)
    if h.growing() {
        growWork(t, h, bucket)
    }
    
    // 找到对应的桶。
    b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
    
    // 计算哈希高位值。
    top := tophash(hash)
    
bucketloop:
    
    for {
        // 桶内匹配，查找存放位置。
        for i := uintptr(0); i < bucketCnt; i++ {
            
            // 如果和 tophash 不等。
            if b.tophash[i] != top {
                
                // 如果该位置为空，计算插入索引、key/value 存储地址。
                // 对 inserti 赋值，决定了只找第一个插入位置。
                if isEmpty(b.tophash[i]) && inserti == nil {
                    inserti = &b.tophash[i]
                    insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
                    elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
                }
                
                // 后面都是空的。
                if b.tophash[i] == emptyRest {
                    break bucketloop
                }
                
                // 继续循环，优先处理 value 更新操作。
                continue
            }
            
            // 如果相等，继续匹配 key，可能是更新操作。
            // 返回 value 地址，供写入新值。
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            if t.indirectkey() {
                k = *((*unsafe.Pointer)(k))
            }
            
            if !t.key.equal(key, k) {
                continue
            }
            
            if t.needkeyupdate() {
                typedmemmove(t.key, k, key)
            }
            
            elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
   
            // 更新操作无需更多处理。
            goto done
        }
        
        // 如果当前桶没有匹配，则继续检查溢出桶。
        ovf := b.overflow(t)
        if ovf == nil {
            break
        }
        
        b = ovf
    }
    
    // 如果负载过大，或者有太多溢出桶，则进行扩容。
    if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
        hashGrow(t, h)
        goto again       // Growing the table invalidates everything, so try again
    }
    
    // 如果没找到插入或更新位置，则意味着需要建立新的溢出桶。
    // 优先使用预分配的溢出桶。
    if inserti == nil {
        // all current buckets are full, allocate a new one.
        newb := h.newoverflow(t, b)
        inserti = &newb.tophash[0]
        insertk = add(unsafe.Pointer(newb), dataOffset)
        elem = add(insertk, bucketCnt*uintptr(t.keysize))
    }
    
    // 存储 key/value。（间接需要额外分配内存，存储指针）
    if t.indirectkey() {
        kmem := newobject(t.key)
        *(*unsafe.Pointer)(insertk) = kmem
        insertk = kmem
    }
    
    if indirectelem() {
        vmem := newobject(t.elem)
        *(*unsafe.Pointer)(elem) = vmem
    }
    
    typedmemmove(t.key, insertk, key)

    // 更新 tophash 和键值对数量。（包括插入操作）
    *inserti = top
    h.count++
    
done:
    
    h.flags &^= hashWriting
    if t.indirectelem() {
        elem = *((*unsafe.Pointer)(elem))
    }
    
    return elem
}

删除

删除操作不会收缩内存，只是清理内容。

func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {

    // 对 nil map 删除，直接返回。
    if h == nil || h.count == 0 {
        return
    }
    
    // 计算 key 哈希值。
    hash := t.hasher(key, uintptr(h.hash0))
    
    // 扩容，迁移。
    bucket := hash & bucketMask(h.B)
    if h.growing() {
        growWork(t, h, bucket)
    }
    
    // 找到对应桶，计算哈希高位值。
    b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
    bOrig := b
    top := tophash(hash)
    
search:
    
    // 遍历桶链表。
    for ; b != nil; b = b.overflow(t) {
        
        // 桶内遍历。
        for i := uintptr(0); i < bucketCnt; i++ {
            
            // tophash 不等，跳过。
            if b.tophash[i] != top {
                // 后面都是空的。
                if b.tophash[i] == emptyRest {
                    break search
                }
                
                continue
            }
       
            // 匹配 key，不等就跳过。
            k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
            k2 := k
            
            if t.indirectkey() {
                k2 = *((*unsafe.Pointer)(k2))
            }
            
            if !t.key.equal(key, k2) {
                continue
            }
            
            // 相等，清除 key 内容。
            if t.indirectkey() {
                *(*unsafe.Pointer)(k) = nil
            } else if t.key.ptrdata != 0 {
                memclrHasPointers(k, t.key.size)
            }
            
            // 清除 value 内容。
            e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
            if t.indirectelem() {
                *(*unsafe.Pointer)(e) = nil
            } else if t.elem.ptrdata != 0 {
                memclrHasPointers(e, t.elem.size)
            } else {
                memclrNoHeapPointers(e, t.elem.size)
            }
            
            // 清除 tophash 内容，减少计数。
            b.tophash[i] = emptyOne
            
            // 尝试标记停止位（emptyRest），减少检查遍历次数。
            if i == bucketCnt-1 {
                // 后面有溢出桶，无法设置。
                if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest {
                    goto notLast
                }
            } else {
                // 后面不是 emptyRest，没法设置。
                if b.tophash[i+1] != emptyRest {
                    goto notLast
                }
            }
            
            for {   // 往前推进。
                b.tophash[i] = emptyRest
                
                if i == 0 {
                    if b == bOrig {
                        break // beginning of initial bucket, we're done.
                    }
                    // Find previous bucket, continue at its last entry.
                    c := b
                    for b = bOrig; b.overflow(t) != c; b = b.overflow(t) {
                    }
                    i = bucketCnt - 1
                } else {
                    i--
                }
                
                if b.tophash[i] != emptyOne {
                    break
                }
            }
            
        notLast:
            
            h.count--
            
            // 跳出循环。
            break search
        }
    }
}

清空（clear）也只重置内存，不会收缩。
当 range delete 时，编译器会将其优化成清空操作。

// mapclear deletes all keys from a map.

func mapclear(t *maptype, h *hmap) {
 
    if h == nil || h.count == 0 {
        return
    }
    
    // 重置属性。
    h.flags &^= sameSizeGrow
    h.oldbuckets = nil
    h.nevacuate = 0
    h.noverflow = 0
    h.count = 0
 
    if h.extra != nil {
        *h.extra = mapextra{}
    }
    
    // 清理桶数组内存。
    _, nextOverflow := makeBucketArray(t, h.B, h.buckets)    // 参数 dirtyalloc 非 nil，仅清理内存。
    if nextOverflow != nil {
        // If overflow buckets are created then h.extra
        // will have been allocated during initial bucket creation.
        h.extra.nextOverflow = nextOverflow
    }
}

扩容

在写操作（mapassign）里会判断是否需要扩容。

基本是 2x 容量。

// growing reports whether h is growing. The growth may be to the same size or bigger.

func (h *hmap) growing() bool {
    return h.oldbuckets != nil
}

func hashGrow(t *maptype, h *hmap) {

    // 检查是否达到负载系数。
    // 超过，则增加 B 指数，横向扩大桶数组。
    bigger := uint8(1)
    if !overLoadFactor(h.count+1, h.B) {
        bigger = 0
        h.flags |= sameSizeGrow
    }
    
    // 将现有桶数组转移到 oldbuckets，创建新数组。
    oldbuckets := h.buckets
    newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger, nil)
    
    h.B += bigger
    h.flags = flags
    h.oldbuckets = oldbuckets
    h.buckets = newbuckets
    h.nevacuate = 0
    h.noverflow = 0
    
    // the actual copying of the hash table data is done incrementally
    // by growWork() and evacuate().
}

接下来，在写和删除操作时，都会对目标桶执行迁移操作。

迁移是按桶分次完成的，避免一次性操作耗时过长。
该函数执行两次迁移操作：当前桶（随机）和顺序迁移。
顺序迁移每次也只处理一个桶，但保证了所有桶都可被迁移。

func growWork(t *maptype, h *hmap, bucket uintptr) {
   
    // 迁移桶（计算旧桶索引）。
    evacuate(t, h, bucket&h.oldbucketmask())
    
    // 顺序迁移。
    if h.growing() {
        evacuate(t, h, h.nevacuate)
    }
}

func evacuate(t *maptype, h *hmap, oldbucket uintptr) {

    // 旧桶，旧桶数组大小。
    b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)))
    newbit := h.noldbuckets()
    
    // 如果该桶还没有迁移。
    if !evacuated(b) {
        // ... 迁移操作 ...
    }
    
    // 获取下一顺序迁移序号。
    // 当 growWork 执行顺序迁移时，参数 oldbucket == nevacuate。
    if oldbucket == h.nevacuate {
        advanceEvacuationMark(h, t, newbit)
    }
}

如果全部迁移完毕，释放旧桶。

func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) {

    // 累加顺序号。
    h.nevacuate++
    
    // 分批最大边界。
    stop := h.nevacuate + 1024
    if stop > newbit {
        stop = newbit
    }
    
    // 顺序检查下个桶是否已被迁移。
    // 确定实际顺序号（如果超出，则表示全部完成）。
    for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) {
        h.nevacuate++
    }
    
    // 如果所有旧桶迁移完毕，释放旧桶。
    // 不会再引发迁移操作。
    if h.nevacuate == newbit {          // newbit == # of oldbuckets
        h.oldbuckets = nil
    }
}

func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool {
    b := (*bmap)(add(h.oldbuckets, bucket*uintptr(t.bucketsize)))
    return evacuated(b)
}

分享到QQ

分享到微博