8.4 字典

发布于 2024-10-12 19:16:01 字数 18950 浏览 0 评论 0 收藏 0

哈希表实现。头结构 hmap 存储基本信息，引用 buckets 数组。桶 bmap 存储具体 k/v 数据。

         +--------------+
         | hmap         |
         +--------------+
         |   buckets    |   header
         +--------------+
                |
                |
                v
         +--------------+---//---+------+-------------+
         | bmap         |   ...  | bmap | prealloc... |   bucket array
         +--------------+---//---+------+-------------+
         |   [8]tophash |
         +--------------+
         |   [8]key     |
         +--------------+
         |   [8]value   |
         +--------------+
         |   overflow   |
         +--------------+
                |
                |
                v
         +--------------+
         | bmap         |
         +--------------+
                |
                |
                v
         +--------------+
         | bmap         |
         +--------------+

// map.go

// A header for a Go map.
type hmap struct {
	count     int              // 键值对数量（len）。
	flags     uint8            // 状态标记。
	B         uint8            // 桶数量（2^B，可容纳的键值对数量 loadFactor * 2^B）
	noverflow uint16           // 溢出桶数量（创建溢出桶时累加）。
	hash0     uint32           // 哈希种子。

	buckets    unsafe.Pointer  // 桶数组地址。
	oldbuckets unsafe.Pointer  // 旧桶数组地址，用于扩容。
	nevacuate  uintptr         // 顺序迁移进度，低于此值的已安置。

	extra *mapextra // optional fields
}

type mapextra struct {

	// nextOverflow holds a pointer to a free overflow bucket.
	nextOverflow *bmap
}

每个桶（bmap）存储 8 个键值对。如果 key 和 value 数据长度不超过 128，直接桶内存储。否则分配堆内存，间接存储（indirect）其指针。

数组 tophash 内存储 key 哈希高位值，用于访问时快速匹配。只有 tophash 值相等，才继续匹配具体 key。判断 tophash 可知对应 key/value 是否为空位。

const (
    
	// Maximum number of key/elem pairs a bucket can hold.
	bucketCntBits = 3
	bucketCnt     = 1 << bucketCntBits   // 8
    
	// Maximum key or elem size to keep inline (instead of mallocing per element).
	maxKeySize  = 128
	maxElemSize = 128    
)

// A bucket for a Go map.
type bmap struct {
    
	// tophash generally contains the top byte of the hash value +------------+
	// for each key in this bucket. If tophash[0] < minTopHash,  | [8]tophash |
	// tophash[0] is a bucket evacuation state instead.          +------------+
                                                                 | [8]keys    |
	tophash [bucketCnt]uint8                                     +------------+
                                                                 | [8]values  |
	// Followed by bucketCnt keys and then bucketCnt elems.      +------------+
	// Followed by an overflow pointer.                          | overflow   |
}                                                                +------------+

相关状态。emptyRest 可减少查找次数。

const (
	emptyRest = 0 // this cell is empty, and there are no more non-empty cells at higher indexes or overflows.
	emptyOne  = 1 // this cell is empty
)

创建

计算出合理大小，并创建桶数组。

// makemap implements Go map creation for make(map[k]v, hint).
// If the compiler has determined that the map or the first bucket
// can be created on the stack, h and/or bucket may be non-nil.
// If h != nil, the map can be created directly in h.
// If h.buckets != nil, bucket pointed to can be used as the first bucket.

func makemap(t *maptype, hint int, h *hmap) *hmap {
    
    // make(t, hint), hint 是期望存储的容量。
    
	// 新建头部结构。
	if h == nil {
		h = new(hmap)
	}
    
    // 哈希种子。
	h.hash0 = fastrand()

	// Find the size parameter B which will hold the requested # of elements.
	// For hint < 0 overLoadFactor returns false since hint < bucketCnt.

    // 根据 hint 计算 B 指数。（哈希表的特点，实际容量超出期望）
	B := uint8(0)
	for overLoadFactor(hint, B) {
		B++
	}
	h.B = B


	// allocate initial hash table
	// if B == 0, the buckets field is allocated lazily later (in mapassign)
	// If hint is large zeroing this memory could take a while.
    
	if h.B != 0 {
		var nextOverflow *bmap
        
        // 分配内存，创建桶数组。
		h.buckets, nextOverflow = makeBucketArray(t, h.B, nil)
        
        // 为减少内存分配操作，分配的桶数量可能超过预期。
        // 将其存放起来，做为后面溢出桶的预分配。
		if nextOverflow != nil {
			h.extra = new(mapextra)
			h.extra.nextOverflow = nextOverflow
		}
	}

	return h
}

// makeBucketArray initializes a backing array for map buckets.
// 1<<b is the minimum number of buckets to allocate.
// dirtyalloc should either be nil or a bucket array previously
// allocated by makeBucketArray with the same t and b parameters.
// If dirtyalloc is nil a new backing array will be alloced and
// otherwise dirtyalloc will be cleared and reused as backing array.

func makeBucketArray(t *maptype, b uint8, dirtyalloc unsafe.Pointer) (buckets unsafe.Pointer, nextOverflow *bmap) {

    base := bucketShift(b)  // 按指数计算基本数量。
	nbuckets := base        // 实际分配数量（含溢出桶）。
    
    // 是否额外分配溢出桶。
	if b >= 4 {
		nbuckets += bucketShift(b - 4)
		sz := t.bucket.size * nbuckets
		up := roundupsize(sz)
		if up != sz {
			nbuckets = up / t.bucket.size
		}
	}

    // 新分配，或清理重用内存。
	if dirtyalloc == nil {
		buckets = newarray(t.bucket, int(nbuckets))
	} else {
		buckets = dirtyalloc
		size := t.bucket.size * nbuckets
        
		if t.bucket.ptrdata != 0 {
			memclrHasPointers(buckets, size)
		} else {
			memclrNoHeapPointers(buckets, size)
		}
	}

	if base != nbuckets {
        
		// We preallocated some overflow buckets.
		// To keep the overhead of tracking these overflow buckets to a minimum,
		// we use the convention that if a preallocated overflow bucket's overflow
		// pointer is nil, then there are more available by bumping the pointer.
		// We need a safe non-nil pointer for the last overflow bucket; just use buckets.
        
        // 溢出位置。
		nextOverflow = (*bmap)(add(buckets, base*uintptr(t.bucketsize)))
        
        // 在最后一个预分配溢出桶（bmap.overflow）设定结束标记，表示预分配资源用尽。
        // 当然，在使用（newoverflow）最后一个桶时，需清除该标记（setoverflow nil）。
		last := (*bmap)(add(buckets, (nbuckets-1)*uintptr(t.bucketsize)))
		last.setoverflow(t, (*bmap)(buckets))
	}
    
	return buckets, nextOverflow
}

访问

通常指读操作。计算查找 key 哈希值，以此找到对应桶（bmap）。

用哈希高位值在 bmap.tophash 数组里匹配。（整数要比直接匹配 key 高效）
如 tophash 匹配成功，再检查 key 是否相等。（有 tophash 过滤，就不用每次匹配 key）
根据索引计算 value 地址，并返回。

// mapaccess1 returns a pointer to h[key].  Never returns nil, instead
// it will return a reference to the zero object for the elem type if
// the key is not in the map.

func mapaccess1(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    
    // 允许访问 nil map，返回零值。
	if h == nil || h.count == 0 {
		return unsafe.Pointer(&zeroVal[0])
	}
    
    // 计算 key 哈希值，根据公式找到对应的桶。
	hash := t.hasher(key, uintptr(h.hash0))
	m := bucketMask(h.B)
	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
    
    // 扩容迁移是分多次完成的。某些数据暂时留在 oldbuckets，先从此查找。
	if c := h.oldbuckets; c != nil {
		if !h.sameSizeGrow() {
			// There used to be half as many buckets; mask down one more power of two.
			m >>= 1
		}
        
		oldb := (*bmap)(add(c, (hash&m)*uintptr(t.bucketsize)))
        
        // 检查是否已经完成迁移。
		if !evacuated(oldb) {
			b = oldb
		}
	}
    
    // 计算 key 哈值高位值。
	top := tophash(hash)
    
bucketloop:
    
    // 遍历该桶，包括溢出桶链表。
    // 方法 overflow 通过 bmap.overflow 存储的地址找到链表下一个溢出桶。
	for ; b != nil; b = b.overflow(t) {
        
        // 遍历桶内的 tophash 数组，初步匹配。
		for i := uintptr(0); i < bucketCnt; i++ {
            
            // 不相等，检查下一个。
			if b.tophash[i] != top {
                
                // 后面都是空的。
				if b.tophash[i] == emptyRest {
					break bucketloop
				}
                
				continue
			}
            
            // 取出存储的 key。 
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			if t.indirectkey() {
                // 间接存储。
				k = *((*unsafe.Pointer)(k))
			}
            
            // 判断 key 是否相等。
			if t.key.equal(key, k) {
                
                // 读取 value 指针。
				e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
				if t.indirectelem() {
                    // 间接存储。
					e = *((*unsafe.Pointer)(e))
				}
                
				return e
			}
		}
	}
    
	return unsafe.Pointer(&zeroVal[0])
}

// ok-idiom: v, ok := map[key]
func mapaccess2(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, bool) {
    ...
}

分配

分插入和更新操作两种。

更新操作，遍历查找并返回 value 地址即可。
插入操作需要空位。

如桶有空位，存入 tophash 和 key，返回 value 地址。
如没有空位，新建溢出桶，使用该桶第一个空位。

func mapassign(t *maptype, h *hmap, key unsafe.Pointer) unsafe.Pointer {
    
	if h == nil {
		panic(plainError("assignment to entry in nil map"))
	}
    
	if h.flags&hashWriting != 0 {
		throw("concurrent map writes")
	}
    
    // 计算 key 哈希值。
	hash := t.hasher(key, uintptr(h.hash0))

	// Set hashWriting after calling t.hasher, since t.hasher may panic,
	// in which case we have not actually done a write.
	h.flags ^= hashWriting

    // 如果没有桶，新建。
	if h.buckets == nil {
		h.buckets = newobject(t.bucket) // newarray(t.bucket, 1)
	}

again:
    
    // 扩容，迁移。
	bucket := hash & bucketMask(h.B)
	if h.growing() {
		growWork(t, h, bucket)
	}
    
    // 找到对应桶。
	b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
    
    // 计算哈希高位值。
	top := tophash(hash)

	var inserti *uint8
	var insertk unsafe.Pointer
	var elem unsafe.Pointer
    
bucketloop:
	for {
        
        // 桶内匹配，查找存放位置。
		for i := uintptr(0); i < bucketCnt; i++ {
            
            // 如果和 tophash 不等。
			if b.tophash[i] != top {
                
                // 如该位置空，获取 key/value 存储地址。
                // 对 inserti 赋值，只找第一个插入位置。
				if isEmpty(b.tophash[i]) && inserti == nil {
					inserti = &b.tophash[i]
					insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
					elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
				}
                
                // 后面都是空的，没有继续循环。
				if b.tophash[i] == emptyRest {
					break bucketloop
				}
                
                // 记下可插入位置。
                // 单优先处理更新操作，需要继续循环。
				continue
			}
            
            // 如果相等，继续匹配 key，可能是更新操作。
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			if t.indirectkey() {
				k = *((*unsafe.Pointer)(k))
			}
			if !t.key.equal(key, k) {
				continue
			}
			if t.needkeyupdate() {
				typedmemmove(t.key, k, key)
			}

            // 返回 value 地址，以便写入新值。
			elem = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
            
            // 更新无需更多处理，直接返回 value 指针。
			goto done
		}
        
        // 如果当前桶没有匹配，则继续检查溢出桶。
		ovf := b.overflow(t)
		if ovf == nil {
			break
		}
		b = ovf
	}

	// Did not find mapping for key. Allocate new cell & add entry.

    // 如果负载过大，或者有太多溢出桶，则进行扩容。
	if !h.growing() && (overLoadFactor(h.count+1, h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
		hashGrow(t, h)
		goto again // Growing the table invalidates everything, so try again
	}

    // 如果没找到插入或更新位置，则意味着需要建立新的溢出桶。
    // 优先使用预分配的溢出桶。
	if inserti == nil {
		// The current bucket and all the overflow buckets connected to it are full, allocate a new one.
		newb := h.newoverflow(t, b)
		inserti = &newb.tophash[0]
		insertk = add(unsafe.Pointer(newb), dataOffset)
		elem = add(insertk, bucketCnt*uintptr(t.keysize))
	}

    // 存储 key/value。（间接需要额外分配内存，存储指针）
	if t.indirectkey() {
		kmem := newobject(t.key)
		*(*unsafe.Pointer)(insertk) = kmem
		insertk = kmem
	}    
	if t.indirectelem() {
		vmem := newobject(t.elem)
		*(*unsafe.Pointer)(elem) = vmem
	}
	typedmemmove(t.key, insertk, key)
    
    // 更新 tophash 和键值对数量。（包括插入操作）
	*inserti = top
	h.count++

done:
	if h.flags&hashWriting == 0 {
		throw("concurrent map writes")
	}
	h.flags &^= hashWriting
    
    // 返回 value 指针。
	if t.indirectelem() {
		elem = *((*unsafe.Pointer)(elem))
	}
    
	return elem
}

删除

删除操作不会收缩内存，只是清理内容。

func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {
    
	if h.flags&hashWriting != 0 {
		throw("concurrent map writes")
	}

    // 计算 key 哈希值。
	hash := t.hasher(key, uintptr(h.hash0))

	// Set hashWriting after calling t.hasher, since t.hasher may panic,
	// in which case we have not actually done a write (delete).
	h.flags ^= hashWriting

    // 扩容，迁移。
	bucket := hash & bucketMask(h.B)
	if h.growing() {
		growWork(t, h, bucket)
	}
    
    // 找到对应桶，计算哈希高位值。
	b := (*bmap)(add(h.buckets, bucket*uintptr(t.bucketsize)))
	bOrig := b
	top := tophash(hash)
    
search:
    
    // 遍历桶链表（含溢出桶）。
	for ; b != nil; b = b.overflow(t) {
        
        // 桶内遍历。
		for i := uintptr(0); i < bucketCnt; i++ {
            
            // 如果 tophash 不等，跳过。
			if b.tophash[i] != top {
                
                // 后面都是空，中断搜索。
				if b.tophash[i] == emptyRest {
					break search
				}
                
				continue
			}
            
            // 相等，继续匹配 key，不等就跳过。
			k := add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
			k2 := k
			if t.indirectkey() {
				k2 = *((*unsafe.Pointer)(k2))
			}
			if !t.key.equal(key, k2) {
				continue
			}
            
            // 清除 key 内容。
			if t.indirectkey() {
				*(*unsafe.Pointer)(k) = nil
			} else if t.key.ptrdata != 0 {
				memclrHasPointers(k, t.key.size)
			}
            
            // 清除 value 内容。
			e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
			if t.indirectelem() {
				*(*unsafe.Pointer)(e) = nil
			} else if t.elem.ptrdata != 0 {
				memclrHasPointers(e, t.elem.size)
			} else {
				memclrNoHeapPointers(e, t.elem.size)
			}
            
            // 清除 tophash 内容。
			b.tophash[i] = emptyOne
            
			// If the bucket now ends in a bunch of emptyOne states,
			// change those to emptyRest states.
			// It would be nice to make this a separate function, but
			// for loops are not currently inlineable.

            // 尝试标记停止位（emptyRest），减少检查遍历次数。
			if i == bucketCnt-1 {
                
                // 后面有溢出桶，无法设置。
				if b.overflow(t) != nil && b.overflow(t).tophash[0] != emptyRest {
					goto notLast
				}
                
			} else {
                
                // 后面不是 emptyRest，没法设置。
				if b.tophash[i+1] != emptyRest {
					goto notLast
				}
                
			}
            
            // 往前推进。
			for {
				b.tophash[i] = emptyRest
				if i == 0 {
					if b == bOrig {
						break // beginning of initial bucket, we're done.
					}
					// Find previous bucket, continue at its last entry.
					c := b
					for b = bOrig; b.overflow(t) != c; b = b.overflow(t) {
					}
					i = bucketCnt - 1
				} else {
					i--
				}
				if b.tophash[i] != emptyOne {
					break
				}
			}
            
		notLast:
			h.count--
			// Reset the hash seed to make it more difficult for attackers to
			// repeatedly trigger hash collisions. See issue 25237.
			if h.count == 0 {
				h.hash0 = fastrand()
			}
			break search
		}
	}

	if h.flags&hashWriting == 0 {
		throw("concurrent map writes")
	}
	h.flags &^= hashWriting
}

当 range delete 时，编译器会将其优化成清空操作。但清空（clear）也只重置内存，不会收缩。

// mapclear deletes all keys from a map.
func mapclear(t *maptype, h *hmap) {

	if h == nil || h.count == 0 {
		return
	}

	if h.flags&hashWriting != 0 {
		throw("concurrent map writes")
	}

	h.flags ^= hashWriting

    // 重置属性。
	h.flags &^= sameSizeGrow
	h.oldbuckets = nil
	h.nevacuate = 0
	h.noverflow = 0
	h.count = 0
	h.hash0 = fastrand()

	// Keep the mapextra allocation but clear any extra information.
	if h.extra != nil {
		*h.extra = mapextra{}
	}

	// makeBucketArray clears the memory pointed to by h.buckets
	// and recovers any overflow buckets by generating them
	// as if h.buckets was newly alloced.
    
    // 清理桶数组内存。(dirtyalloc = h.buckets)
	_, nextOverflow := makeBucketArray(t, h.B, h.buckets)
	if nextOverflow != nil {
		h.extra.nextOverflow = nextOverflow
	}

	if h.flags&hashWriting == 0 {
		throw("concurrent map writes")
	}
	h.flags &^= hashWriting
}

扩容

在写操作（mapassign）里会判断是否需要扩容。

// growing reports whether h is growing. The growth may be to the same size or bigger.
func (h *hmap) growing() bool {
	return h.oldbuckets != nil
}

func hashGrow(t *maptype, h *hmap) {
    
	// If we've hit the load factor, get bigger.
	// Otherwise, there are too many overflow buckets,
	// so keep the same number of buckets and "grow" laterally.
    
    // 检查是否达到负载系数。
    // 超过，则增加 B 指数，横向扩大桶数组。
	bigger := uint8(1)
	if !overLoadFactor(h.count+1, h.B) {
		bigger = 0
		h.flags |= sameSizeGrow
	}
    
    // 将现有桶数组转移到 oldbuckets，创建新数组。
	oldbuckets := h.buckets
	newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger, nil)

	flags := h.flags &^ (iterator | oldIterator)
	if h.flags&iterator != 0 {
		flags |= oldIterator
	}
    
	h.B += bigger
	h.flags = flags
	h.oldbuckets = oldbuckets
	h.buckets = newbuckets
	h.nevacuate = 0
	h.noverflow = 0

	if h.extra != nil && h.extra.overflow != nil {
		// Promote current overflow buckets to the old generation.
		if h.extra.oldoverflow != nil {
			throw("oldoverflow is not nil")
		}
		h.extra.oldoverflow = h.extra.overflow
		h.extra.overflow = nil
	}
	if nextOverflow != nil {
		if h.extra == nil {
			h.extra = new(mapextra)
		}
		h.extra.nextOverflow = nextOverflow
	}

	// the actual copying of the hash table data is done incrementally
	// by growWork() and evacuate().
}

接下来，在写和删除操作时，都会对目标桶执行迁移操作。

迁移是按桶分次完成的，避免一次性操作耗时过长。
该函数执行两次迁移操作：当前桶（随机）和顺序迁移。
顺序迁移每次也只处理一个桶，但保证了所有桶都可被迁移。

func growWork(t *maptype, h *hmap, bucket uintptr) {
    
    // 迁移桶（由写和删除操作引发）。
	evacuate(t, h, bucket&h.oldbucketmask())

    // 顺序迁移（指定序号）。
	if h.growing() {
		evacuate(t, h, h.nevacuate)
	}
}

func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
    
    // 旧桶，旧桶数组大小。
	b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)))
	newbit := h.noldbuckets()
    
    // 如果该桶还没有迁移。
	if !evacuated(b) {
        // 迁移到新桶 ...
	}

    // 顺序迁移需要累加序号，并检查是否迁移完毕。
    // 因为 gcWork(.., .., h.nevacuate）。
	if oldbucket == h.nevacuate {
		advanceEvacuationMark(h, t, newbit)
	}
}

如全部迁移完毕，释放旧桶。

func advanceEvacuationMark(h *hmap, t *maptype, newbit uintptr) {
    
    // 累加序号。
	h.nevacuate++
    
    // 分批最大边界。
	stop := h.nevacuate + 1024
	if stop > newbit {
		stop = newbit
	}
    
    // 顺序检查下个桶是否已被迁移。
    // 确定实际顺序号（如果超出，则表示全部完成）。
	for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) {
		h.nevacuate++
	}
    
    // 如果所有旧桶迁移完毕，释放旧桶。
    // 不会再引发迁移操作。
	if h.nevacuate == newbit { // newbit == # of oldbuckets
        
		// Growing is all done. Free old main bucket array.
		h.oldbuckets = nil
        
		if h.extra != nil {
			h.extra.oldoverflow = nil
		}
		h.flags &^= sameSizeGrow
	}
}

分享到QQ

分享到微博