返回介绍

Sampling with and without replacement

发布于 2025-02-25 23:43:56 字数 8020 浏览 0 评论 0 收藏 0

# Sampling is done with replacement by default
np.random.choice(4, 12)
array([2, 1, 2, 2, 0, 2, 2, 1, 3, 2, 3, 1])
# Probability weights can be given
np.random.choice(4, 12, p=[.4, .1, .1, .4])
array([3, 3, 1, 0, 0, 3, 1, 0, 0, 3, 0, 0])
x = np.random.randint(0, 10, (8, 12))
x
array([[7, 2, 4, 8, 0, 7, 9, 3, 4, 6, 1, 5],
       [6, 2, 1, 8, 3, 5, 0, 2, 6, 2, 4, 4],
       [6, 3, 0, 6, 4, 7, 6, 7, 1, 5, 7, 9],
       [2, 4, 8, 1, 2, 1, 1, 3, 5, 9, 0, 8],
       [1, 6, 3, 3, 5, 9, 7, 9, 2, 3, 3, 3],
       [8, 6, 9, 7, 6, 3, 9, 6, 6, 6, 1, 3],
       [4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3],
       [1, 3, 4, 7, 6, 1, 4, 3, 3, 7, 6, 8]])
# sampling individual elements
np.random.choice(x.ravel(), 12)
array([1, 2, 4, 7, 1, 2, 2, 6, 7, 3, 8, 4])
# sampling rows
idx = np.random.choice(x.shape[0], 4)
x[idx, :]
array([[4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3],
       [4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3],
       [6, 2, 1, 8, 3, 5, 0, 2, 6, 2, 4, 4],
       [4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3]])
# sampling columns
idx = np.random.choice(x.shape[1], 4)
x[:, idx]
array([[9, 4, 3, 1],
       [0, 6, 2, 4],
       [6, 1, 7, 7],
       [1, 5, 3, 0],
       [7, 2, 9, 3],
       [9, 6, 6, 1],
       [6, 9, 8, 0],
       [4, 3, 3, 6]])
# Give the argument replace=False
try:
    np.random.choice(4, 12, replace=False)
except ValueError, e:
    print e
Cannot take a larger sample than population when 'replace=False'

You will likely have used this for the stochastic gradient descent homework.

x
array([[7, 2, 4, 8, 0, 7, 9, 3, 4, 6, 1, 5],
       [6, 2, 1, 8, 3, 5, 0, 2, 6, 2, 4, 4],
       [6, 3, 0, 6, 4, 7, 6, 7, 1, 5, 7, 9],
       [2, 4, 8, 1, 2, 1, 1, 3, 5, 9, 0, 8],
       [1, 6, 3, 3, 5, 9, 7, 9, 2, 3, 3, 3],
       [8, 6, 9, 7, 6, 3, 9, 6, 6, 6, 1, 3],
       [4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3],
       [1, 3, 4, 7, 6, 1, 4, 3, 3, 7, 6, 8]])
# Shuffling occurs "in place" for efficiency
np.random.shuffle(x)
x
array([[7, 2, 4, 8, 0, 7, 9, 3, 4, 6, 1, 5],
       [4, 3, 1, 0, 5, 8, 6, 8, 9, 1, 0, 3],
       [8, 6, 9, 7, 6, 3, 9, 6, 6, 6, 1, 3],
       [2, 4, 8, 1, 2, 1, 1, 3, 5, 9, 0, 8],
       [6, 3, 0, 6, 4, 7, 6, 7, 1, 5, 7, 9],
       [6, 2, 1, 8, 3, 5, 0, 2, 6, 2, 4, 4],
       [1, 3, 4, 7, 6, 1, 4, 3, 3, 7, 6, 8],
       [1, 6, 3, 3, 5, 9, 7, 9, 2, 3, 3, 3]])
# To shuffle columns instead, transpose before shuffling
np.random.shuffle(x.T)
x
array([[7, 0, 4, 7, 9, 8, 1, 6, 4, 3, 2, 5],
       [8, 5, 1, 4, 6, 0, 0, 1, 9, 8, 3, 3],
       [3, 6, 9, 8, 9, 7, 1, 6, 6, 6, 6, 3],
       [1, 2, 8, 2, 1, 1, 0, 9, 5, 3, 4, 8],
       [7, 4, 0, 6, 6, 6, 7, 5, 1, 7, 3, 9],
       [5, 3, 1, 6, 0, 8, 4, 2, 6, 2, 2, 4],
       [1, 6, 4, 1, 4, 7, 6, 7, 3, 3, 3, 8],
       [9, 5, 3, 1, 7, 3, 3, 3, 2, 9, 6, 3]])
# numpy.random.permutation does the same thing but returns a copy
np.random.permutation(x)
array([[7, 0, 4, 7, 9, 8, 1, 6, 4, 3, 2, 5],
       [1, 6, 4, 1, 4, 7, 6, 7, 3, 3, 3, 8],
       [1, 2, 8, 2, 1, 1, 0, 9, 5, 3, 4, 8],
       [7, 4, 0, 6, 6, 6, 7, 5, 1, 7, 3, 9],
       [9, 5, 3, 1, 7, 3, 3, 3, 2, 9, 6, 3],
       [3, 6, 9, 8, 9, 7, 1, 6, 6, 6, 6, 3],
       [8, 5, 1, 4, 6, 0, 0, 1, 9, 8, 3, 3],
       [5, 3, 1, 6, 0, 8, 4, 2, 6, 2, 2, 4]])
# When given an integre n, permutation treats is as the array arange(n)
np.random.permutation(10)
array([4, 0, 6, 7, 5, 1, 8, 2, 3, 9])
# Use indices if you needed to shuffle collections of arrays in synchrony
x = np.arange(12).reshape(4,3)
y = x + 10
idx = np.random.permutation(x.shape[0])
print x[idx, :], '\n'
print y[idx, :]
[[ 9 10 11]
 [ 3  4  5]
 [ 6  7  8]
 [ 0  1  2]]

[[19 20 21]
 [13 14 15]
 [16 17 18]
 [10 11 12]]

Bootstrap

The bootstrap is commonly used to estimate statistics when theory fails. We have already seen the bootstrap for estiamting confidence bounds for convergence in the Monte Carlo integration.

# For example, what is the 95% confidence interval for
# the mean of this data set if you didn't know how it was generated?

x = np.concatenate([np.random.exponential(size=200), np.random.normal(size=100)])
plt.hist(x, 25, histtype='step');

n = len(x)
reps = 10000
xb = np.random.choice(x, (n, reps))
mb = xb.mean(axis=0)
mb.sort()

np.percentile(mb, [2.5, 97.5])
array([0.483, 0.740])

Reprise of bootstrap example for Monte Carlo integration

def f(x):
    return x * np.cos(71*x) + np.sin(13*x)
# data sample for integration
n = 100
x = f(np.random.random(n))
# bootstrap MC integration
reps = 1000
xb = np.random.choice(x, (n, reps), replace=True)
yb = 1/np.arange(1, n+1)[:, None] * np.cumsum(xb, axis=0)
upper, lower = np.percentile(yb, [2.5, 97.5], axis=1)
plt.plot(np.arange(1, n+1)[:, None], yb, c='grey', alpha=0.02)
plt.plot(np.arange(1, n+1), yb[:, 0], c='red', linewidth=1)
plt.plot(np.arange(1, n+1), upper, 'b', np.arange(1, n+1), lower, 'b');

Leave some-out resampling

Jackknife estimate of parameters

This shows the leave-one-out calculation idiom for Python. Unlike R, a -k index to an array does not delete the kth entry, but returns the kth entry from the end, so we need another way to efficiently drop one scalar or vector. This can be done using Boolean indexing as shown in the examples below, and is efficient since the operations are on views of the origianl array rather thna copies.

def jackknife(x, func):
    """Jackknife estimate of the estimator func"""
    n = len(x)
    idx = np.arange(n)
    return np.sum(func(x[idx!=i]) for i in range(n))/float(n)
# Jackknife estimate of standard deviation
x = np.random.normal(0, 2, 100)
jackknife(x, np.std)
1.9223
def jackknife_var(x, func):
    """Jackknife estiamte of the variance of the estimator func."""
    n = len(x)
    idx = np.arange(n)
    j_est = jackknife(x, func)
    return (n-1)/(n + 0.0) * np.sum((func(x[idx!=i]) - j_est)**2.0
                                    for i in range(n))
# estimate of the variance of an estimator
jackknife_var(x, np.std)
0.0254

Leave one out cross validation (LOOCV)

LOOCV also uses the same idiom, and a simple example of LOOCV for model selection is illustrated.

a, b, c = 1, 2, 3
x = np.linspace(0, 5, 10)
y = a*x**2 + b*x + c + np.random.normal(0, 1, len(x))
plt.figure(figsize=(12,4))
for deg in range(1, 5):
    plt.subplot(1, 4, deg)
    beta = np.polyfit(x, y, deg)
    plt.plot(x, y, 'r:o')
    plt.plot(x, np.polyval(beta, x), 'b-')
    plt.title('Degree = %d' % deg)

def loocv(x, y, fit, pred, deg):
    """LOOCV RSS for fitting a polynomial model."""
    n = len(x)
    idx = np.arange(n)
    rss = np.sum([(y - pred(fit(x[idx!=i], y[idx!=i], deg), x))**2.0 for i in range(n)])
    return rss
# RSS does not detect overfitting and selects the most complex model
for deg in range(1, 5):
    print 'Degree = %d, RSS=%.2f' % (deg, np.sum((y - np.polyval(np.polyfit(x, y, deg), x))**2.0))
Degree = 1, RSS=59.90
Degree = 2, RSS=6.20
Degree = 3, RSS=6.20
Degree = 4, RSS=6.20
# LOOCV selects the correct model
for deg in range(1, 5):
    print 'Degree = %d, RSS=%.2f' % (deg, loocv(x, y, np.polyfit, np.polyval, deg))
Degree = 1, RSS=628.41
Degree = 2, RSS=64.35
Degree = 3, RSS=67.81
Degree = 4, RSS=85.39

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
    我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
    原文