文章来源于网络收集而来,版权归原创者所有,如有侵权请及时联系!
QR 分解
def ls_qr(A,b):
Q, R = scipy.linalg.qr(A, mode='economic')
return scipy.linalg.solve_triangular(R, Q.T @ b)
%timeit coeffs_qr = ls_qr(trn_int, y_trn)
# 205 µs ± 264 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
coeffs_qr = ls_qr(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_qr)
# (57.94102134545711, 48.053565198516452)
SVD
SVD 给出伪逆。
def ls_svd(A,b):
m, n = A.shape
U, sigma, Vh = scipy.linalg.svd(A, full_matrices=False)
w = (U.T @ b)/ sigma
return Vh.T @ w
%timeit coeffs_svd = ls_svd(trn_int, y_trn)
# 1.11 ms ± 320 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit coeffs_svd = ls_svd(trn_int, y_trn)
# 266 µs ± 8.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
coeffs_svd = ls_svd(trn_int, y_trn)
regr_metrics(y_test, test_int @ coeffs_svd)
# (57.941021345457244, 48.053565198516687)
最小二乘回归的随机 Sketching 技巧
线性 Sketching (Woodruff)
- 抽取
r×n
随机矩阵S
,r << n
- 计算
S A
和S b
- 找到回归
SA x = Sb
的精确解x
时间比较
import timeit
import pandas as pd
def scipylstq(A, b):
return scipy.linalg.lstsq(A,b)[0]
row_names = ['Normal Eqns- Naive',
'Normal Eqns- Cholesky',
'QR Factorization',
'SVD',
'Scipy lstsq']
name2func = {'Normal Eqns- Naive': 'ls_naive',
'Normal Eqns- Cholesky': 'ls_chol',
'QR Factorization': 'ls_qr',
'SVD': 'ls_svd',
'Scipy lstsq': 'scipylstq'}
m_array = np.array([100, 1000, 10000])
n_array = np.array([20, 100, 1000])
index = pd.MultiIndex.from_product([m_array, n_array], names=['# rows', '# cols'])
pd.options.display.float_format = '{:,.6f}'.format
df = pd.DataFrame(index=row_names, columns=index)
df_error = pd.DataFrame(index=row_names, columns=index)
# %%prun
for m in m_array:
for n in n_array:
if m >= n:
x = np.random.uniform(-10,10,n)
A = np.random.uniform(-40,40,[m,n]) # removed np.asfortranarray
b = np.matmul(A, x) + np.random.normal(0,2,m)
for name in row_names:
fcn = name2func[name]
t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
df.set_value(name, (m,n), t)
coeffs = locals()[fcn](A, b)
reg_met = regr_metrics(b, A @ coeffs)
df_error.set_value(name, (m,n), reg_met[0])
df
# rows | 100 | 1000 | 10000 | ||||||
---|---|---|---|---|---|---|---|---|---|
# cols | 20 | 100 | 1000 | 20 | 100 | 1000 | 20 | 100 | 1000 |
Normal Eqns- Naive | 0.001276 | 0.003634 | NaN | 0.000960 | 0.005172 | 0.293126 | 0.002226 | 0.021248 | 1.164655 |
Normal Eqns- Cholesky | 0.001660 | 0.003958 | NaN | 0.001665 | 0.004007 | 0.093696 | 0.001928 | 0.010456 | 0.399464 |
QR Factorization | 0.002174 | 0.006486 | NaN | 0.004235 | 0.017773 | 0.213232 | 0.019229 | 0.116122 | 2.208129 |
SVD | 0.003880 | 0.021737 | NaN | 0.004672 | 0.026950 | 1.280490 | 0.018138 | 0.130652 | 3.433003 |
Scipy lstsq | 0.004338 | 0.020198 | NaN | 0.004320 | 0.021199 | 1.083804 | 0.012200 | 0.088467 | 2.134780 |
df_error
# rows | 100 | 1000 | 10000 | ||||||
---|---|---|---|---|---|---|---|---|---|
# cols | 20 | 100 | 1000 | 20 | 100 | 1000 | 20 | 100 | 1000 |
Normal Eqns- Naive | 1.702742 | 0.000000 | NaN | 1.970767 | 1.904873 | 0.000000 | 1.978383 | 1.980449 | 1.884440 |
Normal Eqns- Cholesky | 1.702742 | 0.000000 | NaN | 1.970767 | 1.904873 | 0.000000 | 1.978383 | 1.980449 | 1.884440 |
QR Factorization | 1.702742 | 0.000000 | NaN | 1.970767 | 1.904873 | 0.000000 | 1.978383 | 1.980449 | 1.884440 |
SVD | 1.702742 | 0.000000 | NaN | 1.970767 | 1.904873 | 0.000000 | 1.978383 | 1.980449 | 1.884440 |
Scipy lstsq | 1.702742 | 0.000000 | NaN | 1.970767 | 1.904873 | 0.000000 | 1.978383 | 1.980449 | 1.884440 |
store = pd.HDFStore('least_squares_results.h5')
store['df'] = df
'''
C:\Users\rache\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2881: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->floating,key->block0_values] [items->[(100, 20), (100, 100), (100, 1000), (1000, 20), (1000, 100), (1000, 1000), (5000, 20), (5000, 100), (5000, 1000)]]
exec(code_obj, self.user_global_ns, self.user_ns)
'''
注解
我用魔术指令 %prun
来测量我的代码。
替代方案:最小绝对偏差(L1 回归)
- 异常值的敏感度低于最小二乘法。
- 没有闭式解,但可以通过线性规划解决。
条件作用和稳定性
条件数
条件数是一个指标,衡量输入的小变化导致输出变化的程度。
问题:为什么我们在数值线性代数中,关心输入的小变化的相关行为?
相对条件数由下式定义:
其中 是无穷小。
根据 Trefethen(第 91 页),如果 κ
很小(例如 1, 10, 10^2
),问题是良态的,如果 κ
很大(例如 10^6, 10^16
),那么问题是病态的。
条件作用:数学问题的扰动行为(例如最小二乘)
稳定性:用于在计算机上解决该问题的算法的扰动行为(例如,最小二乘算法,householder,回代,高斯消除)
条件作用的例子
计算非对称矩阵的特征值的问题通常是病态的。
A = [[1, 1000], [0, 1]]
B = [[1, 1000], [0.001, 1]]
wA, vrA = scipy.linalg.eig(A)
wB, vrB = scipy.linalg.eig(B)
wA, wB
'''
(array([ 1.+0.j, 1.+0.j]),
array([ 2.00000000e+00+0.j, -2.22044605e-16+0.j]))
'''
矩阵的条件数
乘积 经常出现,它有自己的名字: A
的条件数。注意,通常我们谈论问题的条件作用,而不是矩阵。
A
的条件数涉及:
- 给定
Ax = b
中的A
和x
,计算b
- 给定
Ax = b
中的A
和b
,计算x
未交待清楚的事情
完整和简化分解
SVD
来自 Trefethen 的图:
对于所有矩阵,QR 分解都存在
与 SVD 一样,有 QR 分解的完整版和简化版。
矩阵的逆是不稳定的
from scipy.linalg import hilbert
n = 5
hilbert(n)
'''
array([[ 1. , 0.5 , 0.3333, 0.25 , 0.2 ],
[ 0.5 , 0.3333, 0.25 , 0.2 , 0.1667],
[ 0.3333, 0.25 , 0.2 , 0.1667, 0.1429],
[ 0.25 , 0.2 , 0.1667, 0.1429, 0.125 ],
[ 0.2 , 0.1667, 0.1429, 0.125 , 0.1111]])
'''
n = 14
A = hilbert(n)
x = np.random.uniform(-10,10,n)
b = A @ x
A_inv = np.linalg.inv(A)
np.linalg.norm(np.eye(n) - A @ A_inv)
# 5.0516495470543212
np.linalg.cond(A)
# 2.2271635826494112e+17
A @ A_inv
'''
array([[ 1. , 0. , -0.0001, 0.0005, -0.0006, 0.0105, -0.0243,
0.1862, -0.6351, 2.2005, -0.8729, 0.8925, -0.0032, -0.0106],
[ 0. , 1. , -0. , 0. , 0.0035, 0.0097, -0.0408,
0.0773, -0.0524, 1.6926, -0.7776, -0.111 , -0.0403, -0.0184],
[ 0. , 0. , 1. , 0.0002, 0.0017, 0.0127, -0.0273,
0. , 0. , 1.4688, -0.5312, 0.2812, 0.0117, 0.0264],
[ 0. , 0. , -0. , 1.0005, 0.0013, 0.0098, -0.0225,
0.1555, -0.0168, 1.1571, -0.9656, -0.0391, 0.018 , -0.0259],
[-0. , 0. , -0. , 0.0007, 1.0001, 0.0154, 0.011 ,
-0.2319, 0.5651, -0.2017, 0.2933, -0.6565, 0.2835, -0.0482],
[ 0. , -0. , 0. , -0.0004, 0.0059, 0.9945, -0.0078,
-0.0018, -0.0066, 1.1839, -0.9919, 0.2144, -0.1866, 0.0187],
[-0. , 0. , -0. , 0.0009, -0.002 , 0.0266, 0.974 ,
-0.146 , 0.1883, -0.2966, 0.4267, -0.8857, 0.2265, -0.0453],
[ 0. , 0. , -0. , 0.0002, 0.0009, 0.0197, -0.0435,
1.1372, -0.0692, 0.7691, -1.233 , 0.1159, -0.1766, -0.0033],
[ 0. , 0. , -0. , 0.0002, 0. , -0.0018, -0.0136,
0.1332, 0.945 , 0.3652, -0.2478, -0.1682, 0.0756, -0.0212],
[ 0. , -0. , -0. , 0.0003, 0.0038, -0.0007, 0.0318,
-0.0738, 0.2245, 1.2023, -0.2623, -0.2783, 0.0486, -0.0093],
[-0. , 0. , -0. , 0.0004, -0.0006, 0.013 , -0.0415,
0.0292, -0.0371, 0.169 , 1.0715, -0.09 , 0.1668, -0.0197],
[ 0. , -0. , 0. , 0. , 0.0016, 0.0062, -0.0504,
0.1476, -0.2341, 0.8454, -0.7907, 1.4812, -0.15 , 0.0186],
[ 0. , -0. , 0. , -0.0001, 0.0022, 0.0034, -0.0296,
0.0944, -0.1833, 0.6901, -0.6526, 0.2556, 0.8563, 0.0128],
[ 0. , 0. , 0. , -0.0001, 0.0018, -0.0041, -0.0057,
-0.0374, -0.165 , 0.3968, -0.2264, -0.1538, -0.0076, 1.005 ]])
'''
row_names = ['Normal Eqns- Naive',
'QR Factorization',
'SVD',
'Scipy lstsq']
name2func = {'Normal Eqns- Naive': 'ls_naive',
'QR Factorization': 'ls_qr',
'SVD': 'ls_svd',
'Scipy lstsq': 'scipylstq'}
pd.options.display.float_format = '{:,.9f}'.format
df = pd.DataFrame(index=row_names, columns=['Time', 'Error'])
for name in row_names:
fcn = name2func[name]
t = timeit.timeit(fcn + '(A,b)', number=5, globals=globals())
coeffs = locals()[fcn](A, b)
df.set_value(name, 'Time', t)
df.set_value(name, 'Error', regr_metrics(b, A @ coeffs)[0])
SVD 在这里最好
不要重新运行。
df
Time | Error | |
---|---|---|
Normal Eqns- Naive | 0.001334339 | 3.598901966 |
QR Factorization | 0.002166139 | 0.000000000 |
SVD | 0.001556937 | 0.000000000 |
Scipy lstsq | 0.001871590 | 0.000000000 |
即使 A
是稀疏的, 通常是密集的。对于大型矩阵, 放不进内存。
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论