返回介绍

Embarassingly parallel programs

发布于 2025-02-25 23:44:03 字数 14623 浏览 0 评论 0 收藏 0

Many statistical problems are embarassingly parallel and cna be easily decomposed into independent tasks or data sets. Here are several examples:

  • Monte Carlo integration
  • Mulitiple chains of MCMC
  • Boostrap for condence intervals
  • Power calculations by simulation
  • Permuatation-resampling tests
  • Fitting same model on multiple data sets

Other problems are serial at small scale, but can be parallelized at large scales. For example, EM and MCMC iterations are inherently serial since there is a dependence on the previous state, but within a single iteration, there can be many thousands of density calculations (one for each data poinnt to calculate the likelihood), and this is an embarassingly parallel problem within a single itneration.

These “low hanging fruits” are great because they offer a path to easy parallelism with minimal complexity.

Estimating \(\pi\) using Monte Carlo integration

This is clearly a toy example, but the template cna be used for most embarassingly parallel problems. First we see how much we can speed-up the serial code by the use of compilation, then we apply parallel processing for a furhter linear speed-up in the number of processors.

def pi_python(n):
    s = 0
    for i in range(n):
        x = random.uniform(-1, 1)
        y = random.uniform(-1, 1)
        if (x**2 + y**2) < 1:
            s += 1
    return s/n
stats = %prun -r -q pi_python(1000000)
stats.sort_stats('time').print_stats(5);
       4000004 function calls in 2.329 seconds

 Ordered by: internal time
 List reduced from 6 to 5 due to restriction <5>

 ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      1    1.132    1.132    2.329    2.329 <ipython-input-5-fe39fab6b921>:1(pi_python)
2000000    0.956    0.000    1.141    0.000 random.py:358(uniform)
2000000    0.185    0.000    0.185    0.000 {method 'random' of '_random.Random' objects}
      1    0.056    0.056    0.056    0.056 {range}
      1    0.000    0.000    2.329    2.329 <string>:1(<module>)
def pi_numpy(n):
    xs = np.random.uniform(-1, 1, (n,2))
    return 4.0*((xs**2).sum(axis=1).sum() < 1)/n
@jit
def pi_numba(n):
    s = 0
    for i in range(n):
        x = random.uniform(-1, 1)
        y = random.uniform(-1, 1)
        if x**2 + y**2 < 1:
            s += 1
    return s/n

This usse the GNU Scientific lirbary. You may need to instal it

wget ftp://ftp.gnu.org/gnu/gsl/gsl-latest.tar.gz
tar -xzf gsl-latest.tar.gz
cd gsl-1.16
./configure --prefilx=/usr/local
make
make install

and then

pip install cythongsl
%%cython -a -lgsl
import cython
import numpy as np
cimport numpy as np
from cython_gsl cimport gsl_rng_mt19937, gsl_rng, gsl_rng_alloc, gsl_rng_uniform

cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)

@cython.cdivision
@cython.boundscheck(False)
@cython.wraparound(False)
def pi_cython(int n):
    cdef int[:] s = np.zeros(n, dtype=np.int32)
    cdef int i = 0
    cdef double x, y
    for i in range(n):
        x = gsl_rng_uniform(r)*2 - 1
        y = gsl_rng_uniform(r)*2 - 1
        s[i] = x**2 + y**2 < 1
    cdef int hits = 0
    for i in range(n):
        hits += s[i]
    return 4.0*hits/n

Generated by Cython 0.22

+01: import cython
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+02: import numpy as np
  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
 03: cimport numpy as np
 04: from cython_gsl cimport gsl_rng_mt19937, gsl_rng, gsl_rng_alloc, gsl_rng_uniform
 05: 
+06: cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)
  __pyx_v_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_r = gsl_rng_alloc(gsl_rng_mt19937);
 07: 
 08: @cython.cdivision
 09: @cython.boundscheck(False)
 10: @cython.wraparound(False)
+11: def pi_cython(int n):
/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_1pi_cython(PyObject *__pyx_self, PyObject *__pyx_arg_n); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_1pi_cython = {"pi_cython", (PyCFunction)__pyx_pw_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_1pi_cython, METH_O, 0};
static PyObject *__pyx_pw_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_1pi_cython(PyObject *__pyx_self, PyObject *__pyx_arg_n) {
  int __pyx_v_n;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pi_cython (wrapper)", 0);
  assert(__pyx_arg_n); {
    __pyx_v_n = __Pyx_PyInt_As_int(__pyx_arg_n); if (unlikely((__pyx_v_n == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_f0d9ca082c7b25a08598049bfef2323c.pi_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  __pyx_r = __pyx_pf_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_pi_cython(__pyx_self, ((int)__pyx_v_n));
  int __pyx_lineno = 0;
  const char *__pyx_filename = NULL;
  int __pyx_clineno = 0;

  /* function exit code */
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_pi_cython(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_v_n) {
  __Pyx_memviewslice __pyx_v_s = { 0, 0, { 0 }, { 0 }, { 0 } };
  int __pyx_v_i;
  double __pyx_v_x;
  double __pyx_v_y;
  int __pyx_v_hits;
  PyObject *__pyx_r = NULL;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("pi_cython", 0);
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_2);
  __Pyx_XDECREF(__pyx_t_3);
  __Pyx_XDECREF(__pyx_t_4);
  __Pyx_XDECREF(__pyx_t_5);
  __PYX_XDEC_MEMVIEW(&__pyx_t_6, 1);
  __Pyx_AddTraceback("_cython_magic_f0d9ca082c7b25a08598049bfef2323c.pi_cython", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __PYX_XDEC_MEMVIEW(&__pyx_v_s, 1);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__19 = PyTuple_Pack(7, __pyx_n_s_n, __pyx_n_s_n, __pyx_n_s_s, __pyx_n_s_i, __pyx_n_s_x, __pyx_n_s_y, __pyx_n_s_hits); if (unlikely(!__pyx_tuple__19)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_tuple__19);
  __Pyx_GIVEREF(__pyx_tuple__19);
/* … */
  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_1pi_cython, NULL, __pyx_n_s_cython_magic_f0d9ca082c7b25a085); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_pi_cython, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_codeobj__20 = (PyObject*)__Pyx_PyCode_New(1, 0, 7, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__19, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_Users_cliburn_ipython_cython__c, __pyx_n_s_pi_cython, 11, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__20)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+12:     cdef int[:] s = np.zeros(n, dtype=np.int32)
  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_2);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_n); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_3);
  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1);
  __Pyx_GIVEREF(__pyx_t_1);
  __pyx_t_1 = 0;
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_int32); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_5);
  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
  if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_5) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
  __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_5);
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_t_6 = __Pyx_PyObject_to_MemoryviewSlice_ds_int(__pyx_t_5);
  if (unlikely(!__pyx_t_6.memview)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
  __pyx_v_s = __pyx_t_6;
  __pyx_t_6.memview = NULL;
  __pyx_t_6.data = NULL;
+13:     cdef int i = 0
  __pyx_v_i = 0;
 14:     cdef double x, y
+15:     for i in range(n):
  __pyx_t_7 = __pyx_v_n;
  for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) {
    __pyx_v_i = __pyx_t_8;
+16:         x = gsl_rng_uniform(r)*2 - 1
    __pyx_v_x = ((gsl_rng_uniform(__pyx_v_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_r) * 2.0) - 1.0);
+17:         y = gsl_rng_uniform(r)*2 - 1
    __pyx_v_y = ((gsl_rng_uniform(__pyx_v_46_cython_magic_f0d9ca082c7b25a08598049bfef2323c_r) * 2.0) - 1.0);
+18:         s[i] = x**2 + y**2 < 1
    __pyx_t_9 = __pyx_v_i;
    *((int *) ( /* dim=0 */ (__pyx_v_s.data + __pyx_t_9 * __pyx_v_s.strides[0]) )) = ((pow(__pyx_v_x, 2.0) + pow(__pyx_v_y, 2.0)) < 1.0);
  }
+19:     cdef int hits = 0
  __pyx_v_hits = 0;
+20:     for i in range(n):
  __pyx_t_7 = __pyx_v_n;
  for (__pyx_t_8 = 0; __pyx_t_8 < __pyx_t_7; __pyx_t_8+=1) {
    __pyx_v_i = __pyx_t_8;
+21:         hits += s[i]
    __pyx_t_10 = __pyx_v_i;
    __pyx_v_hits = (__pyx_v_hits + (*((int *) ( /* dim=0 */ (__pyx_v_s.data + __pyx_t_10 * __pyx_v_s.strides[0]) ))));
  }
+22:     return 4.0*hits/n
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_5 = PyFloat_FromDouble(((4.0 * __pyx_v_hits) / __pyx_v_n)); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_5);
  __pyx_r = __pyx_t_5;
  __pyx_t_5 = 0;
  goto __pyx_L0;
n = int(1e5)
%timeit pi_python(n)
%timeit pi_numba(n)
%timeit pi_numpy(n)
%timeit pi_cython(n)
10 loops, best of 3: 127 ms per loop
1 loops, best of 3: 146 ms per loop
100 loops, best of 3: 5.18 ms per loop
100 loops, best of 3: 1.95 ms per loop

The bigger the problem, the more scope there is for parallelism

Amhdahls’ law says that the speedup from parallelization is bounded by the ratio of parallelizable to irreducibly serial code in the aloorithm. However, for big data analysis, Gustafson’s Law is more relevant. This says that we are nearly always interested in increasing the size of the parallelizable bits, and the ratio of parallelizable to irreducibly serial code is not a static quantity but depends on data size. For example, Gibbs sampling has an irreducibly serial nature, but for large samples, each iteration may be able perform PDF evaluations in parallel for zillions of data points.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
    我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
    原文