当前位置：文江博客话题详情

如何在Python中高效处理不断追加新项目的列表

发布于 2025-01-12 12:34:30 字数 4167 浏览 0 评论 0原文

目标：

可视化特定生物体在有限时间内的种群规模。

假设：

生物体的寿命为age_limit天，
只有年龄达到 day_lay_egg 天的雌性才能产卵，且每只雌性最多可产卵 max_lay_egg 次。每次繁殖最多产 egg_no 个卵，每个后代为雄性的概率为 50%。
3 个生物体的初始种群由 2 个雌性和 1 个雄性组成

代码片段：

目前，下面的代码应产生预期的输出

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def get_breeding(d,**kwargs):
    """Let one organism breed if it qualifies and return it with its offspring.

    Args:
        d: Organism record. The keys read here are 's' (sex), 'd' (day
            counter), 'lay_egg' (clutches laid so far) and 'dborn' (age).
        **kwargs: Simulation settings; 'max_lay_egg', 'day_lay_egg' and
            'egg_no' are read.

    Returns:
        ``(d, offspring)``. ``offspring`` is a list of new organism records
        when ``d`` bred on this call, otherwise ``None``. ``d`` is the same
        dict that was passed in; its 'lay_egg' counter is incremented in
        place when it breeds.
    """

    # Breeds only when the clutch counter has not passed the limit, the age is
    # strictly greater than the maturity age, and the sex code is 1.
    # NOTE(review): the counter starts at 0 and is compared with `<=`, which
    # permits max_lay_egg + 1 clutches -- confirm this is intended.
    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            # One random sex code (0 or 1, equal probability) per egg.
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            # Offspring start at age 0 with no clutches and copy the parent's day counter.
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None



def to_loop_initial_population(**kwargs):
    """Run the day-by-day simulation and record the population size per day.

    Args:
        **kwargs: Simulation settings. 'ipol' (initial list of organism
            records), 'nday_limit' and 'age_limit' are read here; the whole
            mapping is forwarded to ``get_breeding``.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.
    """

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:
        # print(f'Executing day {nday}')

        # Population carried into the next day; rebuilt from scratch every day.
        k = []
        for dpol in npol:
            # Records are aged in place, so the dicts passed in 'ipol' are mutated too.
            dpol['d'] += 1
            dpol['dborn'] += 1
            dpol,h = get_breeding(dpol,**kwargs)

            if h is None and dpol['dborn'] <= kwargs['age_limit']:
                # No offspring today and still within the age limit: keep the organism.
                k.append(dpol)
            elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
                # Bred today and still within the age limit: keep offspring and parent.
                h.extend([dpol])
                k.extend(h)
            # Otherwise the organism is past the age limit and is dropped,
            # together with any offspring it produced on this day.

        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day


## Simulation settings; all of them are gathered into one dict below
numsex=[1,1,0] # 0: Male, 1: Female

# s: sex, d: day, lay_egg: number of times the female has laid eggs, dborn: the organism's age
ipol=[dict(s=x,d=0, lay_egg=0, dborn=0) for x in numsex] # The initial population
age_limit = 45 # Age limit for the species
egg_no=3 # Number of eggs per clutch
day_lay_egg = 30  # Age a female must exceed before she can lay eggs
nday_limit=360 # Number of days to simulate
max_lay_egg=2 # Limit compared against the per-female clutch counter in get_breeding
para=dict(nday_limit=nday_limit,ipol=ipol,age_limit=age_limit,
          egg_no=egg_no,day_lay_egg=day_lay_egg,max_lay_egg=max_lay_egg)


# Returns one dict(nsize=..., day=...) entry per simulated day.
dpopulation = to_loop_initial_population(**para)


### Plot population size against day
df = pd.DataFrame(dpopulation)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

输出：

问题：

执行所需的时间随着 nday_limit 的增大呈指数增长。我需要提高代码的效率，怎样才能缩短运行时间？

其他想法：

我很想应用下面的joblib。令我惊讶的是，执行时间更差。

def djob(dpol,k,**kwargs):
    """Advance one organism by a day and add the survivors to ``k``.

    Args:
        dpol: Organism record; its 'd' and 'dborn' counters are incremented
            in place.
        k: List that collects the next day's population; it is extended in
            place and also returned.
        **kwargs: Simulation settings; 'age_limit' is read here and the whole
            mapping is forwarded to ``get_breeding``.

    Returns:
        The same list ``k`` that was passed in.
    """
    dpol['d'] = dpol['d'] + 1
    dpol['dborn'] = dpol['dborn'] + 1
    dpol,h = get_breeding(dpol,**kwargs)

    if h is None and dpol['dborn'] <= kwargs['age_limit']:
        # No offspring today and still within the age limit: keep the organism.
        k.append(dpol)
    elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
        # Bred today and still within the age limit: keep offspring and parent.
        h.extend([dpol])
        k.extend(h)
    # An organism past the age limit is not added (nor are its offspring).

    return k
def to_loop_initial_population(**kwargs):
    """Run the daily simulation, switching to joblib once the population exceeds 50.

    Args:
        **kwargs: Simulation settings. 'ipol' and 'nday_limit' are read here;
            the whole mapping is forwarded to ``djob``.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.

    NOTE(review): ``Parallel``, ``delayed`` and ``itertools`` are used below but
    no import for them is visible in this snippet.
    """

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:


        # Next day's population.
        k = []


        # njob only selects the branch; the value 4 is never passed on
        # (Parallel below is called with n_jobs=-1).
        njob=1 if len(npol)<=50 else 4
        if njob==1:
            print(f'Executing day {nday} with single cpu')
            for dpols in npol:
                k=djob(dpols,k,**kwargs)
        else:
            print(f'Executing day {nday} with single parallel')
            # One djob call is scheduled per organism; the per-call result
            # lists are then flattened into a single population list.
            # NOTE(review): presumably each worker operates on its own copy of
            # k and dpols -- confirm against the joblib backend in use.
            k=Parallel(n_jobs=-1)(delayed(djob)(dpols,k,**kwargs) for dpols in npol)
            k = list(itertools.chain(*k))
            # ll is assigned but not read anywhere in this function.
            ll=1


        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day

对应的参数为：

nday_limit=365

原文

Objective:

To visualize the population size of a particular organism over finite time.

Assumptions:

The organism has a life span of age_limit days
Only Females of age day_lay_egg days can lay the egg, and the female is allowed to lay an egg a maximum of max_lay_egg times. Each breeding session, a maximum of only egg_no eggs can be laid with a 50% probability of producing male offspring.
Initial population of 3 organisms consist of 2 Female and 1 Male

Code Snippets:

Currently, the code below should produce the expected output

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def get_breeding(d,**kwargs):
    """Let one organism breed if it qualifies and return it with its offspring.

    Args:
        d: Organism record. The keys read here are 's' (sex), 'd' (day
            counter), 'lay_egg' (clutches laid so far) and 'dborn' (age).
        **kwargs: Simulation settings; 'max_lay_egg', 'day_lay_egg' and
            'egg_no' are read.

    Returns:
        ``(d, offspring)``. ``offspring`` is a list of new organism records
        when ``d`` bred on this call, otherwise ``None``. ``d`` is the same
        dict that was passed in; its 'lay_egg' counter is incremented in
        place when it breeds.
    """

    # Breeds only when the clutch counter has not passed the limit, the age is
    # strictly greater than the maturity age, and the sex code is 1.
    # NOTE(review): the counter starts at 0 and is compared with `<=`, which
    # permits max_lay_egg + 1 clutches -- confirm this is intended.
    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            # One random sex code (0 or 1, equal probability) per egg.
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            # Offspring start at age 0 with no clutches and copy the parent's day counter.
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None



def to_loop_initial_population(**kwargs):
    """Run the day-by-day simulation and record the population size per day.

    Args:
        **kwargs: Simulation settings. 'ipol' (initial list of organism
            records), 'nday_limit' and 'age_limit' are read here; the whole
            mapping is forwarded to ``get_breeding``.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.
    """

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:
        # print(f'Executing day {nday}')

        # Population carried into the next day; rebuilt from scratch every day.
        k = []
        for dpol in npol:
            # Records are aged in place, so the dicts passed in 'ipol' are mutated too.
            dpol['d'] += 1
            dpol['dborn'] += 1
            dpol,h = get_breeding(dpol,**kwargs)

            if h is None and dpol['dborn'] <= kwargs['age_limit']:
                # No offspring today and still within the age limit: keep the organism.
                k.append(dpol)
            elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
                # Bred today and still within the age limit: keep offspring and parent.
                h.extend([dpol])
                k.extend(h)
            # Otherwise the organism is past the age limit and is dropped,
            # together with any offspring it produced on this day.

        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day


## Simulation settings; all of them are gathered into one dict below
numsex=[1,1,0] # 0: Male, 1: Female

# s: sex, d: day, lay_egg: number of times the female has laid eggs, dborn: the organism's age
ipol=[dict(s=x,d=0, lay_egg=0, dborn=0) for x in numsex] # The initial population
age_limit = 45 # Age limit for the species
egg_no=3 # Number of eggs per clutch
day_lay_egg = 30  # Age a female must exceed before she can lay eggs
nday_limit=360 # Number of days to simulate
max_lay_egg=2 # Limit compared against the per-female clutch counter in get_breeding
para=dict(nday_limit=nday_limit,ipol=ipol,age_limit=age_limit,
          egg_no=egg_no,day_lay_egg=day_lay_egg,max_lay_egg=max_lay_egg)


# Returns one dict(nsize=..., day=...) entry per simulated day.
dpopulation = to_loop_initial_population(**para)


### Plot population size against day
df = pd.DataFrame(dpopulation)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

Output:

Problem/Question:

The execution time increases exponentially with nday_limit. I need to improve the efficiency of the code. How can I speed up the running time?

Other Thoughts:

I am tempted to apply joblib as below. To my surprise, the execution time is worse.

def djob(dpol,k,**kwargs):
    """Advance one organism by a day and add the survivors to ``k``.

    Args:
        dpol: Organism record; its 'd' and 'dborn' counters are incremented
            in place.
        k: List that collects the next day's population; it is extended in
            place and also returned.
        **kwargs: Simulation settings; 'age_limit' is read here and the whole
            mapping is forwarded to ``get_breeding``.

    Returns:
        The same list ``k`` that was passed in.
    """
    dpol['d'] = dpol['d'] + 1
    dpol['dborn'] = dpol['dborn'] + 1
    dpol,h = get_breeding(dpol,**kwargs)

    if h is None and dpol['dborn'] <= kwargs['age_limit']:
        # No offspring today and still within the age limit: keep the organism.
        k.append(dpol)
    elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:
        # Bred today and still within the age limit: keep offspring and parent.
        h.extend([dpol])
        k.extend(h)
    # An organism past the age limit is not added (nor are its offspring).

    return k
def to_loop_initial_population(**kwargs):
    """Run the daily simulation, switching to joblib once the population exceeds 50.

    Args:
        **kwargs: Simulation settings. 'ipol' and 'nday_limit' are read here;
            the whole mapping is forwarded to ``djob``.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.

    NOTE(review): ``Parallel``, ``delayed`` and ``itertools`` are used below but
    no import for them is visible in this snippet.
    """

    npol=kwargs['ipol']
    nday = 0
    total_population_per_day = []
    while nday < kwargs['nday_limit']:


        # Next day's population.
        k = []


        # njob only selects the branch; the value 4 is never passed on
        # (Parallel below is called with n_jobs=-1).
        njob=1 if len(npol)<=50 else 4
        if njob==1:
            print(f'Executing day {nday} with single cpu')
            for dpols in npol:
                k=djob(dpols,k,**kwargs)
        else:
            print(f'Executing day {nday} with single parallel')
            # One djob call is scheduled per organism; the per-call result
            # lists are then flattened into a single population list.
            # NOTE(review): presumably each worker operates on its own copy of
            # k and dpols -- confirm against the joblib backend in use.
            k=Parallel(n_jobs=-1)(delayed(djob)(dpols,k,**kwargs) for dpols in npol)
            k = list(itertools.chain(*k))
            # ll is assigned but not read anywhere in this function.
            ll=1


        total_population_per_day.append(dict(nsize=len(k), day=nday))
        nday += 1
        npol = k

    return total_population_per_day

for

nday_limit=365

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

风渺 2025-01-19 12:34:31

尝试将代码构建为矩阵，例如 state[age][eggs_remaining] = count。它将包含 age_limit 行和 max_lay_egg 列。

雄性从 0 Eggs_remaining 列开始，每次雌性产卵时，它们都会向下移动一个（3->2->1->0 与上面的代码）。

对于每个周期，您只需删除最后一行，迭代 age_limit 之后的所有行，然后插入包含雄性和雌性数量的新第一行。

如果（如您的示例所示）雌性在产下所有卵之前死于老年的可能性微乎其微，您可以将所有内容折叠成 state_alive[age][gender] = count 和一个 state_eggs[eggs_remaining] = count 代替，但除非年龄非常高或者您想要运行数千次模拟，否则没有必要。

回复收藏 0 原文

不即不离 2025-01-19 12:34:31

尽可能使用 numpy 数组操作而不是使用循环可以提高性能，请参阅下面在笔记本中测试的代码 - https://www.kaggle.com/gfteafun/notebook03118c731b

请注意，在比较时间时，nsize 比例很重要。

%%time

# Columns of every row -- s: sex, d: day, lay_egg: clutches laid so far, dborn: age
x = np.array([(sex, 0, 0, 0) for sex in numsex])
iparam = np.array([0, 1, 0, 1])  # daily increment: advances the day and age columns

# Row-wise bounds a breeding female must satisfy. They do not change during
# the run, so they are built once instead of on every iteration.
upper = np.array([2, nday_limit, max_lay_egg, age_limit])
lower = np.array([1, day_lay_egg, 0, day_lay_egg])

total_population_per_day = []
for nday in range(nday_limit):
    x = x + iparam
    # Drop organisms that reached the age limit. They are never counted or
    # bred again, so keeping them only makes every later pass slower.
    x = x[x[:, 3] < age_limit]
    c = np.all(x < upper, axis=1) & np.all(x >= lower, axis=1)
    total_population_per_day.append(dict(nsize=len(x), day=nday))
    n = np.count_nonzero(c)

    if n > 0:
        x[c, 2] = x[c, 2] + 1
        # Draw an independent sex for every egg of every breeding female
        # (n * egg_no draws) rather than reusing egg_no draws for all mothers.
        sexes = np.random.choice([0, 1], size=n * egg_no, p=[.5, .5])
        newborns = np.zeros((sexes.size, 4), dtype=x.dtype)
        newborns[:, 0] = sexes
        newborns[:, 1] = nday
        x = np.vstack((x, newborns))


df = pd.DataFrame(total_population_per_day)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

use numpy array operation as much as possible instead of using loop can improve your performance, see below codes tested in notebook - https://www.kaggle.com/gfteafun/notebook03118c731b

Note that when comparing the time the nsize scale matters.

%%time

# Columns of every row -- s: sex, d: day, lay_egg: clutches laid so far, dborn: age
x = np.array([(sex, 0, 0, 0) for sex in numsex])
iparam = np.array([0, 1, 0, 1])  # daily increment: advances the day and age columns

# Row-wise bounds a breeding female must satisfy. They do not change during
# the run, so they are built once instead of on every iteration.
upper = np.array([2, nday_limit, max_lay_egg, age_limit])
lower = np.array([1, day_lay_egg, 0, day_lay_egg])

total_population_per_day = []
for nday in range(nday_limit):
    x = x + iparam
    # Drop organisms that reached the age limit. They are never counted or
    # bred again, so keeping them only makes every later pass slower.
    x = x[x[:, 3] < age_limit]
    c = np.all(x < upper, axis=1) & np.all(x >= lower, axis=1)
    total_population_per_day.append(dict(nsize=len(x), day=nday))
    n = np.count_nonzero(c)

    if n > 0:
        x[c, 2] = x[c, 2] + 1
        # Draw an independent sex for every egg of every breeding female
        # (n * egg_no draws) rather than reusing egg_no draws for all mothers.
        sexes = np.random.choice([0, 1], size=n * egg_no, p=[.5, .5])
        newborns = np.zeros((sexes.size, 4), dtype=x.dtype)
        newborns[:, 0] = sexes
        newborns[:, 1] = nday
        x = np.vstack((x, newborns))


df = pd.DataFrame(total_population_per_day)
sns.lineplot(x="day", y="nsize", data=df)
plt.xticks(rotation=15)
plt.title('Day vs population')
plt.show()

回复收藏 0 原文

绻影浮沉 2025-01-19 12:34:30

您的代码总体看起来不错，但我可以看到几个改进点，这些改进点显着减慢了您的代码速度。

但必须注意的是，随着 nday 值的增加，代码变慢在很大程度上是无法避免的，因为您需要跟踪的种群不断增长，并且您不断重新填充列表来跟踪它。随着对象数量的增加，循环自然需要更长的时间才能完成，但您可以减少完成单次循环所需的时间。

elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:

在这里，您在已经判断过 h 是否为 None 之后，仍在每次循环中检查 h 的类型。您已经知道此时 h 一定是列表；如果列表无法创建，代码在到达这一行之前就已经报错了。

此外，您对 dpol 的年龄做了重复的条件检查，并且多此一举地先用 dpol 扩展 h、再用 h 扩展 k。结合上一个问题，可以简化为：

# Keep the organism only while it is within the age limit.
if dpol['dborn'] <= kwargs['age_limit']:
    k.append(dpol)

# h is None when nothing was laid, so a plain truthiness test is enough.
# NOTE(review): unlike the if/elif version, offspring are kept here even when
# the parent has passed the age limit -- confirm which rule is intended.
if h:
    k.extend(h)

结果是相同的。

此外，您还传递了大量 **kwargs。这表明您的代码应该写成一个类，把不变的参数保存为 self.parameter。您甚至可以在此处使用数据类（https://docs.python.org/3/library/dataclasses.html）

此外，您混合了不必要的函数职责，并使您的代码更加混乱。例如：

def get_breeding(d,**kwargs):
    """Let one organism breed if it qualifies and return it with its offspring.

    Args:
        d: Organism record. The keys read here are 's' (sex), 'd' (day
            counter), 'lay_egg' (clutches laid so far) and 'dborn' (age).
        **kwargs: Simulation settings; 'max_lay_egg', 'day_lay_egg' and
            'egg_no' are read.

    Returns:
        ``(d, offspring)``. ``offspring`` is a list of new organism records
        when ``d`` bred on this call, otherwise ``None``. ``d`` is the same
        dict that was passed in; its 'lay_egg' counter is incremented in
        place when it breeds.
    """

    # Breeds only when the clutch counter has not passed the limit, the age is
    # strictly greater than the maturity age, and the sex code is 1.
    # NOTE(review): the counter starts at 0 and is compared with `<=`, which
    # permits max_lay_egg + 1 clutches -- confirm this is intended.
    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            # One random sex code (0 or 1, equal probability) per egg.
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            # Offspring start at age 0 with no clutches and copy the parent's day counter.
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None

此代码包含两个职责：如果满足条件，则生成一个新个体，并检查这些条件，并根据它们返回两个不同的东西。

最好通过两个单独的函数来完成，一个函数仅检查条件，另一个函数生成一个新个体，如下所示：

def check_breeding(d, max_lay_egg, day_lay_egg):
    """Return True when the organism record ``d`` may lay eggs today.

    All three conditions must hold: the 'lay_egg' counter is at most
    ``max_lay_egg``, the age 'dborn' is strictly greater than
    ``day_lay_egg``, and the sex code 's' equals 1.
    """
    if d['lay_egg'] > max_lay_egg:
        return False
    if d['dborn'] <= day_lay_egg:
        return False
    return d['s'] == 1


def get_breeding(d, egg_no):
    """Create the offspring records for one clutch laid by ``d``.

    Draws ``egg_no`` sex codes (0 or 1 with equal probability) and returns one
    new record per egg. Every record copies the parent's day counter ``d['d']``
    and starts with ``lay_egg=0`` and ``dborn=0``. ``d`` itself is not changed.
    """
    sexes = np.random.choice([0, 1], size=egg_no, p=[.5, .5]).tolist()
    return [
        {'s': sex, 'd': d['d'], 'lay_egg': 0, 'dborn': 0}
        for sex in sexes
    ]

其中，如果满足条件，可以在遍历列表时就地更新 d['lay_egg']。

如果您在遍历列表的同时就地修改列表，还可以进一步加快代码速度（通常不建议这样做，但只要您清楚自己在做什么，就完全没问题。请务必通过索引来遍历，把索引上限限制为列表原先的长度，并在删除元素时递减索引）

示例：

i = 0
# Only organisms present at the start of the pass are visited; newborns are
# appended beyond this bound and are first processed on the next pass.
maxiter = len(npol)
while i < maxiter:
    if check_breeding(npol[i], max_lay_egg, day_lay_egg):
        npol.extend(get_breeding(npol[i], egg_no))
        # Count the clutch, otherwise the female would breed on every pass.
        npol[i]['lay_egg'] += 1

    if npol[i]['dborn'] > age_limit:
        npol.pop(i)
        # The next organism has shifted into slot i: step back so that the
        # increment below lands on it, and shrink the bound accordingly.
        i -= 1
        maxiter -= 1

    # Advance to the next organism; without this the loop never terminates.
    i += 1

这可以显着减少处理时间，因为您不必在每次迭代时都创建新列表并重新追加所有元素。

最后，您可以检查一些人口增长方程和统计方法，甚至可以通过迭代将整个代码简化为计算问题，尽管这不再是模拟。

编辑

我已经完全实现了我对代码改进的建议，并使用 %%time 在 jupyter 笔记本中对它们进行了计时。我已经将函数定义从两者中分离出来，这样它们就不会占用时间，结果很能说明问题。我还让雌性在 100% 的情况下产生另一只雌性，以消除随机性，否则会更快。我比较了两者的结果，以验证它们产生相同的结果（它们确实产生了相同的结果，但我删除了“d_born”参数，因为除了设置之外，它没有在代码中使用）。

您的实现，使用 nday_limit=100 和 day_lay_egg=15：

Wall time 23.5s

我使用相同参数的实现：

Wall time 18.9s

因此您可以看出差异非常显着，对于较大的 nday_limit 值，差异会变得更大。

编辑后的代码的完整实现：

from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


@dataclass
class Organism:
    """State of one simulated organism.

    The constructor is the one generated by ``@dataclass``: ``Organism(sex)``
    creates a newborn, and ``Organism(sex, times_laid_eggs, age)`` can build
    an organism in an arbitrary state. A hand-written ``__init__`` is not
    needed and would suppress the generated one.
    """

    sex: int  # 0: male, 1: female
    times_laid_eggs: int = 0  # clutches laid so far
    age: int = 0  # age in days


def check_breeding(d, max_lay_egg, day_lay_egg):
    """Return True when organism ``d`` may lay eggs today.

    All three conditions must hold: ``times_laid_eggs`` is at most
    ``max_lay_egg``, ``age`` is strictly greater than ``day_lay_egg``, and
    ``sex`` equals 1.
    """
    if d.times_laid_eggs > max_lay_egg:
        return False
    if d.age <= day_lay_egg:
        return False
    return d.sex == 1


def get_breeding(egg_no, female_prob=1.0):
    """Create the newborn organisms of one clutch.

    Args:
        egg_no: Number of eggs in the clutch.
        female_prob: Probability that an egg hatches as a female (sex 1).
            The default of 1.0 reproduces the deterministic setting used for
            the timing comparison; pass 0.5 for the actual simulation.

    Returns:
        A list of ``egg_no`` new ``Organism`` instances.
    """
    nums = np.random.choice(
        [0, 1], size=egg_no, p=[1.0 - female_prob, female_prob]
    ).tolist()
    return [Organism(sex) for sex in nums]


def simulate(organisms, age_limit, egg_no, day_lay_egg, max_lay_egg, nday_limit):
    """Simulate the population day by day, editing the list in place.

    Args:
        organisms: Initial population. The list and the organisms in it are
            modified in place.
        age_limit: An organism is removed once its age exceeds this value.
        egg_no: Number of eggs per clutch, forwarded to ``get_breeding``.
        day_lay_egg: Maturity age, forwarded to ``check_breeding``.
        max_lay_egg: Clutch limit, forwarded to ``check_breeding``.
        nday_limit: Number of days to simulate.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.
    """
    population = organisms
    history = []
    day = 0

    while day < nday_limit:
        idx = 0
        # Newborns are appended past this bound, so they are first aged on
        # the following day.
        pending = len(population)
        while idx < pending:
            organism = population[idx]
            organism.age += 1

            if check_breeding(organism, max_lay_egg, day_lay_egg):
                population.extend(get_breeding(egg_no))
                organism.times_laid_eggs += 1

            if organism.age > age_limit:
                # The next organism shifts into this slot, so idx stays put.
                population.pop(idx)
                pending -= 1
            else:
                idx += 1

        history.append(dict(nsize=len(population), day=day))
        day += 1

    return history


if __name__ == "__main__":
    # Settings used for the timing comparison.
    age_limit = 45  # Age limit for the species
    egg_no = 3  # Number of eggs
    day_lay_egg = 15  # Matured age for egg laying
    max_lay_egg = 2
    nday_limit = 100

    numsex = [1, 1, 0]  # 0: Male, 1: Female
    ipol = list(map(Organism, numsex))  # The initial population

    dpopulation = simulate(
        ipol,
        age_limit=age_limit,
        egg_no=egg_no,
        day_lay_egg=day_lay_egg,
        max_lay_egg=max_lay_egg,
        nday_limit=nday_limit,
    )

    # Plot population size against day.
    df = pd.DataFrame(dpopulation)
    sns.lineplot(data=df, x="day", y="nsize")
    plt.title('Day vs population')
    plt.xticks(rotation=15)
    plt.show()

Your code looks alright overall but I can see several points of improvement that are slowing your code down significantly.

Though it must be noted that you can't really help the code slowing down too much with increasing nday values, since the population you need to keep track of keeps growing and you keep re-populating a list to track this. It's expected as the number of objects increase, the loops will take longer to complete, but you can reduce the time it takes to complete a single loop.

elif isinstance(h, list) and dpol['dborn'] <= kwargs['age_limit']:

Here you check the type of h on every single iteration, after already testing whether it is None. You know for a fact that h is going to be a list at this point; if the list could not be created, your code would have raised an error before reaching that line.

Furthermore, you have a redundant condition check for age of dpol, and then redundantly first extend h by dpol and then k by h. This can be simplified together with the previous issue to this:

# Keep the organism only while it is within the age limit.
if dpol['dborn'] <= kwargs['age_limit']:
    k.append(dpol)

# h is None when nothing was laid, so a plain truthiness test is enough.
# NOTE(review): unlike the if/elif version, offspring are kept here even when
# the parent has passed the age limit -- confirm which rule is intended.
if h:
    k.extend(h)

The results are identical.

Additionally, you're passing around a lot of **kwargs. This is a sign that your code should be a class instead, where some unchanging parameters are saved through self.parameter. You could even use a dataclass here (https://docs.python.org/3/library/dataclasses.html)

Also, you mix responsibilities of functions which is unnecessary and makes your code more confusing. For instance:

def get_breeding(d,**kwargs):
    """Let one organism breed if it qualifies and return it with its offspring.

    Args:
        d: Organism record. The keys read here are 's' (sex), 'd' (day
            counter), 'lay_egg' (clutches laid so far) and 'dborn' (age).
        **kwargs: Simulation settings; 'max_lay_egg', 'day_lay_egg' and
            'egg_no' are read.

    Returns:
        ``(d, offspring)``. ``offspring`` is a list of new organism records
        when ``d`` bred on this call, otherwise ``None``. ``d`` is the same
        dict that was passed in; its 'lay_egg' counter is incremented in
        place when it breeds.
    """

    # Breeds only when the clutch counter has not passed the limit, the age is
    # strictly greater than the maturity age, and the sex code is 1.
    # NOTE(review): the counter starts at 0 and is compared with `<=`, which
    # permits max_lay_egg + 1 clutches -- confirm this is intended.
    if d['lay_egg'] <= kwargs['max_lay_egg'] and d['dborn'] > kwargs['day_lay_egg'] and d['s'] == 1:
            # One random sex code (0 or 1, equal probability) per egg.
            nums = np.random.choice([0, 1], size=kwargs['egg_no'], p=[.5, .5]).tolist()
            # Offspring start at age 0 with no clutches and copy the parent's day counter.
            npol=[dict(s=x,d=d['d'], lay_egg=0, dborn=0) for x in nums]
            d['lay_egg'] = d['lay_egg'] + 1
            return d,npol

    return d,None

This code contains two responsibilities: Generating a new individual if conditions are met, and checking these conditions, and returning two different things based on them.

This would be better done through two separate functions, one which simply checks the conditions, and another that generates a new individual as follows:

def check_breeding(d, max_lay_egg, day_lay_egg):
    """Return True when the organism record ``d`` may lay eggs today.

    All three conditions must hold: the 'lay_egg' counter is at most
    ``max_lay_egg``, the age 'dborn' is strictly greater than
    ``day_lay_egg``, and the sex code 's' equals 1.
    """
    if d['lay_egg'] > max_lay_egg:
        return False
    if d['dborn'] <= day_lay_egg:
        return False
    return d['s'] == 1


def get_breeding(d, egg_no):
    """Create the offspring records for one clutch laid by ``d``.

    Draws ``egg_no`` sex codes (0 or 1 with equal probability) and returns one
    new record per egg. Every record copies the parent's day counter ``d['d']``
    and starts with ``lay_egg=0`` and ``dborn=0``. ``d`` itself is not changed.
    """
    sexes = np.random.choice([0, 1], size=egg_no, p=[.5, .5]).tolist()
    return [
        {'s': sex, 'd': d['d'], 'lay_egg': 0, 'dborn': 0}
        for sex in sexes
    ]

Where d['lay_egg'] could be updated in-place when iterating over the list if the condition is met.

You could speed up your code even further this way, if you edit the list as you iterate over it (it is not typically recommended but it's perfectly fine to do if you know what you're doing. Make sure to do it by using the index and limit it to the previous bounds of the length of the list, and decrement the index when an element is removed)

Example:

i = 0
# Only organisms present at the start of the pass are visited; newborns are
# appended beyond this bound and are first processed on the next pass.
maxiter = len(npol)
while i < maxiter:
    if check_breeding(npol[i], max_lay_egg, day_lay_egg):
        npol.extend(get_breeding(npol[i], egg_no))
        # Count the clutch, otherwise the female would breed on every pass.
        npol[i]['lay_egg'] += 1

    if npol[i]['dborn'] > age_limit:
        npol.pop(i)
        # The next organism has shifted into slot i: step back so that the
        # increment below lands on it, and shrink the bound accordingly.
        i -= 1
        maxiter -= 1

    # Advance to the next organism; without this the loop never terminates.
    i += 1

Which could significantly reduce processing time since you're not making a new list and appending all elements all over again every iteration.

Finally, you could check some population growth equation and statistical methods, and you could even reduce this whole code to a calculation problem with iterations, though that wouldn't be a sim anymore.

Edit

I've fully implemented my suggestions for improvements to your code and timed both versions in a Jupyter notebook using %%time. I've separated the function definitions out of both so they wouldn't contribute to the time, and the results are telling. I also made females produce another female 100% of the time to remove randomness; otherwise it would be even faster. I compared the results from both to verify they produce identical results (they do, but I removed the 'd_born' parameter because it's not used in the code apart from being set).

Your implementation, with nday_limit=100 and day_lay_egg=15:

Wall time 23.5s

My implementation with same parameters:

Wall time 18.9s

So you can tell the difference is quite significant, which grows even farther apart for larger nday_limit values.

Full implementation of edited code:

from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


@dataclass
class Organism:
    """State of one simulated organism.

    The constructor is the one generated by ``@dataclass``: ``Organism(sex)``
    creates a newborn, and ``Organism(sex, times_laid_eggs, age)`` can build
    an organism in an arbitrary state. A hand-written ``__init__`` is not
    needed and would suppress the generated one.
    """

    sex: int  # 0: male, 1: female
    times_laid_eggs: int = 0  # clutches laid so far
    age: int = 0  # age in days


def check_breeding(d, max_lay_egg, day_lay_egg):
    """Return True when organism ``d`` may lay eggs today.

    All three conditions must hold: ``times_laid_eggs`` is at most
    ``max_lay_egg``, ``age`` is strictly greater than ``day_lay_egg``, and
    ``sex`` equals 1.
    """
    if d.times_laid_eggs > max_lay_egg:
        return False
    if d.age <= day_lay_egg:
        return False
    return d.sex == 1


def get_breeding(egg_no, female_prob=1.0):
    """Create the newborn organisms of one clutch.

    Args:
        egg_no: Number of eggs in the clutch.
        female_prob: Probability that an egg hatches as a female (sex 1).
            The default of 1.0 reproduces the deterministic setting used for
            the timing comparison; pass 0.5 for the actual simulation.

    Returns:
        A list of ``egg_no`` new ``Organism`` instances.
    """
    nums = np.random.choice(
        [0, 1], size=egg_no, p=[1.0 - female_prob, female_prob]
    ).tolist()
    return [Organism(sex) for sex in nums]


def simulate(organisms, age_limit, egg_no, day_lay_egg, max_lay_egg, nday_limit):
    """Simulate the population day by day, editing the list in place.

    Args:
        organisms: Initial population. The list and the organisms in it are
            modified in place.
        age_limit: An organism is removed once its age exceeds this value.
        egg_no: Number of eggs per clutch, forwarded to ``get_breeding``.
        day_lay_egg: Maturity age, forwarded to ``check_breeding``.
        max_lay_egg: Clutch limit, forwarded to ``check_breeding``.
        nday_limit: Number of days to simulate.

    Returns:
        A list with one ``dict(nsize=..., day=...)`` entry per simulated day.
    """
    population = organisms
    history = []
    day = 0

    while day < nday_limit:
        idx = 0
        # Newborns are appended past this bound, so they are first aged on
        # the following day.
        pending = len(population)
        while idx < pending:
            organism = population[idx]
            organism.age += 1

            if check_breeding(organism, max_lay_egg, day_lay_egg):
                population.extend(get_breeding(egg_no))
                organism.times_laid_eggs += 1

            if organism.age > age_limit:
                # The next organism shifts into this slot, so idx stays put.
                population.pop(idx)
                pending -= 1
            else:
                idx += 1

        history.append(dict(nsize=len(population), day=day))
        day += 1

    return history


if __name__ == "__main__":
    # Settings used for the timing comparison.
    age_limit = 45  # Age limit for the species
    egg_no = 3  # Number of eggs
    day_lay_egg = 15  # Matured age for egg laying
    max_lay_egg = 2
    nday_limit = 100

    numsex = [1, 1, 0]  # 0: Male, 1: Female
    ipol = list(map(Organism, numsex))  # The initial population

    dpopulation = simulate(
        ipol,
        age_limit=age_limit,
        egg_no=egg_no,
        day_lay_egg=day_lay_egg,
        max_lay_egg=max_lay_egg,
        nday_limit=nday_limit,
    )

    # Plot population size against day.
    df = pd.DataFrame(dpopulation)
    sns.lineplot(data=df, x="day", y="nsize")
    plt.title('Day vs population')
    plt.xticks(rotation=15)
    plt.show()

回复收藏 0 原文

~没有更多了~