如何使用线程使其运行速度更快?

发布于 2025-02-11 07:59:58 字数 6143 浏览 0 评论 0原文

我有一段将数据写入文件的代码。我想使用线程让它并发运行,从而提高速度。有人建议使用 asyncio,但我不太理解它。这段代码用于清洗 CSV 文件,例如读取阿拉伯格式的日期,并将其转换为英文日历(calendar)格式。有人能简要说明一下该如何实现吗?

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020

@author: siradmin
        ****** DATE ISSUES CODE ******
    The purpose of this code is to correct date in different date columns
"""
import os
import datetime as dt

import pandas as pd

# Directory the script switches to before running; every output file below is
# written relative to it.
WORK_DIR = "D://Medgulf Motor/2022/Code for date cleaning"
SOURCE_CSV = "D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv"
CLEANED_CSV = 'Tricast Policy Data.csv'
CHUNK_ROWS = 100000

# Date columns that are cleaned in every chunk.
# ('Istemarah Exp.' and 'Additional Driver DOB' are currently not processed.)
columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']

# Formats are tried in this order, so day-first layouts win over month-first.
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']

# Integer-like values are converted as SERIAL_BASE + (value - 2) days
# (presumably Excel serial day numbers -- TODO confirm with the data owner).
SERIAL_BASE = dt.datetime(1900, 1, 1)

# Column -> file that receives the rows whose value in that column could not be
# cleaned.
# NOTE(review): 'Status Date' and 'Insured Date of Birth' are cleaned but their
# rejected rows were never collected by the original code; that is kept as-is.
reject_files = {
    "Issue Date": "Issuedate.csv",
    "Inception Date": "Inceptiondate.csv",
    "Expiry Date": "Expirydate.csv",
    "Policy Status Date": "policystatedate.csv",
    "Vehicle Issue Date": "vehicleissuedate.csv",
    "Vehicle Inception Date": "vehicleinceptiondate.csv",
    "Vehicle Expiry Date": "vehicleexpirydate.csv",
    "Istemarah Exp.": "istemarhexpiry.csv",
    "Main Driver DOB": "maindriverdob.csv",
    "Additional Driver DOB": "adddriverdob.csv",
}


def normalize_date_text(value):
    """Return ``str(value)`` with the known textual date defects repaired.

    Drops a trailing midnight time, turns ``0/0/`` into ``1/1/`` and zero-pads
    the ``/0/`` and ``/2/`` month fragments, in that order.
    """
    text = str(value)
    text = text.replace(" 00:00:00", "")
    text = text.replace("0/0/", "1/1/")
    text = text.replace("/0/", "/01/")
    text = text.replace("/2/", "/02/")
    return text


def parse_date_value(value):
    """Clean a single raw cell from a date column.

    Returns:
        ``datetime.date`` when the value is an integer-like serial number or
        matches one of ``fmts2``; the normalised text itself when no format
        matches but the text contains ``29/02`` or ``29-02`` (such values are
        kept verbatim rather than rejected); ``None`` when the value cannot be
        cleaned at all.
    """
    try:
        serial = int(value)
        return (SERIAL_BASE + dt.timedelta(days=serial - 2)).date()
    except (ValueError, TypeError, OverflowError):
        # Not a usable serial number (text, NaN, None, out of range):
        # fall through to the textual formats.
        pass

    text = normalize_date_text(value)
    candidate = text.strip()
    for fmt in fmts2:
        try:
            return dt.datetime.strptime(candidate, fmt).date()
        except ValueError:
            continue

    if "29/02" in text or "29-02" in text:
        return text
    return None


def clean_chunk(chunk, rejected):
    """Clean every column listed in ``columns`` of ``chunk`` in place.

    Args:
        chunk: DataFrame holding one chunk of the source file.
        rejected: dict mapping column name -> list of DataFrames; the rows
            whose value in a column from ``reject_files`` could not be cleaned
            are appended here.

    Returns:
        The same ``chunk`` object, with each date column replaced by strings
        (ISO dates, kept 29-Feb texts, or "" for rejected values).
    """
    for col in columns:
        cleaned = []
        bad_positions = []
        for pos, raw in enumerate(chunk[col]):
            parsed = parse_date_value(raw)
            if parsed is None:
                cleaned.append("")
                bad_positions.append(pos)
            else:
                cleaned.append(str(parsed))

        # Slice the rejected rows once per column, before the column is
        # overwritten, so the offending raw value is still visible in them.
        if bad_positions and col in reject_files:
            rejected.setdefault(col, []).append(chunk.iloc[bad_positions])

        chunk[col] = cleaned
        print("Completed", col)
    return chunk


def main():
    """Clean the source CSV chunk by chunk and write the result files.

    Returns:
        dict mapping column name -> list of DataFrames with the rejected rows.
    """
    os.chdir(WORK_DIR)
    reader = pd.read_csv(SOURCE_CSV, engine='python', chunksize=CHUNK_ROWS)
    rejected = {}

    print(dt.datetime.now())
    for cx, chunk in enumerate(reader):
        clean_chunk(chunk, rejected)
        print('we have completed ', cx, 'chunk\n')
        # NOTE(review): mode='a' means re-running the script appends to an
        # existing output file; delete the old file first if that is unwanted.
        chunk.to_csv(CLEANED_CSV, mode='a', index=False, header=(cx == 0))
    print(dt.datetime.now())

    # One concat per column instead of one DataFrame.append per rejected row.
    for col, filename in reject_files.items():
        parts = rejected.get(col)
        if parts:
            pd.concat(parts).to_csv(filename)
    return rejected


if __name__ == "__main__":
    main()

###############################################################################

编辑:这是整个代码。

我的主管告诉我,并发可以用在最后将数据写入 CSV 文件的那一部分。

I have this code to load data to a file. I want to make it run concurrently using threads to make it faster. Some people recommended using asyncio, but I couldn't really understand it. This code is for cleaning a CSV file. For example, it reads dates in Arabic format and converts them to the English calendar. Can anyone provide a brief overview of how this can be done?

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020

@author: siradmin
        ****** DATE ISSUES CODE ******
    The purpose of this code is to correct date in different date columns
"""
import os
import datetime as dt

import pandas as pd

# Directory the script switches to before running; every output file below is
# written relative to it.
WORK_DIR = "D://Medgulf Motor/2022/Code for date cleaning"
SOURCE_CSV = "D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv"
CLEANED_CSV = 'Tricast Policy Data.csv'
CHUNK_ROWS = 100000

# Date columns that are cleaned in every chunk.
# ('Istemarah Exp.' and 'Additional Driver DOB' are currently not processed.)
columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']

# Formats are tried in this order, so day-first layouts win over month-first.
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']

# Integer-like values are converted as SERIAL_BASE + (value - 2) days
# (presumably Excel serial day numbers -- TODO confirm with the data owner).
SERIAL_BASE = dt.datetime(1900, 1, 1)

# Column -> file that receives the rows whose value in that column could not be
# cleaned.
# NOTE(review): 'Status Date' and 'Insured Date of Birth' are cleaned but their
# rejected rows were never collected by the original code; that is kept as-is.
reject_files = {
    "Issue Date": "Issuedate.csv",
    "Inception Date": "Inceptiondate.csv",
    "Expiry Date": "Expirydate.csv",
    "Policy Status Date": "policystatedate.csv",
    "Vehicle Issue Date": "vehicleissuedate.csv",
    "Vehicle Inception Date": "vehicleinceptiondate.csv",
    "Vehicle Expiry Date": "vehicleexpirydate.csv",
    "Istemarah Exp.": "istemarhexpiry.csv",
    "Main Driver DOB": "maindriverdob.csv",
    "Additional Driver DOB": "adddriverdob.csv",
}


def normalize_date_text(value):
    """Return ``str(value)`` with the known textual date defects repaired.

    Drops a trailing midnight time, turns ``0/0/`` into ``1/1/`` and zero-pads
    the ``/0/`` and ``/2/`` month fragments, in that order.
    """
    text = str(value)
    text = text.replace(" 00:00:00", "")
    text = text.replace("0/0/", "1/1/")
    text = text.replace("/0/", "/01/")
    text = text.replace("/2/", "/02/")
    return text


def parse_date_value(value):
    """Clean a single raw cell from a date column.

    Returns:
        ``datetime.date`` when the value is an integer-like serial number or
        matches one of ``fmts2``; the normalised text itself when no format
        matches but the text contains ``29/02`` or ``29-02`` (such values are
        kept verbatim rather than rejected); ``None`` when the value cannot be
        cleaned at all.
    """
    try:
        serial = int(value)
        return (SERIAL_BASE + dt.timedelta(days=serial - 2)).date()
    except (ValueError, TypeError, OverflowError):
        # Not a usable serial number (text, NaN, None, out of range):
        # fall through to the textual formats.
        pass

    text = normalize_date_text(value)
    candidate = text.strip()
    for fmt in fmts2:
        try:
            return dt.datetime.strptime(candidate, fmt).date()
        except ValueError:
            continue

    if "29/02" in text or "29-02" in text:
        return text
    return None


def clean_chunk(chunk, rejected):
    """Clean every column listed in ``columns`` of ``chunk`` in place.

    Args:
        chunk: DataFrame holding one chunk of the source file.
        rejected: dict mapping column name -> list of DataFrames; the rows
            whose value in a column from ``reject_files`` could not be cleaned
            are appended here.

    Returns:
        The same ``chunk`` object, with each date column replaced by strings
        (ISO dates, kept 29-Feb texts, or "" for rejected values).
    """
    for col in columns:
        cleaned = []
        bad_positions = []
        for pos, raw in enumerate(chunk[col]):
            parsed = parse_date_value(raw)
            if parsed is None:
                cleaned.append("")
                bad_positions.append(pos)
            else:
                cleaned.append(str(parsed))

        # Slice the rejected rows once per column, before the column is
        # overwritten, so the offending raw value is still visible in them.
        if bad_positions and col in reject_files:
            rejected.setdefault(col, []).append(chunk.iloc[bad_positions])

        chunk[col] = cleaned
        print("Completed", col)
    return chunk


def main():
    """Clean the source CSV chunk by chunk and write the result files.

    Returns:
        dict mapping column name -> list of DataFrames with the rejected rows.
    """
    os.chdir(WORK_DIR)
    reader = pd.read_csv(SOURCE_CSV, engine='python', chunksize=CHUNK_ROWS)
    rejected = {}

    print(dt.datetime.now())
    for cx, chunk in enumerate(reader):
        clean_chunk(chunk, rejected)
        print('we have completed ', cx, 'chunk\n')
        # NOTE(review): mode='a' means re-running the script appends to an
        # existing output file; delete the old file first if that is unwanted.
        chunk.to_csv(CLEANED_CSV, mode='a', index=False, header=(cx == 0))
    print(dt.datetime.now())

    # One concat per column instead of one DataFrame.append per rejected row.
    for col, filename in reject_files.items():
        parts = rejected.get(col)
        if parts:
            pd.concat(parts).to_csv(filename)
    return rejected


if __name__ == "__main__":
    main()

###############################################################################

Edit: this is the whole code.

My supervisor told me concurrency can be applied to the last part where the data is being loaded to the csv files.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文