如何使用线程使其运行得更快？

发布于 2025-02-11 07:59:58 字数 6143 浏览 0 评论 0原文

我有一段把数据写入文件的代码。我想用线程让它并发运行，从而加快速度。有人建议使用 asyncio，但我不太理解。这段代码用于清洗 CSV 文件，例如读取阿拉伯格式的日期，并将其转换为英文日历格式。谁能简要说明一下该如何实现？

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020

@author: siradmin
        ****** DATE ISSUES CODE ******
    The purpose of this code is to correct date in different date columns
"""
import os
import datetime as dt

import pandas as pd

# Locations used by the script. The relative output files below are written
# inside WORK_DIR because main() changes into it first.
WORK_DIR = "D://Medgulf Motor/2022/Code for date cleaning"
SOURCE_CSV = "D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv"
OUTPUT_CSV = 'Tricast Policy Data.csv'
CHUNK_SIZE = 100000

# Date columns that get cleaned in every chunk.
# ('Istemarah Exp.' and 'Additional Driver DOB' are currently disabled.)
columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']

# Accepted input layouts, tried in this order; the first one that parses wins.
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']

# Column -> file receiving the rows whose value in that column could not be
# cleaned. 'Status Date' and 'Insured Date of Birth' are deliberately absent:
# the original script never collected rejects for them either.
REJECT_FILES = {
    "Issue Date": "Issuedate.csv",
    "Inception Date": "Inceptiondate.csv",
    "Expiry Date": "Expirydate.csv",
    "Policy Status Date": "policystatedate.csv",
    "Vehicle Issue Date": "vehicleissuedate.csv",
    "Vehicle Inception Date": "vehicleinceptiondate.csv",
    "Vehicle Expiry Date": "vehicleexpirydate.csv",
    "Istemarah Exp.": "istemarhexpiry.csv",
    "Main Driver DOB": "maindriverdob.csv",
    "Additional Driver DOB": "adddriverdob.csv",
}


def clean_date_value(value):
    """Clean one raw cell of a date column.

    Returns a ``(text, rejected)`` tuple:

    * ``("YYYY-MM-DD", False)`` when the value is an integer-like day serial
      or matches one of the layouts in ``fmts2``;
    * ``(normalised_text, False)`` when nothing parses but the text contains
      ``29/02`` or ``29-02`` (kept verbatim instead of being blanked);
    * ``("", True)`` for everything else, including empty/NaN cells.
    """
    try:
        value = int(value)
        # Integer-like cells are treated as day serials: 1900-01-01 plus
        # (serial - 2) days.
        value = (dt.datetime(1900, 1, 1) + dt.timedelta(days=value - 2)).date()
    except (TypeError, ValueError, OverflowError):
        # Not a serial (or out of the representable range): fall through and
        # try to read it as text.
        pass

    text = (str(value)
            .replace(" 00:00:00", "")
            .replace("0/0/", "1/1/")
            .replace("/0/", "/01/")
            .replace("/2/", "/02/"))
    candidate = text.strip()

    for fmt in fmts2:
        try:
            return str(dt.datetime.strptime(candidate, fmt).date()), False
        except ValueError:
            continue

    if "29/02" in text or "29-02" in text:
        return text, False
    return "", True


def clean_chunk(chunk, rejects):
    """Clean every column listed in ``columns`` of ``chunk`` in place.

    ``rejects`` maps a column name to a list of DataFrames. For each column
    that has an entry in ``REJECT_FILES``, the rows whose value could not be
    cleaned are appended to ``rejects[column]`` as one DataFrame per chunk.
    The rows are captured before the column is overwritten, so they still
    show the offending raw value.
    """
    for col in columns:
        cleaned = []
        bad_positions = []
        for pos, raw in enumerate(chunk[col]):
            text, rejected = clean_date_value(raw)
            cleaned.append(text)
            if rejected:
                bad_positions.append(pos)

        if bad_positions and col in REJECT_FILES:
            # One slice per column and chunk instead of one DataFrame.append
            # per row (quadratic, and removed in pandas 2.0).
            rejects[col].append(chunk.iloc[bad_positions])

        chunk[col] = cleaned
        print("Completed", col)


def main():
    """Read the source CSV in chunks, clean it, and write the result files."""
    os.chdir(WORK_DIR)

    reader = pd.read_csv(SOURCE_CSV, engine='python', chunksize=CHUNK_SIZE)
    rejects = {col: [] for col in REJECT_FILES}

    print(dt.datetime.now())
    for cx, chunk in enumerate(reader):
        clean_chunk(chunk, rejects)
        print('we have completed ', cx, 'chunk\n')
        # Append mode: only the first chunk carries the header row.
        chunk.to_csv(OUTPUT_CSV, mode='a', index=False, header=(cx == 0))
    print(dt.datetime.now())

    for col, frames in rejects.items():
        if frames:
            pd.concat(frames).to_csv(REJECT_FILES[col])


if __name__ == "__main__":
    main()

###############################################################################

编辑：这是整个代码。

我的主管告诉我并发可以应用于将数据加载到CSV文件的最后一部分。

原文

I have this code that loads data into a file. I want to make it run concurrently using threads to make it faster. Some people recommended asyncio, but I couldn't really understand it. This code cleans a CSV file; for example, it reads dates in Arabic format and converts them to the English calendar. Can anyone provide a brief overview of how this can be done?

# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 00:03:38 2020

@author: siradmin
        ****** DATE ISSUES CODE ******
    The purpose of this code is to correct date in different date columns
"""
import os
import datetime as dt

import pandas as pd

# Locations used by the script. The relative output files below are written
# inside WORK_DIR because main() changes into it first.
WORK_DIR = "D://Medgulf Motor/2022/Code for date cleaning"
SOURCE_CSV = "D://Medgulf Motor/2022/Data/Pricing Data 11.05.2021/Tricast/TricastPolicyData.csv"
OUTPUT_CSV = 'Tricast Policy Data.csv'
CHUNK_SIZE = 100000

# Date columns that get cleaned in every chunk.
# ('Istemarah Exp.' and 'Additional Driver DOB' are currently disabled.)
columns = ['Issue Date', 'Inception Date', 'Expiry Date', 'Policy Status Date',
           'Vehicle Issue Date', 'Vehicle Inception Date', 'Vehicle Expiry Date',
           'Status Date', 'Insured Date of Birth', 'Main Driver DOB']

# Accepted input layouts, tried in this order; the first one that parses wins.
fmts2 = ['%d/%m/%Y', '%d/%m/%y', '%d-%m-%Y', '%d-%m-%y', '%m/%d/%Y', '%Y/%m/%d',
         '%Y-%m-%d', '%d|%m|%Y']

# Column -> file receiving the rows whose value in that column could not be
# cleaned. 'Status Date' and 'Insured Date of Birth' are deliberately absent:
# the original script never collected rejects for them either.
REJECT_FILES = {
    "Issue Date": "Issuedate.csv",
    "Inception Date": "Inceptiondate.csv",
    "Expiry Date": "Expirydate.csv",
    "Policy Status Date": "policystatedate.csv",
    "Vehicle Issue Date": "vehicleissuedate.csv",
    "Vehicle Inception Date": "vehicleinceptiondate.csv",
    "Vehicle Expiry Date": "vehicleexpirydate.csv",
    "Istemarah Exp.": "istemarhexpiry.csv",
    "Main Driver DOB": "maindriverdob.csv",
    "Additional Driver DOB": "adddriverdob.csv",
}


def clean_date_value(value):
    """Clean one raw cell of a date column.

    Returns a ``(text, rejected)`` tuple:

    * ``("YYYY-MM-DD", False)`` when the value is an integer-like day serial
      or matches one of the layouts in ``fmts2``;
    * ``(normalised_text, False)`` when nothing parses but the text contains
      ``29/02`` or ``29-02`` (kept verbatim instead of being blanked);
    * ``("", True)`` for everything else, including empty/NaN cells.
    """
    try:
        value = int(value)
        # Integer-like cells are treated as day serials: 1900-01-01 plus
        # (serial - 2) days.
        value = (dt.datetime(1900, 1, 1) + dt.timedelta(days=value - 2)).date()
    except (TypeError, ValueError, OverflowError):
        # Not a serial (or out of the representable range): fall through and
        # try to read it as text.
        pass

    text = (str(value)
            .replace(" 00:00:00", "")
            .replace("0/0/", "1/1/")
            .replace("/0/", "/01/")
            .replace("/2/", "/02/"))
    candidate = text.strip()

    for fmt in fmts2:
        try:
            return str(dt.datetime.strptime(candidate, fmt).date()), False
        except ValueError:
            continue

    if "29/02" in text or "29-02" in text:
        return text, False
    return "", True


def clean_chunk(chunk, rejects):
    """Clean every column listed in ``columns`` of ``chunk`` in place.

    ``rejects`` maps a column name to a list of DataFrames. For each column
    that has an entry in ``REJECT_FILES``, the rows whose value could not be
    cleaned are appended to ``rejects[column]`` as one DataFrame per chunk.
    The rows are captured before the column is overwritten, so they still
    show the offending raw value.
    """
    for col in columns:
        cleaned = []
        bad_positions = []
        for pos, raw in enumerate(chunk[col]):
            text, rejected = clean_date_value(raw)
            cleaned.append(text)
            if rejected:
                bad_positions.append(pos)

        if bad_positions and col in REJECT_FILES:
            # One slice per column and chunk instead of one DataFrame.append
            # per row (quadratic, and removed in pandas 2.0).
            rejects[col].append(chunk.iloc[bad_positions])

        chunk[col] = cleaned
        print("Completed", col)


def main():
    """Read the source CSV in chunks, clean it, and write the result files."""
    os.chdir(WORK_DIR)

    reader = pd.read_csv(SOURCE_CSV, engine='python', chunksize=CHUNK_SIZE)
    rejects = {col: [] for col in REJECT_FILES}

    print(dt.datetime.now())
    for cx, chunk in enumerate(reader):
        clean_chunk(chunk, rejects)
        print('we have completed ', cx, 'chunk\n')
        # Append mode: only the first chunk carries the header row.
        chunk.to_csv(OUTPUT_CSV, mode='a', index=False, header=(cx == 0))
    print(dt.datetime.now())

    for col, frames in rejects.items():
        if frames:
            pd.concat(frames).to_csv(REJECT_FILES[col])


if __name__ == "__main__":
    main()

###############################################################################

Edit: this is the whole code.

My supervisor told me concurrency can be applied to the last part where the data is being loaded to the csv files.

分享到QQ

分享到微博