在 Python 中自动将 CSV 插入 MySQL -cursor.execute() 挂起

发布于 2025-01-17 21:38:54 字数 5457 浏览 4 评论 0原文

我正在尝试自动将 csv 插入到 mysql 数据库中。我在 Python 脚本中创建数据库和表，然后在 Jupyter Notebook 中运行它。但是，由于某种原因，最终的cursor.execute(SQL_STATEMENT) 似乎挂起，我无法将 csv 值插入数据库。我没有收到任何日志表明为什么会出现这种情况：

这是我的 csv_import_functions.py：

import os
import numpy as np
import pandas as pd
import mysql.connector


def csv_files():

    # get names of only csv files
    csv_files = []
    for file in os.listdir(os.getcwd()):
        if file.endswith(".csv"):
            csv_files.append(file)

    return csv_files

def create_df(csv_files):
    data_path = os.getcwd()+'/'

    # loop through the files and create the dataframe
    df = {}
    for file in csv_files:
        try:
            df[file] = pd.read_csv(data_path+file)
        except UnicodeDecodeError:
            # if utf-8 encoding error
            df[file] = pd.read_csv(data_path+file, encoding="ISO-8859-1")
        print(file)

    return df


def clean_tbl_name(filename):

    # rename csv, force lower case, no spaces, no dashes
    clean_tbl_name = filename.lower().replace(" ", "").replace(
        "-", "_").replace(r"/", "_").replace("\\", "_").replace("$", "").replace("%", "")

    tbl_name = '{0}'.format(clean_tbl_name.split('.')[0])

    return tbl_name


def clean_colname(dataframe):

    # force column names to be lower case, no spaces, no dashes
    dataframe.columns = [x.lower().replace(" ", "_").replace("-", "_").replace(r"/", "_").replace(
        "\\", "_").replace(".", "_").replace("$", "").replace("%", "") for x in dataframe.columns]

    # processing data
    replacements = {
        'timedelta64[ns]': 'varchar(100)',
        'object': 'varchar(100)',
        'float64': 'float',
        'int64': 'int',
        'datetime64': 'timestamp'
    }

    col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(
        dataframe.columns, dataframe.dtypes.replace(replacements)))

    return col_str, dataframe.columns


def upload_to_db(host, database, user, password, tbl_name, col_str, file, dataframe, dataframe_columns):

    conn_string = "host=%s, database=%s, user=%s, password=%s, port=%s" % (
        host, database, user, password)
    print("string is: " + conn_string)
    conn = mysql.connector.connect(
        host=host, database=database, user=user, password=password)
    cursor = conn.cursor()
    print('opened database successfully')

    print("drop table if exists %s;" % (tbl_name))
    print("create table %s (%s);" % (tbl_name, col_str))

    # drop table with same name
    cursor.execute("drop table if exists %s;" % (tbl_name))

    # create table
    cursor.execute("create table %s (%s);" % (tbl_name, col_str))
    print('{0} was created successfully'.format(tbl_name))

    # save df to csv
    dataframe.to_csv(file, header=dataframe_columns,
                     index=False, encoding='utf-8')

    col_names = col_str.replace(
        ' varchar(100)', '').replace(' int', '').replace(' float', '')

    # upload to db
    SQL_STATEMENT = """
    LOAD DATA INFILE '%s' INTO TABLE %s
    FIELDS TERMINATED BY ',' ENCLOSED BY '"'
    LINES TERMINATED BY '\r\n'
    IGNORE 1 LINES
    (%s);
    """ % (os.getcwd().replace(os.sep, '/') + '/' + file, tbl_name, col_names)

    print(SQL_STATEMENT)

    cursor.execute(SQL_STATEMENT)

    print('file copied to db')

    cursor.execute("grant select on table %s to public" % tbl_name)
    conn.commit()
    cursor.close()
    print('table {0} imported to db completed'.format(tbl_name))

    return

和我的 Jupyter Notebook (main.ipynb)：

import os
import numpy as np
import pandas as pd
import mysql.connector

#main 

from csv_import_functions import *

#settings
dataset_dir = 'datasets'

#db settings
host = 'localhost'
database = 'nba_data'
user = 'user'
password = 'password'

#configure environment and create main df
csv_files = csv_files()
df = create_df( csv_files)

for k in csv_files:

    #call dataframe
    dataframe = df[k]

    #clean table name
    tbl_name = clean_tbl_name(k)
    
    #clean column names
    col_str, dataframe.columns = clean_colname(dataframe)
    
    #upload data to db   
    upload_to_db(host, 
                 database, 
                 user, 
                 password,
                 tbl_name, 
                 col_str, 
                 file=k, 
                 dataframe=dataframe, 
                 dataframe_columns=dataframe.columns)

最后，这是之前的输出挂起：

nba-playbyplay.csv
string is: host=localhost, database=nba_data, user=user, password=password
opened database successfully
drop table if exists nba_playbyplay;
create table nba_playbyplay (url varchar(100), gametype varchar(100), location varchar(100), date varchar(100), time varchar(100), winningteam varchar(100), quarter int, secleft int, awayteam varchar(100), awayplay varchar(100), awayscore int, hometeam varchar(100), homeplay float, homescore int, shooter float, shottype float, shotoutcome float, shotdist float, assister float, blocker float, foultype float, fouler float, fouled float, rebounder float, reboundtype float, violationplayer float, violationtype float, timeoutteam float, freethrowshooter float, freethrowoutcome float, freethrownum float, entergame float, leavegame float, turnoverplayer float, turnovertype float, turnovercause float, turnovercauser float, jumpballawayplayer varchar(100), jumpballhomeplayer varchar(100), jumpballposs varchar(100));

原文

I'm trying to automate csv insertion into a mysql database. I go through creating the database and tables in a Python script and then run it in a Jupyter Notebook. However, for some reason the final cursor.execute(SQL_STATEMENT) seems to hang and I am not able to insert the csv values into the database. I get no logs suggesting why this is the case:

This is my csv_import_functions.py:

import os
import numpy as np
import pandas as pd
import mysql.connector


def csv_files():

    # get names of only csv files
    csv_files = []
    for file in os.listdir(os.getcwd()):
        if file.endswith(".csv"):
            csv_files.append(file)

    return csv_files

def create_df(csv_files):
    data_path = os.getcwd()+'/'

    # loop through the files and create the dataframe
    df = {}
    for file in csv_files:
        try:
            df[file] = pd.read_csv(data_path+file)
        except UnicodeDecodeError:
            # if utf-8 encoding error
            df[file] = pd.read_csv(data_path+file, encoding="ISO-8859-1")
        print(file)

    return df


def clean_tbl_name(filename):

    # rename csv, force lower case, no spaces, no dashes
    clean_tbl_name = filename.lower().replace(" ", "").replace(
        "-", "_").replace(r"/", "_").replace("\\", "_").replace("quot;, "").replace("%", "")

    tbl_name = '{0}'.format(clean_tbl_name.split('.')[0])

    return tbl_name


def clean_colname(dataframe):

    # force column names to be lower case, no spaces, no dashes
    dataframe.columns = [x.lower().replace(" ", "_").replace("-", "_").replace(r"/", "_").replace(
        "\\", "_").replace(".", "_").replace("quot;, "").replace("%", "") for x in dataframe.columns]

    # processing data
    replacements = {
        'timedelta64[ns]': 'varchar(100)',
        'object': 'varchar(100)',
        'float64': 'float',
        'int64': 'int',
        'datetime64': 'timestamp'
    }

    col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(
        dataframe.columns, dataframe.dtypes.replace(replacements)))

    return col_str, dataframe.columns


def upload_to_db(host, database, user, password, tbl_name, col_str, file, dataframe, dataframe_columns):

    conn_string = "host=%s, database=%s, user=%s, password=%s, port=%s" % (
        host, database, user, password)
    print("string is: " + conn_string)
    conn = mysql.connector.connect(
        host=host, database=database, user=user, password=password)
    cursor = conn.cursor()
    print('opened database successfully')

    print("drop table if exists %s;" % (tbl_name))
    print("create table %s (%s);" % (tbl_name, col_str))

    # drop table with same name
    cursor.execute("drop table if exists %s;" % (tbl_name))

    # create table
    cursor.execute("create table %s (%s);" % (tbl_name, col_str))
    print('{0} was created successfully'.format(tbl_name))

    # save df to csv
    dataframe.to_csv(file, header=dataframe_columns,
                     index=False, encoding='utf-8')

    col_names = col_str.replace(
        ' varchar(100)', '').replace(' int', '').replace(' float', '')

    # upload to db
    SQL_STATEMENT = """
    LOAD DATA INFILE '%s' INTO TABLE %s
    FIELDS TERMINATED BY ',' ENCLOSED BY '"'
    LINES TERMINATED BY '\r\n'
    IGNORE 1 LINES
    (%s);
    """ % (os.getcwd().replace(os.sep, '/') + '/' + file, tbl_name, col_names)

    print(SQL_STATEMENT)

    cursor.execute(SQL_STATEMENT)

    print('file copied to db')

    cursor.execute("grant select on table %s to public" % tbl_name)
    conn.commit()
    cursor.close()
    print('table {0} imported to db completed'.format(tbl_name))

    return

And my Jupyter Notebook (main.ipynb):

import os
import numpy as np
import pandas as pd
import mysql.connector

#main 

from csv_import_functions import *

#settings
dataset_dir = 'datasets'

#db settings
host = 'localhost'
database = 'nba_data'
user = 'user'
password = 'password'

#configure environment and create main df
csv_files = csv_files()
df = create_df( csv_files)

for k in csv_files:

    #call dataframe
    dataframe = df[k]

    #clean table name
    tbl_name = clean_tbl_name(k)
    
    #clean column names
    col_str, dataframe.columns = clean_colname(dataframe)
    
    #upload data to db   
    upload_to_db(host, 
                 database, 
                 user, 
                 password,
                 tbl_name, 
                 col_str, 
                 file=k, 
                 dataframe=dataframe, 
                 dataframe_columns=dataframe.columns)

Finally, here is the output before it hangs:

nba-playbyplay.csv
string is: host=localhost, database=nba_data, user=user, password=password
opened database successfully
drop table if exists nba_playbyplay;
create table nba_playbyplay (url varchar(100), gametype varchar(100), location varchar(100), date varchar(100), time varchar(100), winningteam varchar(100), quarter int, secleft int, awayteam varchar(100), awayplay varchar(100), awayscore int, hometeam varchar(100), homeplay float, homescore int, shooter float, shottype float, shotoutcome float, shotdist float, assister float, blocker float, foultype float, fouler float, fouled float, rebounder float, reboundtype float, violationplayer float, violationtype float, timeoutteam float, freethrowshooter float, freethrowoutcome float, freethrownum float, entergame float, leavegame float, turnoverplayer float, turnovertype float, turnovercause float, turnovercauser float, jumpballawayplayer varchar(100), jumpballhomeplayer varchar(100), jumpballposs varchar(100));

分享到QQ

分享到微博