This ETL tool is built for:
Dataset from Kaggle: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset
import pandas as pd
import MySQLdb
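Note: depending on your environment, the MySQLdb module is provided by the mysqlclient package (on Python 3), so you may need to run pip install mysqlclient first.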
Write the SQL INSERT statement and assign it to INSERT_DB. The general syntax is:
INSERT INTO table_name (column1, column2, column3, ...)
VALUES (value1, value2, value3, ...);
INSERT_DB = "INSERT INTO card_default(id, limit_bal, sex, education, marriage, age,\
pay_status1, pay_status2, pay_status3, pay_status4, pay_status5, pay_status6,\
bill_amt1, bill_amt2, bill_amt3, bill_amt4, bill_amt5, bill_amt6,\
pay_amt1, pay_amt2, pay_amt3, pay_amt4, pay_amt5, pay_amt6,\
pay_default) \
VALUES ({0}, {1}, '{2}', '{3}', '{4}', {5}\
, '{6}', '{7}', '{8}', '{9}', '{10}', '{11}'\
, {12}, {13}, {14}, {15}, {16}, {17} \
, {18}, {19}, {20}, {21}, {22}, {23}\
, {24})"
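Building the SQL by string formatting works for this known, clean dataset, but it is vulnerable to SQL injection and quoting mistakes. A safer sketch (the INSERT_DB_PARAM name is ours, not part of the original tool) uses MySQLdb's %s placeholders so the driver escapes each value itself:
# Hypothetical parameterized template: 25 %s placeholders, one per column
INSERT_DB_PARAM = ("INSERT INTO card_default (id, limit_bal, sex, education, marriage, age, "
                   "pay_status1, pay_status2, pay_status3, pay_status4, pay_status5, pay_status6, "
                   "bill_amt1, bill_amt2, bill_amt3, bill_amt4, bill_amt5, bill_amt6, "
                   "pay_amt1, pay_amt2, pay_amt3, pay_amt4, pay_amt5, pay_amt6, "
                   "pay_default) VALUES (" + ", ".join(["%s"] * 25) + ")")
# Usage: cur.execute(INSERT_DB_PARAM, tuple_of_25_values)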
Fill in the hostname, username, password, and database name:
db = MySQLdb.connect(host="database",
                     user="name",
                     passwd="password",
                     db="scyap")
cur = db.cursor()
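The INSERT statement assumes a card_default table already exists in the database. If it does not, a possible schema is sketched below; the column types and sizes are our assumptions based on the CSV, not part of the original tool:
# Assumed schema sketch; adjust the types and sizes to your needs
cur.execute("""
    CREATE TABLE IF NOT EXISTS card_default (
        id INT PRIMARY KEY,
        limit_bal DECIMAL(12, 2),
        sex VARCHAR(1),
        education VARCHAR(20),
        marriage VARCHAR(10),
        age INT,
        pay_status1 VARCHAR(20), pay_status2 VARCHAR(20), pay_status3 VARCHAR(20),
        pay_status4 VARCHAR(20), pay_status5 VARCHAR(20), pay_status6 VARCHAR(20),
        bill_amt1 DECIMAL(12, 2), bill_amt2 DECIMAL(12, 2), bill_amt3 DECIMAL(12, 2),
        bill_amt4 DECIMAL(12, 2), bill_amt5 DECIMAL(12, 2), bill_amt6 DECIMAL(12, 2),
        pay_amt1 DECIMAL(12, 2), pay_amt2 DECIMAL(12, 2), pay_amt3 DECIMAL(12, 2),
        pay_amt4 DECIMAL(12, 2), pay_amt5 DECIMAL(12, 2), pay_amt6 DECIMAL(12, 2),
        pay_default TINYINT
    )
""")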
card_default = pd.read_csv("UCI_Credit_Card.csv")
# Rename the target column, and PAY_0 to PAY_1 so the repayment-status
# columns are numbered consistently (the raw data has PAY_0, PAY_2..PAY_6)
card_default = card_default.rename(columns={'default.payment.next.month': 'DEFAULT', 'PAY_0': 'PAY_1'})
Change the numeric codes to strings, because strings are more convenient when analysing in SQL: when doing a GROUP BY, it makes more sense to show the name of each category instead of its numeric code.
card_default.loc[(card_default.EDUCATION == 1), 'EDUCATION'] = 'Graduate School'
card_default.loc[(card_default.EDUCATION == 2), 'EDUCATION'] = 'University'
card_default.loc[(card_default.EDUCATION == 3), 'EDUCATION'] = 'High School'
fill = (card_default.EDUCATION == 0) | (card_default.EDUCATION == 4) | (card_default.EDUCATION == 5) | (card_default.EDUCATION == 6)
card_default.loc[fill, 'EDUCATION'] = 'Other'
print('EDUCATION ' + str(card_default['EDUCATION'].unique()))
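As an aside, the same recoding can be written in one step with Series.map instead of the four .loc assignments above; a sketch:
# Equivalent one-step recoding (sketch): codes 0 and 4-6 all mean 'Other'
education_map = {1: 'Graduate School', 2: 'University', 3: 'High School',
                 0: 'Other', 4: 'Other', 5: 'Other', 6: 'Other'}
card_default['EDUCATION'] = card_default['EDUCATION'].map(education_map)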
card_default.loc[(card_default.SEX == 1), 'SEX'] = 'M'
card_default.loc[(card_default.SEX == 2), 'SEX'] = 'F'
print('SEX ' + str(card_default['SEX'].unique()))
card_default.loc[(card_default.MARRIAGE == 0), 'MARRIAGE'] = 'Other'
card_default.loc[(card_default.MARRIAGE == 1), 'MARRIAGE'] = 'Married'
card_default.loc[(card_default.MARRIAGE == 2), 'MARRIAGE'] = 'Single'
card_default.loc[(card_default.MARRIAGE == 3), 'MARRIAGE'] = 'Divorce'
print('MARRIAGE ' + str(card_default['MARRIAGE'].unique()))
# Map the PAY_1..PAY_6 repayment-status codes to descriptive labels
for i in range(1, 7):
    pay = 'PAY_' + str(i)
    card_default.loc[(card_default[pay] == -2), pay] = 'no consumption'
    card_default.loc[(card_default[pay] == -1), pay] = 'paid full'
    card_default.loc[(card_default[pay] == 0), pay] = 'paid min'
    card_default.loc[(card_default[pay] == 1), pay] = 'delay 1 mth'
    card_default.loc[(card_default[pay] == 2), pay] = 'delay 2 mths'
    card_default.loc[(card_default[pay] == 3), pay] = 'delay 3 mths'
    card_default.loc[(card_default[pay] == 4), pay] = 'delay 4 mths'
    card_default.loc[(card_default[pay] == 5), pay] = 'delay 5 mths'
    card_default.loc[(card_default[pay] == 6), pay] = 'delay 6 mths'
    card_default.loc[(card_default[pay] == 7), pay] = 'delay 7 mths'
    card_default.loc[(card_default[pay] == 8), pay] = 'delay 8 mths'
print('PAY_1 ' + str(sorted(card_default['PAY_1'].unique())))
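The loop above can likewise be collapsed into a single shared mapping applied to all six columns at once; a sketch using DataFrame.replace:
# Sketch: one mapping for PAY_1..PAY_6, applied in a single call
pay_map = {-2: 'no consumption', -1: 'paid full', 0: 'paid min',
           1: 'delay 1 mth', 2: 'delay 2 mths', 3: 'delay 3 mths',
           4: 'delay 4 mths', 5: 'delay 5 mths', 6: 'delay 6 mths',
           7: 'delay 7 mths', 8: 'delay 8 mths'}
pay_cols = ['PAY_' + str(i) for i in range(1, 7)]
card_default[pay_cols] = card_default[pay_cols].replace(pay_map)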
After transforming the column names and values, we show the top 5 rows as a sample to check how the data looks.
card_default.head()
Use a for loop to read the data row by row and load it into MySQL:
for index, row in card_default.iterrows():
    sql = INSERT_DB.format(
        row["ID"],
        float(row["LIMIT_BAL"]),
        row["SEX"],
        row["EDUCATION"],
        row["MARRIAGE"],
        int(row["AGE"]),
        row["PAY_1"],
        row["PAY_2"],
        row["PAY_3"],
        row["PAY_4"],
        row["PAY_5"],
        row["PAY_6"],
        float(row["BILL_AMT1"]),
        float(row["BILL_AMT2"]),
        float(row["BILL_AMT3"]),
        float(row["BILL_AMT4"]),
        float(row["BILL_AMT5"]),
        float(row["BILL_AMT6"]),
        float(row["PAY_AMT1"]),
        float(row["PAY_AMT2"]),
        float(row["PAY_AMT3"]),
        float(row["PAY_AMT4"]),
        float(row["PAY_AMT5"]),
        float(row["PAY_AMT6"]),
        int(row["DEFAULT"])
    )
    # Optional: print the loading status while running
    # if index % 1000 == 0:
    #     print(str(index) + ' row done')
    cur.execute(sql)
# Commit once after all rows are inserted
db.commit()
cur.close()
db.close()
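Executing one INSERT per row with iterrows is simple but slow for the roughly 30,000 rows in this dataset. A faster bulk-load sketch, assuming the parameterized INSERT_DB_PARAM template from earlier and that the DataFrame columns are in the same order as the INSERT column list:
# Bulk-load alternative (sketch): one executemany call instead of one execute per row
params = [tuple(v.item() if hasattr(v, 'item') else v for v in row)  # numpy scalars -> native Python types
          for row in card_default.itertuples(index=False, name=None)]
cur.executemany(INSERT_DB_PARAM, params)
db.commit()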
The dataset has been extracted, transformed, and loaded into MySQL. We may now proceed to MySQL to check our data.
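For example, the GROUP BY use case mentioned earlier can be checked directly in the MySQL client (a sample query, assuming the column names used above):
SELECT education, COUNT(*) AS clients, SUM(pay_default) AS defaults
FROM card_default
GROUP BY education;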