Your current code looks quite good. I only changed a few minor things in the Python code to get it running.
First, I created the data manually so that we can reproduce the behavior.
# Hand-built sample data so the scenario can be reproduced without the original file.
cols = ["make", "model", "year", "mileage", "lease", "mpg", "target"]
rows = [
    ["toyota", "corolla", 2015, 17510, 710, 20, "Y"],
    ["honda", "accord", 2012, 73640, 723, 23, "Y"],
    ["toyota", "corolla", 2020, 28525, 610, 24, "N"],
    ["volkswagen", "jetta", 2017, 18007, 599, 21, "Y"],
    ["honda", "accord", 2017, 18850, 690, 23, "N"],
    ["volkswagen", "jetta", 2012, 5065, 292, 21, "N"],
    ["toyota", "highlander", 2019, 18004, 729, 18, "Y"],
    ["volkswagen", "jetta", 2016, 8361, 692, 21, "Y"],
    ["toyota", "highlander", 2021, 28643, 729, 18, "Y"],
]
# One DataFrame row per inner list, labelled with the names in cols.
df = pd.DataFrame(rows, columns=cols)
As you might notice, I removed the Kia here. This is because we need a minimum of two data points per group for a PCA with n_components=2 (scikit-learn requires n_components to be at most min(n_samples, n_features)).
As your approach for the splitting did not work for me either, I implemented a fairly simple solution instead.
# 1
# Index holding every distinct (make, model) combination present in df.
groups = df.groupby(["make", "model"]).size().index
Then we can do #2 similarly to what you already do.
# 2
# Write one CSV per (make, model) combination, named "<make>_<model>.csv".
for make, model in groups:
    group_rows = df[(df["make"] == make) & (df["model"] == model)]
    # index=False keeps the row index out of the file. Otherwise read_csv
    # returns it as an extra leading column and every positional selection
    # on the re-read frame (e.g. iloc[:, 3:6]) is shifted by one column.
    group_rows.to_csv(f"{make}_{model}.csv", index=False)
#3 – #5 were basically fine; just keep in mind that the end index of a slice is excluded, not included, so you will have to write:
# The stop index 6 is exclusive, so this selects the columns at positions 3, 4 and 5
# (mileage, lease and mpg when df has the column order given in cols).
df_temp = df.iloc[:,3:6]
I have also defined a colormap, so steps 3-5 look like this:
# Steps 3-5: scale the numeric features of each per-group CSV, project them onto
# two principal components and draw one scatter plot per group on a 2x2 grid.
colormap = ['red', 'green', 'blue', 'yellow', 'green']
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(9, 6))
# zip stops at the shorter input, so at most four files fit on this single page.
# Sorting makes the file-to-subplot assignment reproducible.
for ax, file in zip(axs.flatten(), sorted(glob.glob("./*.csv"))):
    # A separate name keeps the module-level df from being overwritten.
    group_df = pd.read_csv(file)
    # Select the features by name: positions differ depending on whether the
    # CSV was written with or without the row index.
    features = group_df[["mileage", "lease", "mpg"]]
    make = group_df["make"].iloc[0]
    model = group_df["model"].iloc[0]
    scaled_data = StandardScaler().fit_transform(features)
    # n_components=2 needs at least two rows per file.
    x_pca = PCA(n_components=2).fit_transform(scaled_data)
    # Cycle through the palette so groups with more rows than palette
    # entries do not make scatter fail on a colour/point count mismatch.
    point_colors = [colormap[i % len(colormap)] for i in range(len(x_pca))]
    ax.scatter(x_pca[:, 0], x_pca[:, 1], c=point_colors)  # color "red' and 'green' preferred
    ax.set_title(f"make:{make}, model:{model}")
    ax.set_xlabel('First principal component')
    ax.set_ylabel('Second Principal Component')
# No legend call: the points carry no labels, so a legend would be empty and
# matplotlib would only emit a warning.
fig.tight_layout()
fig.savefig("car_scatterplot.pdf", dpi=300)
Edit:
To implement a multi-page PDF, I extended the data as follows:
# Hand-built sample data (now including two BMW rows) so the scenario is reproducible.
cols = ["make", "model", "year", "mileage", "lease", "mpg", "target"]
rows = [
    ["toyota", "corolla", 2015, 17510, 710, 20, "Y"],
    ["honda", "accord", 2012, 73640, 723, 23, "Y"],
    ["toyota", "corolla", 2020, 28525, 610, 24, "N"],
    ["volkswagen", "jetta", 2017, 18007, 599, 21, "Y"],
    ["honda", "accord", 2017, 18850, 690, 23, "N"],
    ["volkswagen", "jetta", 2012, 5065, 292, 21, "N"],
    ["toyota", "highlander", 2019, 18004, 729, 18, "Y"],
    ["volkswagen", "jetta", 2016, 8361, 692, 21, "Y"],
    ["toyota", "highlander", 2021, 28643, 729, 18, "Y"],
    ["bmw", "M5", 2005, 84392, 649, 25, "Y"],
    ["bmw", "M5", 2012, 17499, 899, 20, "N"],
]
df = pd.DataFrame(rows, columns=cols)
# Plot colour per row: green where target is "Y", red for everything else.
df["color"] = ["green" if flag == "Y" else "red" for flag in df["target"]]
You will have to add the following imports.
from matplotlib.backends.backend_pdf import PdfPages
import os
Then, for steps 3, 4 and 5, we can do the following, which now also generates the legends.
# 3 4 5
# Draw one PCA scatter plot per per-group CSV file, four plots (a 2x2 grid) per
# PDF page, and collect all pages in car_scatterplot.pdf.
target_colors = {"Y": "green", "N": "red"}  # color "red' and 'green' preferred
# Sorted so the page order does not depend on the directory listing order.
files = sorted(x for x in os.listdir("./") if os.path.splitext(x)[1] == ".csv")
with PdfPages('car_scatterplot.pdf') as pdf:
    for i in range(0, len(files), 4):
        fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(9, 6))
        axes = axs.flatten()
        # Slicing clamps at the end of the list, so the last page may hold
        # fewer than four files.
        page_files = files[i:i + 4]
        for ax, file in zip(axes, page_files):
            # A separate name keeps the module-level df from being overwritten.
            group_df = pd.read_csv(file)
            # Select the features by name: positions differ depending on
            # whether the CSV was written with or without the row index.
            features = group_df[["mileage", "lease", "mpg"]]
            make = group_df["make"].iloc[0]
            model = group_df["model"].iloc[0]
            scaled_data = StandardScaler().fit_transform(features)
            # n_components=2 needs at least two rows per file.
            x_pca = PCA(n_components=2).fit_transform(scaled_data)
            group_df["x_pca_1"] = x_pca[:, 0]
            group_df["x_pca_2"] = x_pca[:, 1]
            # One scatter call per target value so each gets its own legend
            # entry; the colour is derived from the target, so the CSV does
            # not need to contain a "color" column.
            for target, color in target_colors.items():
                sub_df = group_df[group_df["target"] == target]
                if not sub_df.empty:
                    ax.scatter(sub_df["x_pca_1"], sub_df["x_pca_2"], color=color, label=target)
            ax.set_title(f"make:{make}, model:{model}")
            ax.set_xlabel('First principal component')
            ax.set_ylabel('Second Principal Component')
            ax.legend(loc='best')
        # Hide grid cells that received no file instead of drawing empty axes.
        for unused_ax in axes[len(page_files):]:
            unused_ax.set_visible(False)
        fig.tight_layout()
        pdf.savefig(fig, dpi=300)
        # Release the figure; otherwise every page stays alive in pyplot.
        plt.close(fig)
Click here to find more solutions to related problems.