Compute an average grouped by two variables and, if that is not possible, fall back to the column average in Python pandas

This is not the most efficient way to solve it, but because I was pressed for time I ended up doing something like this, which does exactly what I wanted:

# Per-company mean of each feature that needs imputing.
# Iterating dat[features_to_impute] yields its column labels (assumes
# features_to_impute is a list of column names, so the selection is a
# DataFrame -- TODO confirm against where it is defined).
# Every per-feature frame has to be collected here: pd.concat raises
# ValueError ("No objects to concatenate") when it is handed an empty list.
dict_list_1 = [
    env.groupby('company')[v].mean().to_frame()
    for v in dat[features_to_impute]
]

# Put the per-feature means side by side (one column per feature), then move
# the 'company' group key out of the index and back into a regular column.
comp_means = pd.concat(dict_list_1, axis=1, ignore_index=False)
comp_means.reset_index(inplace=True)
def unique_id(df, col1, col2):
    """Build a composite key by joining two columns of *df* with "_".

    Both columns are converted to strings first, so non-string values can be
    combined. Returns one "<col1>_<col2>" string per row of *df*.
    """
    left_part = df[col1].astype(str)
    right_part = df[col2].astype(str)
    return left_part + "_" + right_part

# Composite "<company>_<category>" key on both frames so that a single lookup
# can match rows on company and product category at the same time.
dat['company_ptype'] = unique_id(dat, 'company_name', 'pl_category')
env['company_ptype'] = unique_id(env, 'company', 'category')

# Mean of each feature per (company, category) pair, taken from env.
# The per-feature frames must be collected before concatenating: pd.concat
# raises ValueError ("No objects to concatenate") on an empty list.
# Iterating dat[features_to_impute] yields column labels (assumes
# features_to_impute is a list of column names -- TODO confirm).
dict_list_2 = [
    env.groupby(['company_ptype'])[x].mean().to_frame()
    for x in dat[features_to_impute]
]
# Result is indexed by 'company_ptype', with one column per feature.
comp_ptype_means = pd.concat(dict_list_2, axis=1, ignore_index=False)

# Mean of each feature per product category, taken from env.
dict_list_3 = [
    env.groupby(['category'])[i].mean().to_frame()
    for i in dat[features_to_impute]
]
# Result is indexed by 'category', with one column per feature.
prod_type_means = pd.concat(dict_list_3, axis=1, ignore_index=False)

for x in dat[features_to_impute]:
    # The group keys are already the index of the *_means frames (groupby puts
    # them there), so each column can be used as a lookup table for Series.map
    # directly. Calling set_index() on the key would raise KeyError because the
    # key is not a column of those frames.

    # 1st step: fill missing values with the (company, category) mean.
    dat[x] = np.where(
        dat[x].isnull(),
        dat['company_ptype'].map(comp_ptype_means[x]),
        dat[x],
    )
    # 2nd step: still missing -> fall back to the product-category mean.
    dat[x] = np.where(
        dat[x].isnull(),
        dat['pl_category'].map(prod_type_means[x]),
        dat[x],
    )
    # 3rd step: anything left gets the column mean of dat itself (computed
    # after the first two steps, so it includes the values filled above).
    dat[x] = dat[x].fillna(dat[x].mean())

@Tito, if you have any suggestions on how to make this more efficient I am happy to hear them and use them.


Click here to find more solutions to related problems.

Leave a Comment

Your email address will not be published.

Scroll to Top