Skip to content Skip to sidebar Skip to footer

Group Pandas Dataframe & Validate With Condition

Dataframe: id Base field1 field2 field3 1 Y AA BB CC 1 N AA BB CC 1 N AA BB CC 2 Y D

Solution 1:

Use custom function:

def f(x):
    """Validate one ``id`` group against its base row and record the result.

    Meant to be passed to ``groupby(...).apply``. The row whose ``Base`` value
    is ``'Y'`` is the reference; every row is compared with it on all columns
    except ``Base`` and ``product``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group. ``x.name`` (supplied by ``groupby.apply``)
        is read only when more than one base row is found, to build the
        message.

    Returns
    -------
    pandas.DataFrame
        ``x`` with an ``Error`` column added, holding per row:

        * ``'more than 1 base Y found for id:<name>'`` on every row when the
          group has several base rows;
        * ``'Base: Y'`` on the base row;
        * ``'<fields> mismatch with base'`` listing the differing columns;
        * ``'Pass'`` when the row equals the base row on all compared columns.
    """
    # Boolean mask marking the base rows.
    mask = x['Base'] == 'Y'

    # Summing the mask counts its True values, i.e. the number of base rows.
    if mask.sum() > 1:
        x['Error'] = 'more than 1 base Y found for id:{}'.format(x.name)
        return x

    # Columns taking part in the comparison (identifier-like ones removed).
    cols = x.columns.difference(['Base', 'product'])
    # True where a cell differs from the base row. This relies on pandas
    # aligning the one-row base frame to the group's repeated index label.
    # NOTE(review): a group with no base row is not treated specially here;
    # confirm what result is wanted for that case.
    mask1 = x[cols].ne(x.loc[mask, cols])

    # Decide the status row by row, so that a row matching the base is
    # reported as 'Pass' even when other rows of the same group mismatch.
    names = mask1.columns
    status = [
        ', '.join(names[row]) + ' mismatch with base' if row.any() else 'Pass'
        for row in mask1.to_numpy()
    ]
    x['Error'] = np.where(mask, 'Base: Y', status)

    return x

# Run the validator once per id (index level 0) and show the annotated frame.
df = df.groupby(level=0).apply(func=f)
print(df)
   product Base field1 field2 field3                              Error
id
1        A    Y     AA     BB     CC                            Base: Y
1        B    N     AA     BB     CC                               Pass
1        C    N     AA     BB     CC                               Pass
2        D    Y     DD     EE     FF                            Base: Y
2        E    N     OO     EE     WT  field1, field3 mismatch with base
2        F    N     DD     JQ     FF          field2 mismatch with base
3        G    Y     MM     NN     TT  more than 1 base Y found for id:3
3        H    Y     MM     NN     TT  more than 1 base Y found for id:3
3        I    N     MM     NN     TT  more than 1 base Y found for id:3

Sample DataFrame:

# Sample data: three id groups; group 3 deliberately has two base ('Y') rows.
# The id values are supplied directly as the named index.
df = pd.DataFrame(
    {
        'product': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
        'Base': ['Y', 'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'N'],
        'field1': ['AA', 'AA', 'AA', 'DD', 'OO', 'DD', 'MM', 'MM', 'MM'],
        'field2': ['BB', 'BB', 'BB', 'EE', 'EE', 'JQ', 'NN', 'NN', 'NN'],
        'field3': ['CC', 'CC', 'CC', 'FF', 'WT', 'FF', 'TT', 'TT', 'TT'],
    },
    index=pd.Index([1, 1, 1, 2, 2, 2, 3, 3, 3], name='id'),
)
print(df)
   product Base field1 field2 field3
id                                  
1        A    Y     AA     BB     CC
1        B    N     AA     BB     CC
1        C    N     AA     BB     CC
2        D    Y     DD     EE     FF
2        E    N     OO     EE     WT
2        F    N     DD     JQ     FF
3        G    Y     MM     NN     TT
3        H    Y     MM     NN     TT
3        I    N     MM     NN     TT

Post a Comment for "Group Pandas Dataframe & Validate With Condition"