def downsample(features, target, fraction):
    """Downsample the majority class (label 0) of a binary dataset.

    Keeps every class-1 observation and a random ``fraction`` of the
    class-0 observations, then shuffles the result so the classes are
    interleaved.

    Parameters
    ----------
    features : pd.DataFrame
        Feature matrix, index-aligned with ``target``.
    target : pd.Series
        Binary labels: 0 is the majority class to be thinned, 1 is kept whole.
    fraction : float
        Share of class-0 rows to keep, in (0, 1].

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Downsampled, shuffled features and target.
    """
    zero_mask = target == 0
    one_mask = target == 1
    # Draw the majority-class sample ONCE and reuse its index for both
    # features and target.  The original code sampled the two objects with
    # separate .sample() calls and stayed aligned only because they shared
    # a random_state; selecting by one sampled index makes the alignment
    # guaranteed rather than coincidental.
    sampled_zero_index = features[zero_mask].sample(
        frac=fraction, random_state=12345).index
    features_downsampled = pd.concat(
        [features.loc[sampled_zero_index], features[one_mask]])
    target_downsampled = pd.concat(
        [target.loc[sampled_zero_index], target[one_mask]])
    # shuffle() keeps features and target in lockstep while permuting rows.
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    return features_downsampled, target_downsampled
# Balance the training set by thinning the majority class, then fit a
# logistic regression and evaluate it on the untouched validation set.
features_downsampled, target_downsampled = downsample(
    features_train, target_train, 0.39)
model = LogisticRegression()
model.fit(features_downsampled, target_downsampled)
predicted_valid = model.predict(features_valid)
# sklearn's metric convention is (y_true, y_pred).  accuracy_score is
# symmetric in its arguments, but keep the order consistent with the
# f1_score call below to avoid copy-paste bugs with asymmetric metrics.
print("Accuracy:", accuracy_score(target_valid, predicted_valid))
print("F1:", f1_score(target_valid, predicted_valid))
Считать accuracy при дисбалансе классов не очень показательно: модель может получить высокую accuracy, просто всегда предсказывая мажоритарный класс.
Посчитай также precision, recall, MCC и т. п. — результат будет явно не таким радужным.