
Commit 020a4bb

cleanup

Default user committed Feb 6, 2024
1 parent ef953a7 commit 020a4bb
Showing 1 changed file: src/train.py (35 additions, 57 deletions)
@@ -46,6 +46,26 @@ def generate_dt_pairs(dt_df):
return pairs, labels


def balance_data(pairs, classes, n_proportion):
"""Don't take all the generated "don't interact" pairs, just the same amount of known pairs"""
classes = np.array(classes)
pairs = np.array(pairs)

indices_true = np.where(classes == 1)[0]
indices_false = np.where(classes == 0)[0]

np.random.shuffle(indices_false)
indices = indices_false[: (n_proportion * indices_true.shape[0])]

print(f"True positives: {len(indices_true)}")
print(f"True negatives: {len(indices_false)}")
pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
classes = np.concatenate((classes[indices_true], classes[indices]), axis=0)

return pairs, classes
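
# Hypothetical usage sketch of balance_data (illustrative pairs and labels;
# with n_proportion=1 the result keeps a 1:1 positive-to-negative ratio):
def _example_balance_data():
    example_pairs = [("drug1", "targetA"), ("drug1", "targetB"), ("drug2", "targetA")]
    example_labels = [1, 0, 0]
    # Returns the 1 known pair plus 1 randomly sampled negative pair.
    return balance_data(example_pairs, example_labels, n_proportion=1)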



def multimetric_score(estimator, X_test, y_test, scorers):
"""Return a dict of score for multimetric scoring"""
scores = {}
@@ -68,24 +88,6 @@ def multimetric_score(estimator, X_test, y_test, scorers):
return scores
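
# Hypothetical usage sketch: build scorers the way get_scores() does below,
# then score a fitted classifier on held-out data (toy data; clf is illustrative):
def _example_multimetric_score():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    scoring = ["precision", "recall", "accuracy", "roc_auc", "f1", "average_precision"]
    scorers = metrics._scorer._check_multimetric_scoring(clf, scoring=scoring)
    return multimetric_score(clf, X_test, y_test, scorers)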


def balance_data(pairs, classes, n_proportion):
classes = np.array(classes)
pairs = np.array(pairs)

indices_true = np.where(classes == 1)[0]
indices_false = np.where(classes == 0)[0]

np.random.shuffle(indices_false)
indices = indices_false[: (n_proportion * indices_true.shape[0])]

print(f"True positives: {len(indices_true)}")
print(f"True negatives: {len(indices_false)}")
pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
classes = np.concatenate((classes[indices_true], classes[indices]), axis=0)

return pairs, classes


def get_scores(clf, X_new, y_new):
scoring = ["precision", "recall", "accuracy", "roc_auc", "f1", "average_precision"]
scorers = metrics._scorer._check_multimetric_scoring(clf, scoring=scoring)
@@ -223,21 +225,6 @@ def train(
n_jobs=-1,
class_weight="balanced",
)
# xgb_model = XGBClassifier(
# n_estimators=200,
# max_depth=config.max_depth,
# learning_rate=0.1,
# subsample=0.8,
# colsample_bytree=0.8,
# gamma=0,
# reg_alpha=0,
# reg_lambda=1,
# objective='binary:logistic', # For binary classification
# n_jobs=-1,
# random_state=42,
# tree_method='hist', # Use GPU optimized histogram algorithm
# # device='gpu',
# )

# clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]
clfs = [("Random Forest", rf_model)] # "XGBoost", xgb_model
@@ -269,11 +256,6 @@ def train(
################### Train with a grid of hyperparameters to find the best combination


# def get_params_combinations(params):
# keys, values = zip(*params.items())
# combinations = [dict(zip(keys, v)) for v in product(*values)]
# return combinations
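
# For reference, a runnable version of the grid-expansion helper deleted above
# (equivalent to sklearn.model_selection.ParameterGrid; itertools is stdlib):
from itertools import product

def get_params_combinations(params):
    keys, values = zip(*params.items())
    return [dict(zip(keys, v)) for v in product(*values)]
# e.g. {"max_depth": [3, 6], "n_estimators": [100, 200]} -> 4 param dicts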

def train_gpu(
df_known_interactions: pd.DataFrame,
df_drugs_embeddings: pd.DataFrame,
@@ -346,21 +328,19 @@ def train_gpu(
best_accuracy = 0
os.makedirs("models", exist_ok=True)

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
# Train model for each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
start_fold_time = time.time()

# # Send data to GPU for XGBoost
send_time = time.time()
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)
# # print(f"Sending data to GPU took {time.time() - send_time}s")
# Send data to GPU for XGBoost
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)

# # Train XGBoost model
# model = xgb.train(params, dtrain, num_boost_round=100)
# predictions = model.predict(dtest)
# # Train XGBoost model
# model = xgb.train(params, dtrain, num_boost_round=100)
# predictions = model.predict(dtest)

# Train Random Forest model
model = RandomForestClassifier(**params)
@@ -395,14 +375,12 @@
with open(save_model, "wb") as f:
pickle.dump(model, f)

# os.makedirs("models", exist_ok=True)
# with open(save_model_path, "wb") as f:
# pickle.dump(model, f)

# Force garbage collection for xgb on GPU:
# del dtrain, dtest, model
gc.collect() # Force garbage collection for xgb on GPU
# gc.collect()

print(fold_results)
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - start_fold_time}s")

log.info(f"Combination took {time.time() - combination_time}s")

@@ -483,10 +461,10 @@ def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_thre
os.makedirs(out_dir, exist_ok=True)

# Longer version:
# subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
# object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91, 0.90]
subject_sim_thresholds = [1]
object_sim_thresholds = [1]
subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20, 0.15, 0.10]
object_sim_thresholds = [1, 0.97, 0.94, 0.91, 0.88, 0.85, 0.82, 0.79]
# subject_sim_thresholds = [1]
# object_sim_thresholds = [1]
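
# Hypothetical driver: sweep every (subject, object) threshold combination
# through exclude_similar (keyword names taken from its signature above):
def _example_threshold_sweep(input_dir, subject_thresholds, object_thresholds):
    for subject_t in subject_thresholds:
        for object_t in object_thresholds:
            exclude_similar(input_dir, subject_sim_threshold=subject_t, object_sim_threshold=object_t)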
# params = { #XGB
# 'max_depth': 3,
# 'n_estimators': 100,
