
Commit 020a4bb

cleanup

Default user committed Feb 6, 2024
1 parent ef953a7 commit 020a4bb
Showing 1 changed file: src/train.py (35 additions, 57 deletions)
@@ -46,6 +46,26 @@ def generate_dt_pairs(dt_df):
return pairs, labels


def balance_data(pairs, classes, n_proportion):
"""Don't take all the generated "don't interact" pairs, just the same amount of known pairs"""
classes = np.array(classes)
pairs = np.array(pairs)

indices_true = np.where(classes == 1)[0]
indices_false = np.where(classes == 0)[0]

np.random.shuffle(indices_false)
indices = indices_false[: (n_proportion * indices_true.shape[0])]

print(f"True positives: {len(indices_true)}")
print(f"True negatives: {len(indices_false)}")
pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
classes = np.concatenate((classes[indices_true], classes[indices]), axis=0)

return pairs, classes
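
# Hypothetical usage sketch of balance_data (illustrative pairs and labels;
# with n_proportion=1 the result keeps a 1:1 positive-to-negative ratio):
def _example_balance_data():
    example_pairs = [("drug1", "targetA"), ("drug1", "targetB"), ("drug2", "targetA")]
    example_labels = [1, 0, 0]
    # Returns the 1 known pair plus 1 randomly sampled negative pair.
    return balance_data(example_pairs, example_labels, n_proportion=1)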



def multimetric_score(estimator, X_test, y_test, scorers):
"""Return a dict of score for multimetric scoring"""
scores = {}
@@ -68,24 +88,6 @@ def multimetric_score(estimator, X_test, y_test, scorers):
return scores
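
# Hypothetical usage sketch: build scorers the way get_scores() does below,
# then score a fitted classifier on held-out data (toy data; clf is illustrative):
def _example_multimetric_score():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=200, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
    clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    scoring = ["precision", "recall", "accuracy", "roc_auc", "f1", "average_precision"]
    scorers = metrics._scorer._check_multimetric_scoring(clf, scoring=scoring)
    return multimetric_score(clf, X_test, y_test, scorers)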


def balance_data(pairs, classes, n_proportion):
classes = np.array(classes)
pairs = np.array(pairs)

indices_true = np.where(classes == 1)[0]
indices_false = np.where(classes == 0)[0]

np.random.shuffle(indices_false)
indices = indices_false[: (n_proportion * indices_true.shape[0])]

print(f"True positives: {len(indices_true)}")
print(f"True negatives: {len(indices_false)}")
pairs = np.concatenate((pairs[indices_true], pairs[indices]), axis=0)
classes = np.concatenate((classes[indices_true], classes[indices]), axis=0)

return pairs, classes


def get_scores(clf, X_new, y_new):
scoring = ["precision", "recall", "accuracy", "roc_auc", "f1", "average_precision"]
scorers = metrics._scorer._check_multimetric_scoring(clf, scoring=scoring)
@@ -223,21 +225,6 @@ def train(
n_jobs=-1,
class_weight="balanced",
)
# xgb_model = XGBClassifier(
# n_estimators=200,
# max_depth=config.max_depth,
# learning_rate=0.1,
# subsample=0.8,
# colsample_bytree=0.8,
# gamma=0,
# reg_alpha=0,
# reg_lambda=1,
# objective='binary:logistic', # For binary classification
# n_jobs=-1,
# random_state=42,
# tree_method='hist', # Use GPU optimized histogram algorithm
# # device='gpu',
# )

# clfs = [('Naive Bayes',nb_model),('Logistic Regression',lr_model),('Random Forest',rf_model)]
clfs = [("Random Forest", rf_model)] # "XGBoost", xgb_model
@@ -269,11 +256,6 @@ def train(
################### Train with a grid of hyperparameters to find the best combination


# def get_params_combinations(params):
# keys, values = zip(*params.items())
# combinations = [dict(zip(keys, v)) for v in product(*values)]
# return combinations
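
# For reference, a runnable version of the grid-expansion helper deleted above
# (equivalent to sklearn.model_selection.ParameterGrid; itertools is stdlib):
from itertools import product

def get_params_combinations(params):
    keys, values = zip(*params.items())
    return [dict(zip(keys, v)) for v in product(*values)]
# e.g. {"max_depth": [3, 6], "n_estimators": [100, 200]} -> 4 param dicts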

def train_gpu(
df_known_interactions: pd.DataFrame,
df_drugs_embeddings: pd.DataFrame,
@@ -346,21 +328,19 @@ def train_gpu(
best_accuracy = 0
os.makedirs("models", exist_ok=True)

# for fold, (train_index, test_index) in enumerate(kf.split(X)):
# Train model for each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
start_fold_time = time.time()

# # Send data to GPU for XGBoost
send_time = time.time()
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)
# # print(f"Sending data to GPU took {time.time() - send_time}s")
# Send data to GPU for XGBoost
# dtrain = xgb.DMatrix(x_train, label=y_train)
# dtest = xgb.DMatrix(x_test, label=y_test)

# # Train XGBoost model
# model = xgb.train(params, dtrain, num_boost_round=100)
# predictions = model.predict(dtest)
# # Train XGBoost model
# model = xgb.train(params, dtrain, num_boost_round=100)
# predictions = model.predict(dtest)

# Train Random Forest model
model = RandomForestClassifier(**params)
@@ -395,14 +375,12 @@
with open(save_model, "wb") as f:
pickle.dump(model, f)

# os.makedirs("models", exist_ok=True)
# with open(save_model_path, "wb") as f:
# pickle.dump(model, f)

# Force garbage collection for xgb on GPU:
# del dtrain, dtest, model
gc.collect() # Force garbage collection for xgb on GPU
# gc.collect()

print(fold_results)
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - send_time}s")
log.info(f"Completed fold {fold + 1}/{n_splits} in {time.time() - start_fold_time}s")

log.info(f"Combination took {time.time() - combination_time}s")

@@ -483,10 +461,10 @@ def exclude_similar(input_dir, subject_sim_threshold: float = 1, object_sim_thre
os.makedirs(out_dir, exist_ok=True)

# Longer version:
# subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70]
# object_sim_thresholds = [1, 0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91, 0.90]
subject_sim_thresholds = [1]
object_sim_thresholds = [1]
subject_sim_thresholds = [1, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50, 0.45, 0.40, 0.35, 0.30, 0.25, 0.20, 0.15, 0.10]
object_sim_thresholds = [1, 0.97, 0.94, 0.91, 0.88, 0.85, 0.82, 0.79]
# subject_sim_thresholds = [1]
# object_sim_thresholds = [1]
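
# Hypothetical driver: sweep every (subject, object) threshold combination
# through exclude_similar (keyword names taken from its signature above):
def _example_threshold_sweep(input_dir, subject_thresholds, object_thresholds):
    for subject_t in subject_thresholds:
        for object_t in object_thresholds:
            exclude_similar(input_dir, subject_sim_threshold=subject_t, object_sim_threshold=object_t)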
# params = { #XGB
# 'max_depth': 3,
# 'n_estimators': 100,
