-
Notifications
You must be signed in to change notification settings - Fork 1
/
covid_pred_lgb.py
59 lines (48 loc) · 2.09 KB
/
covid_pred_lgb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
"""LightGBM Model
"""
import numpy as np
import sys
import os
import sklearn as sk
import lightgbm as lgb
import pandas as pd
# ---------------------------------------------------------------------------
# Command-line arguments (positional).  Arguments 1-10 are required; a missing
# one raises IndexError.  Arguments 11 and 12 are optional.
# ---------------------------------------------------------------------------
DATA_FILE = sys.argv[1]  # CSV file with features, groups, and labels
FEATURE_NAMES = sys.argv[2]  # text file of column header names (read as strings, one per row, e.g. written by np.savetxt) to use as features
CLASS_LABELS_COL = sys.argv[3]  # column header name (string) to use for class labels (i.e. y)
TRAIN_INDEX_COL = sys.argv[4]  # column header name (string) that has a 1 if that row of features should be used for training
OUTPUT_FILE = sys.argv[5]  # name for model output file (script appends ".json"); "na" to skip
PRED_FILE = sys.argv[6]  # raw (not rounded) predictions (in rows, no extension will be added); "na" to skip
NUM_BOOST_ROUNDS = int(sys.argv[7])  # no. boost rounds, typical value = 500
MAX_DEPTH = int(sys.argv[8])  # default -1 for no limit, typical value can be 15
MIN_LEAF_DATA = int(sys.argv[9])  # default 20, typical value is 50
LEARN_RATE = float(sys.argv[10])  # learning rate (typical 0.01)

# Optional arguments: fall back to a default when the argument is absent
# (IndexError) or is not a valid integer (ValueError).  Only these two
# exceptions are caught so that e.g. KeyboardInterrupt is not swallowed.
try:
    RAND_STATE = int(sys.argv[11])
except (IndexError, ValueError):
    RAND_STATE = 324
try:
    NTHREADS = int(sys.argv[12])
except (IndexError, ValueError):
    NTHREADS = 0

# If either PRED_FILE or OUTPUT_FILE is set to "na" (case-insensitive),
# the corresponding output is not written.

data = pd.read_csv(DATA_FILE)

# ndmin=1 keeps the result one-dimensional even when the file lists a single
# feature name (np.loadtxt would otherwise return a 0-d array).
feature_names = np.loadtxt(FEATURE_NAMES, dtype='str', ndmin=1)
x = data[feature_names].values

# mask values that are unavailable -- indicated as negative
x = np.where(x < 0, np.nan, x)
y = data[CLASS_LABELS_COL].values

# Rows flagged with 1 in TRAIN_INDEX_COL form the training set.
trainindex_col = data[TRAIN_INDEX_COL].values
trainindex = np.where(trainindex_col == 1)[0]
xtrain = x[trainindex]
ytrain = y[trainindex]
datatrain = lgb.Dataset(xtrain, ytrain)

# 'num_threads' is the LightGBM parameter name for the thread count; the
# previously used key 'n_threads' is not a recognised alias, so the requested
# thread count was being ignored.
params = {
    'objective': 'binary',
    'learning_rate': LEARN_RATE,
    'seed': RAND_STATE,
    'max_depth': MAX_DEPTH,
    'min_data_in_leaf': MIN_LEAF_DATA,
    'verbose': 1,
    'num_threads': NTHREADS,
}
bst = lgb.train(params=params, train_set=datatrain, num_boost_round=NUM_BOOST_ROUNDS)

if OUTPUT_FILE.lower() != "na":
    # NOTE(review): Booster.save_model writes LightGBM's text model format
    # rather than JSON; the ".json" suffix is kept so existing consumers of
    # the output filename keep working -- confirm before renaming.
    bst.save_model(OUTPUT_FILE+".json")

if PRED_FILE.lower() != "na":
    # Predictions are made for every row of the input (training rows included).
    pred = bst.predict(data=x)
    np.savetxt(PRED_FILE, pred)