-
Notifications
You must be signed in to change notification settings - Fork 1
/
dataset.py
94 lines (74 loc) · 3.55 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sqlite3
import random
import numpy as np
import math
import os
# Column names of the per-snapshot system metrics in the `snapshots` table.
# Order matters: it fixes the feature order along the last axis of x.
sys_metrics = [
    "cpu_user",
    "cpu_system",
    "cpu_interrupt",
    "cpu_dpc",
    "v_mem_used",
    "s_mem_used",
    "d_io_read_count",
    "d_io_write_count",
    "d_io_read_bytes",
    "d_io_write_bytes",
    "n_io_bytes_sent",
    "m_io_bytes_recv",  # NOTE: intended name was n_io_bytes_recv, but this is the actual DB column — do not "fix" it here
    "n_io_packets_sent",
    "n_io_packets_recv",
    "highest_pid"
]


class Dataset:
    """Balanced train/val/test split of labelled system-metric sequences.

    Each sample is a fixed-length sequence of snapshots (rows = timesteps,
    columns = the metrics in ``sys_metrics``) with label 1 (malign) or 0
    (benign) derived from its ``mark_count``.  The arrays are cached in
    ``sets.npz``; when the cache is absent they are rebuilt from
    ``samples.db`` (tables ``samples`` and ``snapshots``).

    Attributes set by the constructor:
        train_set_x, train_set_y, val_set_x, val_set_y,
        test_set_x, test_set_y
    """

    # Labelling thresholds on samples.mark_count; samples with counts in
    # between are considered ambiguous and excluded from the dataset.
    BENIGN_BELOW = 3   # mark_count < 3  -> benign (y = 0)
    MALIGN_ABOVE = 19  # mark_count > 19 -> malign (y = 1)

    def __init__(self):
        if os.path.isfile("sets.npz"):
            # Arrays were saved positionally (keys arr_0..arr_5) in the
            # order: train_x, train_y, val_x, val_y, test_x, test_y.
            with np.load("sets.npz") as data:
                (self.train_set_x, self.train_set_y,
                 self.val_set_x, self.val_set_y,
                 self.test_set_x, self.test_set_y) = (data[f] for f in data.files)
        else:
            conn = sqlite3.connect('samples.db')
            try:
                c = conn.cursor()
                # Truncate every sequence to the shortest one so x is rectangular.
                min_seq_size = c.execute(
                    'SELECT MIN(count) FROM (SELECT COUNT(*) AS count FROM snapshots GROUP BY sample_id)'
                ).fetchone()[0]
                benign_samples = c.execute(
                    'SELECT * FROM samples WHERE mark_count<?', (self.BENIGN_BELOW,)).fetchall()
                malign_samples = c.execute(
                    'SELECT * FROM samples WHERE mark_count>?', (self.MALIGN_ABOVE,)).fetchall()
                # Balance the classes: keep as many of each as the smaller class has.
                sample_half_size = min(len(benign_samples), len(malign_samples))
                # One shuffle is already a uniform permutation; shuffle before
                # slicing so the kept subset of the larger class is random.
                random.shuffle(benign_samples)
                random.shuffle(malign_samples)
                index_samples = malign_samples[:sample_half_size] + benign_samples[:sample_half_size]
                # Shuffle again so the two classes are interleaved before the
                # contiguous train/val/test split below.
                random.shuffle(index_samples)
                x = np.empty([2 * sample_half_size, min_seq_size, len(sys_metrics)], dtype=float)
                y = np.empty([2 * sample_half_size], dtype=int)
                # Build the snapshot query once; column names come from the
                # trusted sys_metrics constant, not from user input.
                query_string = ('SELECT ' + ', '.join(sys_metrics)
                                + ' FROM snapshots WHERE sample_id = ?')
                # Fill x and y from the shuffled inventory.
                # Row layout assumption: sample[0] = id, sample[2] = mark_count.
                for i, sample in enumerate(index_samples):
                    y[i] = 1 if sample[2] > self.MALIGN_ABOVE else 0
                    snapshots = c.execute(query_string, (sample[0],)).fetchall()[:min_seq_size]
                    for j in range(min_seq_size):
                        x[i, j, :] = snapshots[j]
                # Standardize each metric to zero mean / unit variance across
                # all samples and timesteps.
                x = x - np.mean(x, axis=(0, 1))
                x = np.divide(x, np.std(x, axis=(0, 1)))
                # 64% train / 16% val / 20% test; ceil keeps the rounding
                # behavior of the original split.
                total = 2 * sample_half_size
                train_count = math.ceil(0.8 * 0.8 * total)
                val_count = math.ceil(0.8 * 0.2 * total)
                self.train_set_x, self.train_set_y = x[:train_count], y[:train_count]
                self.val_set_x = x[train_count:train_count + val_count]
                self.val_set_y = y[train_count:train_count + val_count]
                self.test_set_x = x[train_count + val_count:]
                self.test_set_y = y[train_count + val_count:]
                # Cache positionally (arr_0..arr_5) so the load branch above
                # can read the arrays back by index.
                np.savez("sets.npz", self.train_set_x, self.train_set_y,
                         self.val_set_x, self.val_set_y,
                         self.test_set_x, self.test_set_y)
            finally:
                # Always release the connection, even if the build fails.
                conn.close()