-
Notifications
You must be signed in to change notification settings - Fork 17
/
whaleDataCreatorNumpyToTorchTensors.py
executable file
·175 lines (128 loc) · 6.68 KB
/
whaleDataCreatorNumpyToTorchTensors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from helperFunctions import *
# Make the stdlib / third-party dependencies explicit rather than relying on
# them leaking through the star import above (argparse/np/torch are used below).
import argparse
## Actual usage: python whaleDataCreatorNumpyToTorchTensors.py -numpyDataDir /Users/tarinziyaee/data/whaleData/processedData/

# Helper grouping classes: bare namespaces holding related module-level state.
class directory:
    """Namespace for directory paths."""
    pass
class filename:
    """Namespace for file names (declared but not populated in this script)."""
    pass
class I:
    """Namespace for index arrays (e.g. the shuffle permutation)."""
    pass
class N:
    """Namespace for sample counts and split percentages."""
    pass

# Input parser: -numpyDataDir is the directory holding pData.npy / pLabels.npy;
# the two percentages control the validation/test split sizes.
parser = argparse.ArgumentParser(description='Settings')
parser.add_argument('-numpyDataDir', dest='numpyDataDir', required=True, type=str)
parser.add_argument('-valPercentage', dest='valPercentage', default=0.2, type=float)
parser.add_argument('-testPercentage', dest='testPercentage', default=0.1, type=float)
args = parser.parse_args()

# Save parsed arguments into the namespace classes.
directory.loadNumpyDataFrom = args.numpyDataDir
N.valPercentage = args.valPercentage
N.testPercentage = args.testPercentage
def splitData(testND, valND, trainND, pDataPos, pDataNeg):
    """Split binary-class data into training/validation/test sets and normalize.

    The test and validation sets are class-balanced: each holds exactly
    `testND` (resp. `valND`) positives followed by the same number of
    negatives. The training set keeps whatever remains of each class after
    the first `testND + valND` rows are consumed, so it is not guaranteed to
    be balanced. Every split is de-meaned and std-normalized using statistics
    computed from the training data only.

    Args:
        testND: Non-dominant-class sample count for the test set.
        valND: Non-dominant-class sample count for the validation set.
        trainND: Non-dominant-class sample count for the training set
            (informational; the training split is derived from the other two).
        pDataPos: All positive samples, shape (nPos, H, W, C).
        pDataNeg: All negative samples, shape (nNeg, H, W, C).

    Returns:
        pTrainingDataPos: Normalized positive training samples.
        pTrainingDataNeg: Normalized negative training samples.
        (pValData, pValLabels): Normalized validation data and int64 labels.
        (pTestData, pTestLabels): Normalized test data and int64 labels.
        pTrainingMean: Per-pixel mean image computed from the training set.
        pTrainingStd: Per-pixel std image computed from the training set.
    """
    sampleShape = pDataPos.shape[1:4]

    def balancedSplit(count, start):
        # Build a class-balanced block: `count` positives then `count`
        # negatives, both taken from rows [start, start + count).
        data = np.zeros((2 * count,) + sampleShape).astype(np.float32)
        labels = -99 * np.ones(2 * count).astype(np.int64)
        data[:count] = np.copy(pDataPos[start:start + count])
        labels[:count] = 1
        data[count:] = np.copy(pDataNeg[start:start + count])
        labels[count:] = 0
        return data, labels

    # Test rows come first, then validation rows, so the two never overlap.
    pTestData, pTestLabels = balancedSplit(testND, 0)
    pValData, pValLabels = balancedSplit(valND, testND)

    # Everything past the test+val rows goes to training, per class.
    cutoff = testND + valND
    pTrainingDataPos = np.copy(pDataPos[cutoff:])
    pTrainingDataNeg = np.copy(pDataNeg[cutoff:])

    # Normalization statistics are computed from the TRAINING data only.
    combined = np.concatenate((pTrainingDataPos, pTrainingDataNeg), 0)
    pTrainingMean = np.mean(combined, 0)
    pTrainingStd = np.std(combined - pTrainingMean, 0)

    # De-mean and variance-normalize every split with the training statistics;
    # the small epsilon guards against division by zero.
    eps = 1e-6
    for split in (pTrainingDataPos, pTrainingDataNeg, pValData, pTestData):
        split -= pTrainingMean
        split /= (pTrainingStd + eps)

    return pTrainingDataPos, pTrainingDataNeg, (pValData, pValLabels), (pTestData, pTestLabels), pTrainingMean, pTrainingStd
def minimumSamples(percentage, nNonDominant):
    """Scale a desired split percentage by the non-dominant class size.

    For an imbalanced binary data set we still want a given fraction of the
    data for each split, but the fraction is applied to the smaller class so
    that balanced validation/test sets can be drawn without exhausting it.

    Args:
        percentage: Desired fraction of the data for this split.
        nNonDominant: Number of samples in the non-dominant (smaller) class.

    Returns:
        samples: The rounded sample count as a numpy int64 scalar.
    """
    desired = percentage * nNonDominant
    # np.rint rounds half-to-even, exactly like np.round without decimals.
    samples = np.rint(desired).astype(np.int64)
    return samples
# Derive the training percentage from the requested val/test percentages.
N.trainPercentage = 1 - (N.testPercentage + N.valPercentage)
# Fix the shuffling seed so the split is reproducible across runs.
np.random.seed(1)
# Load the preprocessed numpy data (data was already demeaned upstream).
pData = np.load(directory.loadNumpyDataFrom + 'pData.npy')
pLabels = np.load(directory.loadNumpyDataFrom + 'pLabels.npy')
# Shuffle data and labels with one shared permutation.
I.randomIndices = np.random.permutation(pData.shape[0])
pData = pData[I.randomIndices]
pLabels = pLabels[I.randomIndices]
# Separate the deck by class.
pDataPos = np.copy(pData[pLabels == 1])
pDataNeg = np.copy(pData[pLabels == 0])
# The non-dominant class is whichever has fewer samples.
N.nonDominant = min(pDataPos.shape[0], pDataNeg.shape[0])
# Translate the split percentages into per-class sample counts.
N.testND = minimumSamples(N.testPercentage, N.nonDominant)
N.valND = minimumSamples(N.valPercentage, N.nonDominant)
N.trainND = N.nonDominant - (N.valND + N.testND)
# Build the normalized training/validation/test splits.
pTrainingDataPos, pTrainingDataNeg, valTuple, testTuple, _, _ = splitData(N.testND, N.valND, N.trainND, pDataPos, pDataNeg)
# Convert to torch tensors (torch.Tensor yields float32; labels go back to int64).
tTrainingDataPos = torch.Tensor(pTrainingDataPos)
tTrainingDataNeg = torch.Tensor(pTrainingDataNeg)
tValData = torch.Tensor(valTuple[0])
tValLabels = torch.Tensor(valTuple[1]).long()
tTestData = torch.Tensor(testTuple[0])
tTestLabels = torch.Tensor(testTuple[1]).long()
## Persist each tensor next to the source numpy data.
for stem, tensor in (('tTrainingDataPos', tTrainingDataPos),
                     ('tTrainingDataNeg', tTrainingDataNeg),
                     ('tValData', tValData),
                     ('tValLabels', tValLabels),
                     ('tTestData', tTestData),
                     ('tTestLabels', tTestLabels)):
    torch.save(tensor, directory.loadNumpyDataFrom + stem)
print('FIN')