-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataProcessing.py
More file actions
118 lines (84 loc) · 4.45 KB
/
dataProcessing.py
File metadata and controls
118 lines (84 loc) · 4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
'''
Return DataLoaders to iterate over the Pytorch data
'''
import numpy as np
import h5py
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
# There are some global variables that these functions use
trainFrac = .6
valFrac = .1
testFrac = .3
nClasses = 3
class emShowersDatasetFlat(Dataset):
"""EM showers dataset"""
def __init__(self, N, relPath='../data/', trainFrac=trainFrac, transform=None):
"""
Instantiates a class which then returns examples as a tuple for the
image labels, and the truth labels are:
0 (gamma), 1 (pi-plus), 2 (positron)
Args:
N: The number of images we have for each particle class
relPath: The relative path to where the hdf5 files live
Caveat: I just manually subtracted the mean images from the training
set and did the tensor transforms, but actually, this would be done
a lot more elegantly with Pytorch's transforms
"""
d_gamma = h5py.File(relPath+'gamma.hdf5', 'r')
d_piplus = h5py.File(relPath+'piplus.hdf5', 'r')
d_eplus = h5py.File(relPath+'eplus.hdf5', 'r')
# Subtract the mean image from the training set in each of the layers
l0_gamma_mean = np.mean(d_gamma['layer_0'][:int(trainFrac*N)],axis=0)
l1_gamma_mean = np.mean(d_gamma['layer_1'][:int(trainFrac*N)],axis=0)
l2_gamma_mean = np.mean(d_gamma['layer_2'][:int(trainFrac*N)],axis=0)
l0_piplus_mean = np.mean(d_piplus['layer_0'][:int(trainFrac*N)],axis=0)
l1_piplus_mean = np.mean(d_piplus['layer_1'][:int(trainFrac*N)],axis=0)
l2_piplus_mean = np.mean(d_piplus['layer_2'][:int(trainFrac*N)],axis=0)
l0_eplus_mean = np.mean(d_eplus['layer_0'][:int(trainFrac*N)],axis=0)
l1_eplus_mean = np.mean(d_eplus['layer_1'][:int(trainFrac*N)],axis=0)
l2_eplus_mean = np.mean(d_eplus['layer_2'][:int(trainFrac*N)],axis=0)
layer0_mean = (l0_gamma_mean + l0_piplus_mean + l0_eplus_mean) / 3.
layer1_mean = (l1_gamma_mean + l1_piplus_mean + l1_eplus_mean) / 3.
layer2_mean = (l2_gamma_mean + l2_piplus_mean + l2_eplus_mean) / 3.
layer0 = np.vstack((d_gamma['layer_0'][:N], d_piplus['layer_0'][:N], d_eplus['layer_0'][:N]))
layer1 = np.vstack((d_gamma['layer_1'][:N], d_piplus['layer_1'][:N], d_eplus['layer_1'][:N]))
layer2 = np.vstack((d_gamma['layer_2'][:N], d_piplus['layer_2'][:N], d_eplus['layer_2'][:N]))
# Reshape the tensors as NxCxHxW, with C=1 in this case :)
m0,h0,w0 = layer0.shape
layer0 = layer0.reshape(m0,1,h0,w0)
m1,h1,w1 = layer1.shape
layer1 = layer1.reshape(m1,1,h1,w1)
m2,h2,w2 = layer2.shape
layer2 = layer2.reshape(m2,1,h2,w2)
# Test to make sure that all of the datasets are the same length
self.layer0 = torch.from_numpy(layer0 - layer0_mean).type(torch.FloatTensor)
self.layer1 = torch.from_numpy(layer1 - layer1_mean).type(torch.FloatTensor)
self.layer2 = torch.from_numpy(layer2 - layer2_mean).type(torch.FloatTensor)
# Get the y labels
self.y = torch.from_numpy(np.concatenate((np.zeros(N), np.ones(N), 2*np.ones(N))))
def __len__(self):
return self.layer0.shape[0]
def __getitem__(self, idx):
return self.layer0[idx], self.layer1[idx], self.layer2[idx], self.y[idx]
def getDataLoaders(batch_size=64, N=100000):
'''
Input:
batch_size
N: Number of events / particle, 100k uses all the available data
Returns: loader_train, loader_val, loader_test
DataLoaders for the train, val, and test sets
'''
nClasses = 3
dset = emShowersDatasetFlat(N=N)
idxTrain = []
idxVal = []
idxTest = []
for i in range(nClasses):
idxTrain += [j for j in range(i*N, int((i+trainFrac)*N))]
idxVal += [j for j in range(int((i+trainFrac)*N), int((i+trainFrac+valFrac)*N))]
idxTest += [j for j in range(int((i+trainFrac+valFrac)*N), (i+1)*N)]
loader_train = DataLoader(dset, batch_size=batch_size, sampler=SubsetRandomSampler(idxTrain))
loader_val = DataLoader(dset, batch_size=batch_size, sampler=SubsetRandomSampler(idxVal))
loader_test = DataLoader(dset, batch_size=batch_size, sampler=SubsetRandomSampler(idxTest))
return loader_train, loader_val, loader_test