Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 26 additions & 20 deletions src/main/python/systemds/scuro/dataloader/timeseries_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,7 @@ def __init__(
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)

if self.file_format == "npy":
data = self._load_npy(file)
elif self.file_format in ["txt", "csv"]:
with open(file, "r") as f:
first_line = f.readline()
if any(name in first_line for name in self.signal_names):
data = self._load_csv_with_header(file)
else:
data = self._load_txt(file)
data = self._load_data(file)

if data.ndim > 1 and len(self.signal_names) == 1:
data = data.flatten()
Expand All @@ -96,6 +88,18 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
)
self.data.append(data[i])

def _load_data(self, file: str) -> np.ndarray:
if self.file_format == "npy":
data = self._load_npy(file)
elif self.file_format in ["txt", "csv"]:
with open(file, "r") as f:
first_line = f.readline()
if any(name in first_line for name in self.signal_names):
data = self._load_csv_with_header(file)
else:
data = self._load_txt(file)
return data

def _normalize_signals(self, data: np.ndarray) -> np.ndarray:
if data.ndim == 1:
mean = np.mean(data)
Expand Down Expand Up @@ -145,14 +149,16 @@ def _load_csv_with_header(self, file: str, delimiter: str = None) -> np.ndarray:
return data

def get_stats(self, source_path: str):
pass # TODO: Implement this
# self.file_sanity_check(source_path)
# max_length = 0
# num_instances = 0
# num_signals = 0
# for file in os.listdir(source_path):
# data = self._load_npy(source_path + file)
# max_length = max(max_length, data.shape[0])
# num_instances += 1
# num_signals = max(num_signals, data.shape[1])
# return TimeseriesStats(max_length, num_instances, num_signals)
self.file_sanity_check(source_path)
max_length = 0
num_instances = 0
num_signals = 0
for file_name in self.indices:
file = source_path + file_name + "." + self.file_format
data = self._load_data(file)
max_length = max(max_length, data.shape[0])
num_instances += 1
num_signals = max(num_signals, data.shape[1])
return TimeseriesStats(
max_length, num_instances, num_signals, (max_length,), True
)
4 changes: 2 additions & 2 deletions src/main/python/systemds/scuro/representations/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,8 @@ def execute(self, modality, aggregate_dim=(0,)):
def transform(self, modality):
return self.execute(modality)

def compute_feature(self, instance):
return self._aggregation_func(instance)
def compute_feature(self, instance, axis=0):
return self._aggregation_func(instance, axis)

def get_aggregation_functions(self):
return list(self._aggregation_function.keys())
8 changes: 4 additions & 4 deletions src/main/python/systemds/scuro/representations/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@

@register_fusion_operator()
class Average(Fusion):
def __init__(self):
def __init__(self, params=None):
"""
Combines modalities using averaging
"""
Expand All @@ -41,10 +41,10 @@ def __init__(self):
self.associative = True
self.commutative = True

def execute(self, modalities: List[Modality]):
data = copy.deepcopy(modalities[0].data)
def execute(self, modalities: List[Modality], labels=None):
data = np.asarray(copy.deepcopy(modalities[0].data), dtype=float)
for i in range(1, len(modalities)):
data += modalities[i].data
data += np.asarray(modalities[i].data, dtype=float)

data /= len(modalities)

Expand Down
38 changes: 21 additions & 17 deletions src/main/python/systemds/scuro/representations/concatenation.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,23 +79,27 @@ def get_output_stats(self, input_stats_list) -> RepresentationStats:
return RepresentationStats(0, (0,))

num_instances = stats_list[0].num_instances
rank = len(stats_list[0].output_shape)

if rank == 1:
total_dim = sum(s.output_shape[0] for s in stats_list)
output_shape = (total_dim,)
elif rank == 2:
time_dim = stats_list[0].output_shape[0]
total_dim = sum(s.output_shape[1] for s in stats_list)
output_shape = (time_dim, total_dim)
else:
output_shape = stats_list[0].output_shape
total_dim = sum(s.output_shape[-1] for s in stats_list)
output_shape = (total_dim,)

return RepresentationStats(num_instances, output_shape)

def estimate_peak_memory_bytes(self, input_stats) -> dict:
# TODO
return {
"cpu_peak_bytes": 0,
"gpu_peak_bytes": 0,
}
def estimate_peak_memory_bytes(self, input_stats_list) -> dict:
elem_size = np.dtype(np.float32).itemsize

def stats_bytes(s: RepresentationStats) -> int:
numel = int(np.prod(s.output_shape)) if len(s.output_shape) > 0 else 1
return int(s.num_instances * numel * elem_size)

current_output = 0
peak = 0
for s in input_stats_list:
chunk = stats_bytes(s)
new_output = current_output + chunk

step_peak = current_output + chunk + new_output + chunk
peak = max(peak, step_peak)
current_output = new_output

cpu_peak = int(peak * 1.15 + 16 * 1024 * 1024)
return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
30 changes: 17 additions & 13 deletions src/main/python/systemds/scuro/representations/hadamard.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,22 @@ def get_output_stats(self, input_stats_list) -> RepresentationStats:
if not stats_list:
return RepresentationStats(0, (0,))

def num_elements(stats: RepresentationStats) -> int:
n = 1
for d in stats.output_shape:
n *= d
return n
max_dim = max([stats.output_shape[-1] for stats in stats_list])
return RepresentationStats(stats_list[0].num_instances, (max_dim,))

largest = max(stats_list, key=num_elements)
return RepresentationStats(largest.num_instances, largest.output_shape)
def estimate_peak_memory_bytes(self, input_stats_list) -> dict:
elem_size = np.dtype(np.float64).itemsize

def estimate_peak_memory_bytes(self, input_stats) -> dict:
# TODO
return {
"cpu_peak_bytes": 0,
"gpu_peak_bytes": 0,
}
def stats_payload_bytes(s: RepresentationStats) -> int:
numel = int(np.prod(s.output_shape)) if len(s.output_shape) > 0 else 1
return int(s.num_instances * numel * elem_size)

stacked_input_bytes = sum(stats_payload_bytes(s) for s in input_stats_list)
out_stats = self.get_output_stats(input_stats_list)
output_bytes = stats_payload_bytes(out_stats)
reduction_workspace_bytes = output_bytes
cpu_peak = int(
(stacked_input_bytes + output_bytes + reduction_workspace_bytes) * 1.15
+ 8 * 1024 * 1024
)
return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
2 changes: 1 addition & 1 deletion src/main/python/systemds/scuro/representations/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False
)
for (batch_X,) in inference_dataloader:
batch_X = batch_X.to(device)
batch_X = batch_X.to(self.device)
features, _ = self.model(batch_X)
all_features.append(features.cpu())

Expand Down
2 changes: 1 addition & 1 deletion src/main/python/systemds/scuro/representations/max.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

@register_fusion_operator()
class RowMax(Fusion):
def __init__(self):
def __init__(self, params=None):
"""
Combines modalities by computing the outer product of a modality combination and
taking the row max
Expand Down
21 changes: 17 additions & 4 deletions src/main/python/systemds/scuro/representations/mlp_averaging.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ def __init__(self, output_dim=512, batch_size=32, params=None):
"batch_size": [8, 16, 32, 64, 128],
}
super().__init__("MLPAveraging", parameters)
self.output_dim = output_dim
self.batch_size = batch_size
if params is not None:
self.output_dim = params.get("output_dim", output_dim)
self.batch_size = params.get("batch_size", batch_size)
else:
self.output_dim = output_dim
self.batch_size = batch_size
self.device = None
self.data_type = np.float32
self.gpu_id = None
Expand All @@ -70,7 +74,10 @@ def gpu_id(self, gpu_id):
self.device = get_device(gpu_id)

def get_output_stats(self, input_stats: RepresentationStats) -> RepresentationStats:
if len(input_stats.output_shape) > 1:
if (
len(input_stats.output_shape) > 1
and np.prod(input_stats.output_shape) > self.output_dim
):
return RepresentationStats(
input_stats.num_instances,
(self.output_dim,),
Expand All @@ -87,7 +94,13 @@ def get_output_stats(self, input_stats: RepresentationStats) -> RepresentationSt
)
return RepresentationStats(
input_stats.num_instances,
(self.output_dim,),
(
(
np.prod(input_stats.output_shape)
if np.prod(input_stats.output_shape) < self.output_dim
else self.output_dim
),
),
output_shape_is_known=input_stats.output_shape_is_known,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(
batch_size=32,
num_epochs=20,
learning_rate=0.001,
params=None,
):
parameters = {
"hidden_dim": [32, 128, 256, 384, 512, 768],
Expand Down
12 changes: 8 additions & 4 deletions src/main/python/systemds/scuro/representations/spectrogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,14 @@ def transform(self, modality, aggregation=None):
return transformed_modality

def compute_feature(self, instance):
data = np.array(instance)
spectrogram = librosa.stft(
y=np.array(np.abs(instance)), hop_length=self.hop_length, n_fft=self.n_fft
y=np.abs(data), hop_length=self.hop_length, n_fft=self.n_fft
)
return librosa.amplitude_to_db(np.abs(spectrogram)).T
if data.ndim == 1:
return librosa.amplitude_to_db(np.abs(spectrogram)).T

return librosa.amplitude_to_db(np.abs(spectrogram)).transpose(0, 2, 1)

def estimate_peak_memory_bytes(self, input_stats) -> dict:
# TODO: validate this function
Expand All @@ -74,7 +78,7 @@ def estimate_peak_memory_bytes(self, input_stats) -> dict:
if signal_length < self.n_fft:
num_frames = 1
else:
num_frames = 1 + (signal_length - self.n_fft) // self.hop_length
num_frames = 1 + signal_length // self.hop_length
num_frames = max(int(num_frames), 1)
num_freq_bins = 1 + self.n_fft // 2

Expand Down Expand Up @@ -121,7 +125,7 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
if signal_length < self.n_fft:
num_frames = 1
else:
num_frames = 1 + (signal_length - self.n_fft) // self.hop_length
num_frames = 1 + signal_length // self.hop_length
num_frames = max(int(num_frames), 1)

num_freq_bins = 1 + self.n_fft // 2
Expand Down
32 changes: 19 additions & 13 deletions src/main/python/systemds/scuro/representations/sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,18 +61,24 @@ def get_output_stats(self, input_stats_list) -> RepresentationStats:
if not stats_list:
return RepresentationStats(0, (0,))

def num_elements(stats: RepresentationStats) -> int:
n = 1
for d in stats.output_shape:
n *= d
return n
max_dim = max([stats.output_shape[-1] for stats in stats_list])
return RepresentationStats(stats_list[0].num_instances, (max_dim,))

largest = max(stats_list, key=num_elements)
return RepresentationStats(largest.num_instances, largest.output_shape)
def estimate_peak_memory_bytes(self, input_stats_list) -> dict:
elem_size = np.dtype(np.float64).itemsize

def estimate_peak_memory_bytes(self, input_stats) -> dict:
# TODO
return {
"cpu_peak_bytes": 0,
"gpu_peak_bytes": 0,
}
def stats_payload_bytes(s: RepresentationStats) -> int:
numel = int(np.prod(s.output_shape)) if len(s.output_shape) > 0 else 1
return int(s.num_instances * numel * elem_size)

first_bytes = stats_payload_bytes(input_stats_list[0])
max_other_bytes = 0
if len(input_stats_list) > 1:
max_other_bytes = max(stats_payload_bytes(s) for s in input_stats_list[1:])

ufunc_workspace_bytes = int(0.1 * max(first_bytes, max_other_bytes))
cpu_peak = int(
(first_bytes + max_other_bytes + ufunc_workspace_bytes) * 1.15
+ 8 * 1024 * 1024
)
return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
Loading
Loading