-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: main.py
More file actions
64 lines (47 loc) · 2.14 KB
/
main.py
File metadata and controls
64 lines (47 loc) · 2.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import random
import shutil
import subprocess
from create_tree_sitter import build_tree_sitter
from function_splitter import snippler
from path_cst_extractor import create_dataset
from path_ast_extractor import create_ast_dataset
from path_parso_extractor import create_parso_dataset
from Line_Of_Dataset import create_dict,split_train_test_val
if __name__ == '__main__':
    # Dataset-creation pipeline driver.
    #
    # For each raw dataset folder: build the tree-sitter grammar once,
    # extract a CST-path dataset, split it into train/test/val sets, and
    # write the vocabulary dictionaries.

    # Toggle oversampling during the train/test/val split.
    preprocessing = True

    # Historical dataset statistics (kept for reference):
    #   dataredPreproBLACKED: 354, examples > 200: 1976
    #   dataredPreproMINI:    354, examples > 200: 1674
    #   dataredPrepro:        354, examples > 200: 1976
    #   dataredPreproMINI    n_examples: 1888
    #   dataredPreproBLACKED n_examples: 1593
    #   dataredPrepro        n_examples: 1593

    # Compile the tree-sitter language library before any parsing runs.
    build_tree_sitter()

    folders = ['gcjpyred', 'gcjpyredMINI', 'gcjpyredBLACKED']

    # NOTE(review): the random sub-sampling step below was "commented out"
    # in the original source as a bare triple-quoted string, which Python
    # still evaluates (and discards) at runtime. Converted to real comments
    # here; kept, still disabled, for reproducibility of that experiment.
    # It copied 20 randomly selected author subdirectories from the first
    # folder into datasets/random_subdirs/<folder>.
    #
    # first_folder_path = os.path.join('datasets', 'raw_dataset', 'gcjpyred')
    # all_subdirs = [d for d in os.listdir(first_folder_path)
    #                if os.path.isdir(os.path.join(first_folder_path, d))]
    # random.seed(123)  # fixed seed for reproducibility
    # selected_subdirs = random.sample(all_subdirs, 20)
    # for folder in folders:
    #     new_folder = os.path.join('datasets', 'random_subdirs', folder)
    #     os.makedirs(new_folder, exist_ok=True)
    #     # Copy the selected subdirectories into the new folder.
    #     for subdir in selected_subdirs:
    #         src_path = os.path.join('datasets', 'raw_dataset', folder, subdir)
    #         dest_path = os.path.join(new_folder, subdir)
    #         if not os.path.exists(dest_path):
    #             shutil.copytree(src_path, dest_path)

    # Creation pipeline: datasets/raw_dataset/<folder>
    #   -> datasets/processed_dataset/<folder>paper
    for folder in folders:
        print(f"Processing {folder}...")
        origin = os.path.join('datasets', 'raw_dataset', folder)
        destination = os.path.join('datasets', 'processed_dataset', folder + "paper")
        create_dataset(origin, destination)               # extract CST-path dataset
        split_train_test_val(preprocessing, destination)  # train/test/val split (± oversampling)
        create_dict(destination)                          # build vocabulary dictionaries
        print(f"Completed processing {folder}\n")