-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_json.py
84 lines (68 loc) · 2.76 KB
/
fix_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import json
import shutil
# Dataset directory holding the JSONL manifests that this script cleans.
output_dataset_path = "./dataset/gary"
train_jsonl_path = os.path.join(output_dataset_path, "train.jsonl")
test_jsonl_path = os.path.join(output_dataset_path, "test.jsonl")
# Folders that each receive a copy of a cleaned manifest, named data.jsonl.
egs_train_folder = "./egs/train"
egs_eval_folder = "./egs/eval"
# Function to load JSONL entries
def load_jsonl_entries(jsonl_path):
    """Load every record from a JSON Lines file.

    Args:
        jsonl_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded value per non-blank line, in file order.
        An empty list is returned when ``jsonl_path`` does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(jsonl_path):
        return []
    # Read as UTF-8 explicitly instead of relying on the locale's default
    # encoding, which differs between platforms.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a stray trailing newline) are not records and
        # would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]
# Function to save JSONL entries
def save_jsonl_entries(jsonl_path, entries):
    """Write ``entries`` to ``jsonl_path`` as JSON Lines.

    Any existing file at ``jsonl_path`` is overwritten.

    Args:
        jsonl_path: Destination file path.
        entries: Iterable of JSON-serialisable values; each is written as
            one line, in iteration order.
    """
    with open(jsonl_path, "w") as out:
        out.writelines(json.dumps(record) + '\n' for record in entries)
# Function to filter entries
def filter_entries(entries):
    """Keep only the entries whose ``"path"`` exists on disk.

    Args:
        entries: Iterable of mappings, each with a ``"path"`` key.

    Returns:
        A new list of the surviving entries in their original order. One
        message is printed for every entry that is dropped.
    """
    kept = []
    for record in entries:
        if not os.path.exists(record["path"]):
            print(f"File {record['path']} does not exist. Removing entry.")
            continue
        kept.append(record)
    return kept
# Function to remove duplicate entries based on file path
def remove_duplicates(entries):
    """Drop entries whose ``"path"`` repeats that of an earlier entry.

    Args:
        entries: Iterable of mappings, each with a hashable ``"path"`` value.

    Returns:
        A new list holding the first entry seen for each path, in original
        order. One message is printed for every duplicate that is dropped.
    """
    known_paths = set()
    deduped = []
    for record in entries:
        path = record["path"]
        if path in known_paths:
            print(f"Duplicate entry found and removed: {path}")
            continue
        known_paths.add(path)
        deduped.append(record)
    return deduped
# Function to move and rename files, replacing if the file already exists
def move_and_rename_jsonl_file(src_path, dest_folder, new_filename):
    """Copy ``src_path`` into ``dest_folder`` under the name ``new_filename``.

    Despite the function name, the source is copied with ``shutil.copy``
    rather than moved, so ``src_path`` is left in place. An existing file at
    the destination is removed first and then replaced.

    Args:
        src_path: File to copy.
        dest_folder: Folder that receives the copy; created if missing.
        new_filename: File name to give the copy inside ``dest_folder``.

    Raises:
        FileNotFoundError: If ``src_path`` does not exist.
    """
    # shutil.copy does not create missing parent directories, so make sure
    # the destination folder exists (an empty folder means the current dir).
    if dest_folder:
        os.makedirs(dest_folder, exist_ok=True)
    dest_path = os.path.join(dest_folder, new_filename)
    # Remove the destination file if it exists
    if os.path.exists(dest_path):
        os.remove(dest_path)
        print(f"Removed existing {dest_path}")
    # Copy the source file to the destination under its new name
    shutil.copy(src_path, dest_path)
    print(f"Moved and renamed {src_path} to {dest_path}")
# Read both manifests; a manifest that does not exist yields an empty list.
train_entries = load_jsonl_entries(train_jsonl_path)
test_entries = load_jsonl_entries(test_jsonl_path)
# Drop entries whose "path" does not exist on disk.
filtered_train_entries = filter_entries(train_entries)
filtered_test_entries = filter_entries(test_entries)
# Drop entries whose "path" repeats an earlier one (the first occurrence is kept).
unique_train_entries = remove_duplicates(filtered_train_entries)
unique_test_entries = remove_duplicates(filtered_test_entries)
# Overwrite the original manifests in place with the cleaned entries.
save_jsonl_entries(train_jsonl_path, unique_train_entries)
save_jsonl_entries(test_jsonl_path, unique_test_entries)
# Copy the cleaned 'train.jsonl' to 'egs/train/data.jsonl'. The helper uses
# shutil.copy, so the source file also stays in the dataset folder.
move_and_rename_jsonl_file(train_jsonl_path, egs_train_folder, "data.jsonl")
# Copy the cleaned 'test.jsonl' to 'egs/eval/data.jsonl'.
move_and_rename_jsonl_file(test_jsonl_path, egs_eval_folder, "data.jsonl")
print("Finished cleaning, deduplicating, and moving JSONL files.")