-
Notifications
You must be signed in to change notification settings - Fork 9
/
utils.py
74 lines (68 loc) · 2.72 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import json
from collections import defaultdict
import re
def find_all_keys_values(json_data,parent_key):
"""
Find all keys that don't have list or dictionary values and their values. Key should be saved with its parent key like "parent-key.key".
"""
key_values = defaultdict(list)
for key, value in json_data.items():
if isinstance(value, dict):
child_key_values = find_all_keys_values(value,key)
for child_key, child_value in child_key_values.items():
key_values[child_key].extend(child_value)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
child_key_values = find_all_keys_values(item,key)
for child_key, child_value in child_key_values.items():
key_values[child_key].extend(child_value)
else:
key_values[parent_key+"."+key].append(item)
else:
key_values[parent_key+"."+key].append(value)
return key_values
def make_csv_from_json(file_path):
"""
Make csv file from json file.
"""
if file_path.endswith(".json"):
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
elif file_path.endswith(".jsonl"):
data = []
with open(file_path, 'r') as json_file:
json_list = list(json_file)
for json_str in json_list:
data.append(json.loads(json_str))
# find key_values
if isinstance(data, dict):
key_values = find_all_keys_values(data,"")
elif isinstance(data, list):
key_values = find_all_keys_values({"TOPLEVEL":data},"TOPLEVEL")
else:
raise ValueError('Your input JsonData is not a dictionary or list')
key_values = {k.replace("TOPLEVEL.",""):v for k,v in key_values.items() if len(v)>1}
df = pd.DataFrame({k:pd.Series(v) for k,v in key_values.items()})
# save to csv
save_pth = re.sub(r'\.jsonl?','.csv',file_path)
df.to_csv(save_pth, index=False, encoding='utf-8')
return df
def table_column_filter(table_df):
"""
Filter columns that have zero instances or all columns are "--"
"""
original_columns = table_df.columns
for column in table_df.columns:
column_data = [d for d in list(table_df[column]) if d == d and d != "--"]
if len(column_data) <= 1:
table_df = table_df.drop(column, axis=1)
continue
if "Unnamed:" in column:
table_df = table_df.drop(column, axis=1)
continue
remove_columns = list(set(original_columns) - set(table_df.columns))
if len(remove_columns) > 0:
print("Removed columns:", remove_columns)
return table_df