# print_dataset_details.py
__author__ = "Andrea Galassi"
__copyright__ = "Copyright 2018-2020 Andrea Galassi"
__license__ = "BSD 3-clause"
__version__ = "0.2.0"
__email__ = "[email protected]"
"""
Code to print the details of a dataframe from a specific corpus
"""
import os
import pandas
import json
import random
import sys
import ast
import numpy as np
import argparse
def print_dataframe_details(dataframe_path):
    """
    Print summary statistics of a pickled dataframe to standard output.

    The report contains, in order: the first rows of the dataframe, the
    total number of rows (relations), the value counts of the
    'source_to_target' and 'relation_type' columns, the number of distinct
    'text_ID' and 'source_ID' values, the number and 'source_type'
    distribution of the distinct ('source_ID', 'source_type') pairs, and
    finally the list of distinct 'text_ID' values.

    :param dataframe_path: path of the pickle file holding the dataframe
    :return: None
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    frame = pandas.read_pickle(dataframe_path)

    print(frame.head())
    print()

    print('total relations')
    print(len(frame))
    print()

    # Distribution of the values of the link and relation columns
    for counted_column in ('source_to_target', 'relation_type'):
        print(frame[counted_column].value_counts())
        print()

    # Number of distinct identifiers
    for id_column in ('text_ID', 'source_ID'):
        print(id_column)
        print(len(frame[id_column].drop_duplicates()))
        print()

    # Each (source_ID, source_type) pair is counted only once
    unique_sources = frame[['source_ID', 'source_type']].drop_duplicates()
    print(len(unique_sources))
    print(unique_sources['source_type'].value_counts())

    print("LIST OF DOCUMENT IDs")
    print('text_ID')
    print(list(frame['text_ID'].drop_duplicates()))
def print_details(dataset_name, dataset_version):
    """
    Print the details of every split of a dataset version.

    For each of the 'train', 'test', 'validation' and 'total' splits, the
    split name is printed, followed by the report of the dataframe stored in
    <cwd>/Datasets/<dataset_name>/pickles/<dataset_version>/<split>.pkl and
    by two separator lines.

    :param dataset_name: name of the dataset folder inside 'Datasets'
    :param dataset_version: name of the version folder inside 'pickles'
    :return: None
    """
    pickles_dir = os.path.join(os.getcwd(), 'Datasets', dataset_name,
                               'pickles', dataset_version)
    separator = '_______________________'

    for split_name in ('train', 'test', 'validation', 'total'):
        print(split_name)
        print_dataframe_details(os.path.join(pickles_dir, split_name + '.pkl'))
        print(separator)
        print(separator)
# Maps each supported corpus key to its (dataset folder name, pickle version).
# "echr" is accepted on the command line but has no entry here yet.
_DATASETS_BY_CORPUS = {
    "rct": ("RCT", "total"),
    "cdcp": ("cdcp_ACL17", "new_3"),
    "drinv": ("DrInventor", "arg10"),
    "ukp": ("AAEC_v2", "new_2"),
    "scidtb": ("scidtb_argmin_annotations", "only_arg_v1"),
}


def main(argv=None):
    """
    Parse the command line and print the details of the chosen corpus.

    :param argv: list of command-line arguments; when None, argparse reads
        them from sys.argv
    :return: None
    :raises SystemExit: if the arguments are invalid, or if the chosen corpus
        has no dataset associated with it
    """
    parser = argparse.ArgumentParser(description="Print the details of a corpus dataframe")
    parser.add_argument('-c', '--corpus',
                        choices=["rct", "drinv", "cdcp", "echr", "ukp", "scidtb"],
                        help="Corpus", default="cdcp")
    args = parser.parse_args(argv)
    corpus = args.corpus.lower()

    # Stop here instead of falling through to print_details with undefined
    # dataset names, which used to end in a NameError.
    if corpus not in _DATASETS_BY_CORPUS:
        sys.exit("Dataset not yet supported: " + corpus)

    dataset_name, dataset_version = _DATASETS_BY_CORPUS[corpus]
    print_details(dataset_name, dataset_version)


if __name__ == '__main__':
    main()