-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
54 lines (45 loc) · 2.43 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import collections
import html
import os

import onco_lens_ner as lens
# Background colour (CSS hex code) used to highlight each entity tag.
BG_COLOR = {
    "TRT": "#997a8d",
    "SYM": "#db7093",
    "MET": "#ffb6c1",
    "CANC_T": "#e68fac",
    "SIZE": "#fc89ac",
    "EMO": "#f78fa7",
    "PPL": "#dea5a4",
    "MED": "#e18e96",
    "MHD": "#ff91af",
    "ORG": "#ff91a4",
    "ADV_EFF": "#f19cbb",
    "INV": "#efbbcc",
    "POB": "#F9CBCB",
    "EGY": "#e8ccd7",
    "DUR": "#f7bfbe",
    "AGE": "#c4c3d0",
    "GENDER": "#ffc1cc",
    "STG": "#aa98a9",
    "EXP": "#d98695",
    "A/G": "#dea5a4",
    "RES": "#cc8899",
    "DIAG": "#fc6c85",
    "GPE": "#c9c0bb",
    "NUM": "#e5ccc9",
}
# format a typical entity for display
def format_entity(token, tag):
    """Render one chunk of text as HTML, highlighting it when it has a tag.

    Args:
        token: The chunk of text to display.
        tag: The entity label of the chunk, or a falsy value (``None`` or
            ``""``) when the chunk is ordinary, untagged text.

    Returns:
        For an untagged chunk, the HTML-escaped text on its own. For a tagged
        chunk, a newline followed by a ``<mark>`` element that wraps the
        escaped text and a small bold ``<span>`` holding the label.
    """
    # The text is inserted into markup, so a literal "<" or "&" has to be
    # escaped to be shown as text instead of being parsed as HTML.
    text = html.escape(str(token), quote=False)
    if not tag:
        return text

    # A label without an entry in BG_COLOR gets a neutral grey rather than
    # aborting the whole rendering with a KeyError.
    background = BG_COLOR.get(tag, "#ddd")
    label = html.escape(str(tag), quote=False)
    start_mark = f'<mark class="entity" style="background: {background}; padding: 0.4em 0.4em; margin: 0 0.25em; line-height: 0.8; border-radius: 0.25em;">'
    end_mark = '\n</mark>'
    start_span = '<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">'
    end_span = '\n</span>'
    return f"\n{start_mark}{text}{start_span}{label}{end_span}{end_mark}"
# extract all known entities into a list
def get_token_tags(txtstr, entities):
    """Split ``txtstr`` into consecutive ``(chunk, tag)`` pairs.

    Args:
        txtstr: The full text the entities refer to.
        entities: Mapping of start offset -> 3-item value
            ``(length, ent, tag)``. Only ``length`` and ``tag`` are used
            here; the middle item is ignored.

    Returns:
        A list of ``(chunk, tag)`` tuples in text order. Every kept entity
        contributes the untagged text in front of it (possibly ``""``) with
        tag ``None``, then its own text with its tag. The final item is the
        untagged remainder of the text (possibly ``""``).
    """
    begin, tokens_tags = 0, []
    # Walk the entities by start offset instead of relying on the mapping's
    # own order: with an unsorted mapping, later-inserted but earlier-placed
    # entities would otherwise be dropped.
    for start in sorted(entities):
        length, _ent, tag = entities[start]
        if start < begin:
            # Starts inside the entity that was emitted before it: skip it so
            # no part of the text is output twice.
            continue
        tokens_tags.append((txtstr[begin:start], None))
        tokens_tags.append((txtstr[start:start + length], tag))
        begin = start + length
    tokens_tags.append((txtstr[begin:], None))  # add the last untagged chunk
    return tokens_tags
def visualize(text, ents):
    """Build the HTML that shows ``text`` with its entities highlighted.

    The text is split with ``get_token_tags``, each chunk is rendered with
    ``format_entity``, and the results are wrapped in a single
    ``<div class="entities">`` element.
    """
    pieces = ['<div class="entities" style="line-height: 2.3; direction: ltr">']
    pieces.extend(
        format_entity(chunk, label) for chunk, label in get_token_tags(text, ents)
    )
    pieces.append('\n</div>')
    return "".join(pieces)
# merge two entities
def merge_entities(first_ents, second_ents):
    """Combine two entity mappings into one ``OrderedDict`` sorted by key.

    When both mappings contain the same key, the value from ``first_ents``
    is the one that is kept.
    """
    combined = {**second_ents, **first_ents}
    merged = collections.OrderedDict()
    for key in sorted(combined):
        merged[key] = combined[key]
    return merged
# show the plain, unformatted text
def show_plain_text(txtstr):
    """Wrap the original, unformatted text in a white ``<mark>`` element.

    Args:
        txtstr: The text to display.

    Returns:
        The HTML-escaped text enclosed in a ``<mark class="entity">`` element
        with a white background, with a newline before the closing tag.
    """
    start_mark = '<mark class="entity" style="background: #FFFFFF; line-height: 2; border-radius: 0.35em;">'
    end_mark = '\n</mark>'
    # Escape the text so a literal "<" or "&" in the document is displayed
    # as written instead of being interpreted as markup.
    body = html.escape(str(txtstr), quote=False)
    return f"{start_mark}{body}{end_mark}"
#============================= Locating the example texts ===================================
# Directory with the example documents; the path is relative, so it is
# resolved against the current working directory.
EXAMPLES_DIR = os.path.join('resources', 'example_texts')
# Sorted names of the '.txt' files found in that directory. The directory is
# read once, when this module is loaded.
example_files = sorted(
    name for name in os.listdir(EXAMPLES_DIR) if name.endswith('.txt')
)