Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yc #8

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open

Yc #8

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions blue_plus/biosses_dataset/biosses.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# dataset name
name: BIOSSES

# description of this dataset.
description: A corpus of sentence pairs selected from the Biomedical Summarization Track Training
Dataset in the biomedical domain.

version: 1.0d

# The citation to use for this dataset.
citation: "Sogancioglu G, Ozturk H, Ozgur A. BIOSSES: a semantic sentence similarity estimation
system for the biomedical domain. Bioinformatics. 2017 Jul 12;33(14):i49-58."

# Homepages of the dataset
links:
# original dataset
train.tsv: http://pengyifan.com/tmp/BIOSSES/train.tsv
dev.tsv: http://pengyifan.com/tmp/BIOSSES/dev.tsv
test.tsv: http://pengyifan.com/tmp/BIOSSES/test.tsv
test_results.tsv: http://pengyifan.com/tmp/BIOSSES/test_results.tsv
# license information
# license.txt
# BERT version
# bert_train: bert_train.csv
# bert_dev: bert_dev.csv
# bert_test: bert_test.csv
# bert_test_results: bert_test_results.csv

121 changes: 119 additions & 2 deletions blue_plus/dataset.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
import yaml

import urllib
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
import os

class BaseDataset(object):
"""Abstract dataset class"""

def __init__(self, config_file):
print(config_file)
with open(config_file, encoding='utf8') as fp:
self.config = yaml.load(fp)
self.name = self.config['name'] if 'name' in self.config else ''
Expand Down Expand Up @@ -50,3 +54,116 @@ def evaluate(self, test_file, prediction_file, output_file):
results: string or pandas DataFrame that containing the evaluation results.
"""
raise NotImplementedError


class form_tsv(object):
    """Build BIOSSES train/test/dev TSV files from the raw Excel workbooks.

    The YAML configuration supplies the sentence-pair workbook
    (``links.path_pair``), the score workbook (``links.path_score``) and the
    output locations (``save_path``).

    NOTE(review): the configuration file and both workbooks are located by
    appending their configured path to ``os.getcwd()``, while ``save_path``
    entries are used exactly as written, so results depend on the working
    directory -- confirm this is intended.
    """

    # Order in which data_split() returns its pieces.
    _SPLIT_NAMES = ('train', 'test', 'dev', 'test_results')

    def __init__(self, config_file="/rowdata/biosses/biosses.yml"):
        """Load the YAML configuration.

        :param config_file: configuration path, appended verbatim to the
            current working directory (hence the leading slash).
        """
        config_file = os.getcwd() + config_file
        with open(config_file, encoding='utf8') as fp:
            # safe_load: yaml.load() without a Loader can construct arbitrary
            # objects and is rejected outright by PyYAML >= 6.
            self.config = yaml.safe_load(fp) or {}
        links = self.config.get('links') or {}
        self.save_path = self.config.get('save_path', '')
        self.description = self.config.get('description', '')
        # Normalise a missing key to '' so the emptiness checks in
        # biosses_format_data() catch it.
        self.path_pair = links.get('path_pair') or ''
        self.path_score = links.get('path_score') or ''
        self.data = None

    def biosses_format_data(self, col_index_pair=('s1', 's2'),
                            col_index_score=('score1', 'score2', 'score3',
                                             'score4', 'score5')):
        """Read both workbooks and merge them into one formatted frame.

        :param col_index_pair: column names to assign to the pairs workbook
        :param col_index_score: column names to assign to the scores workbook
        :return: pandas DataFrame with the columns genre, filename, year,
            old_index, source1, source2, sentence1, sentence2 and score,
            where score is the row-wise mean of the score columns
        :raises FileNotFoundError: if path_score or path_pair is not configured
        """
        if not self.path_score:
            raise FileNotFoundError(
                "path_score doesn't exist, please check yml configuration file")
        if not self.path_pair:
            raise FileNotFoundError(
                "path_pair doesn't exist, please check yml configuration file")

        new_col_index = ['genre', 'filename', 'year', 'old_index', 'source1',
                         'source2', 'sentence1', 'sentence2', 'score']

        # The first workbook column is consumed as the index and then dropped
        # in favour of a fresh 0..n-1 index.  read_excel has no ``drop``
        # parameter, so that keyword is no longer passed.
        data = pd.read_excel(os.getcwd() + self.path_pair, index_col=0, header=0)
        data.columns = list(col_index_pair)
        data = data.reset_index(drop=True)

        score = pd.read_excel(os.getcwd() + self.path_score, index_col=0, header=0)
        score.columns = list(col_index_score)
        score = score.reset_index(drop=True).mean(axis=1)

        # Scalars are broadcast over data.index, so the frame follows the real
        # number of sentence pairs instead of a hard-coded 100.
        rtn = pd.DataFrame(
            {
                'genre': 'GENRE',
                'filename': 'BIOSSES',
                'year': '1997',
                'old_index': data.index,
                'source1': 'BIOSSES',
                'source2': 'BIOSSES',
                'sentence1': data[data.columns[0]],
                'sentence2': data[data.columns[1]],
                'score': score,
            },
            index=data.index,
        )[new_col_index]
        self.data = rtn

        return self.data

    def data_split(self, data, p_train=0.7, p_test=0.14, random_state=None):
        """Shuffle ``data`` and cut it into train, test and dev parts.

        :param data: formatted frame returned by biosses_format_data
        :param p_train: fraction of rows used for training
        :param p_test: fraction of rows used for testing
            note: p_train + p_test < 1; the remaining rows form the dev set
        :param random_state: optional seed forwarded to DataFrame.sample so
            the shuffle can be reproduced
        :return: train data, test data, dev data, and a copy of the test
            set's score column
        :raises AssertionError: if p_train + p_test is not below 1
        """
        # Raised explicitly (rather than with ``assert``) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if not p_train + p_test < 1:
            raise AssertionError(
                "the percentages of training and testing data should be less than 100%")

        shuffled = data.sample(frac=1.0, random_state=random_state)
        shuffled = shuffled.reset_index(drop=True)

        data_count = shuffled.shape[0]
        train_count = int(np.floor(data_count * p_train))
        # test_count is the END position of the test slice, not its length.
        test_count = int(train_count + np.floor(data_count * p_test))

        train = shuffled.iloc[0:train_count].reset_index(drop=True)
        test = shuffled.iloc[train_count:test_count].reset_index(drop=True)
        dev = shuffled.iloc[test_count:].reset_index(drop=True)

        test_results = test['score'].copy()
        return train, test, dev, test_results

    def save_files(self):
        """Format, split and write the data to the configured ``save_path`` files.

        Nothing is written if any target file already exists: writing only
        some of the files would mix splits from two different shuffles.
        """
        data = self.biosses_format_data()
        splited_data = self.data_split(data, p_train=0.7, p_test=0.14)
        save_path = self.save_path or {}

        for path in save_path.values():
            if os.path.exists(path):
                print(path, "exists! ")
                return

        by_name = dict(zip(self._SPLIT_NAMES, splited_data))
        for i, (name, path) in enumerate(save_path.items()):
            # Prefer matching by key so reordering the YAML cannot swap files;
            # unknown keys fall back to the positional pairing.
            part = by_name[name] if name in by_name else splited_data[i]
            part.to_csv(path_or_buf=path, sep='\t')


4 changes: 4 additions & 0 deletions blue_plus/example_dataset/biosses_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,16 @@ def prepare_bert_format(self, input_file, output_file):
def main():
logging.basicConfig(level=logging.INFO)
dir = os.path.dirname(os.path.abspath(__file__))

print("dir:",dir)

d = BIOSSES_Dataset(os.path.join(dir, 'biosses.yml'))
print('Name: ', d.full_name)
print('Description:', d.description)
print('Citation: ', d.citation)

dir = Path('blue_plus_data') / d.full_name
print("dir:",dir)
dir.mkdir(parents=True, exist_ok=True)
d.download(override=True)
d.evaluate(dir / 'test.tsv', dir / 'test_results.tsv', dir / 'test_results.txt')
Expand Down
31 changes: 31 additions & 0 deletions blue_plus/rowdata/biosses/biosses.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# dataset name
name: BIOSSES_row_data

# description of this dataset.
description: This YAML file configures the processing of the raw BIOSSES data.

version: 1.0


# Homepages of the dataset
links:
# original dataset
train.tsv: /rowdata/biosses/train.tsv
dev.tsv: /rowdata/biosses/dev.tsv
test.tsv: /rowdata/biosses/test.tsv
test_results.tsv: /rowdata/biosses/test_results.tsv
path_pair: /rowdata/biosses/pairs.xls
path_score: /rowdata/biosses/scores.xls
# license information
# license.txt
# BERT version
# bert_train: bert_train.csv
# bert_dev: bert_dev.csv
# bert_test: bert_test.csv
# bert_test_results: bert_test_results.csv

save_path:
train: rowdata/biosses/train.tsv
test: rowdata/biosses/test.tsv
dev: rowdata/biosses/dev.tsv
test_results: rowdata/biosses/test_results.tsv
17 changes: 17 additions & 0 deletions blue_plus/rowdata/biosses/dev.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
genre filename year old_index source1 source2 sentence1 sentence2 score
0 GENRE BIOSSES 1997 78 BIOSSES BIOSSES In lung tumors, TRAF6 levels can become elevated by several mechanisms: "GATA2lox/lox Sporadic infection of lung cells with Adeno-Cre virus GATA2 loss induces regression of established tumors" 1.4
1 GENRE BIOSSES 1997 56 BIOSSES BIOSSES Alterations in Oct-4 expression promote differentiation and leads to the specification of ectodermal, endodermal or mesodermal primitive progenitors. Additionally, expression of OCT4 and SOX2 has been shown to affect early differentiation genes such as SOX-17 expression. 1.8
2 GENRE BIOSSES 1997 97 BIOSSES BIOSSES Three programs, PicTar, miRanda, and TargetScan , were used to predict the targets of miR-21. The genes that decreased 2-fold or more were further screened for possible miR-372/3 target sites using a local version of the TargetScan algorithm. 2.4
3 GENRE BIOSSES 1997 20 BIOSSES BIOSSES Up-regulation of miR-24 has been observed in a number of cancers, including OSCC. In addition, miR-24 is one of the most abundant miRNAs in cervical cancer cells, and is reportedly up-regulated in solid stomach cancers. 3.0
4 GENRE BIOSSES 1997 87 BIOSSES BIOSSES Centrosomes increase both in size and in microtubule-nucleating capacity just before mitotic entry. Functional studies showed that, when introduced into cell lines, miR-146a was found to promote cell proliferation in cervical cancer cells, which suggests that miR-146a works as an oncogenic miRNA in these cancers. 0.0
5 GENRE BIOSSES 1997 57 BIOSSES BIOSSES T47D, MCF-7, Skbr3, HeLa, and Caco-2 cells were transfected by electroporation as described previously. MCF7 or HeLa cells were electroporated as described previously to more than 95% efficiency with pSuper constructs against the various targets, and 72 hr later, protein expression was analyzed by SDS-PAGE and Western blotting. 3.0
6 GENRE BIOSSES 1997 81 BIOSSES BIOSSES Third, human Wts2 is a phosphorylation target of Aurora-A kinase, and this phosphorylation plays a role in regulating centrosomal localization of hWts2 ( Toji et al., 2004) Similarly to PLK1, Aurora-A activity is required for the enrichment or localisation of multiple centrosomal factors which have roles in maturation, including LATS2 [ 22] and CDK5RAP2/Cnn [ 23] (see [ 10] for a review) 1.2
7 GENRE BIOSSES 1997 0 BIOSSES BIOSSES It has recently been shown that Craf is essential for Kras G12D-induced NSCLC. It has recently become evident that Craf is essential for the onset of Kras-driven non-small cell lung cancer. 4.0
8 GENRE BIOSSES 1997 92 BIOSSES BIOSSES The cyclin-dependent kinase (CDK) inhibitor roscovitine has been reported to down-regulate the anti-apoptotic protein Mcl-1 Recent work in model systems and acute myelogenous leukemia has suggested that expression of MCL-1 is a key determinant of resistance to ABT-737 2.0
9 GENRE BIOSSES 1997 46 BIOSSES BIOSSES miR-Vec constructs were described before, and Dnd1 open-reading frames were cloned as described into a pCS2-based CMV expression vector to contain a double carboxy-terminal HA tag. The pMSCV-blast-miR plasmids, containing either hsa-miR-376a1 human miRNA or control miRNA (hTR-human telomerase RNA), were constructed as described previously. 1.0
10 GENRE BIOSSES 1997 49 BIOSSES BIOSSES Consequently miRNAs have been demonstrated to act either as oncogenes (e.g., miR-155, miR-17−5p and miR-21) or tumor suppressors (e.g., miR-34, miR-15a, miR-16−1 and let-7) Given the extensive involvement of miRNA in physiology, dysregulation of miRNA expression can be associated with cancer pathobiology including oncogenesis], proliferation, epithelial-mesenchymal transition, metastasis, aberrations in metabolism, and angiogenesis, among others 2.8
11 GENRE BIOSSES 1997 23 BIOSSES BIOSSES miR-223 regulates granulopoiesis by a feedback mechanism and is modulated competitively by the transcription factors nuclear factor I/A (NFI-A) and CCAAT/enhancer binding protein-α (C/EBPα) There is growing evidence from animal systems that miRNA-regulated transcription factors frequently regulate the transcription of their cognate miRNAs. 1.8
12 GENRE BIOSSES 1997 86 BIOSSES BIOSSES In PC9 cells, loss of GATA6 and/or HOPX did not alter cell growth, whereas reduction of GATA2 and EGFR inhibited cell viability as previously reported. Aurora-A is required for the correct localisation and function of centrosomal components like centrosomin, NDEL1, LATS and TACC proteins 0.2
13 GENRE BIOSSES 1997 6 BIOSSES BIOSSES Recently, it was reported that expression of IDH1R132H suppresses TET2 activity and the mutations of IDH1 and IDH2 genes occur in a mutual exclusive manner with that of TET2 gene in AML the mechanism was clarified by yet another genomic survey, this time involving acute myelogenous leukemia (AML). 1.6
14 GENRE BIOSSES 1997 73 BIOSSES BIOSSES Finally, researchers combined available inhibitors selective for two of the pathways regulated by GATA2 to treat mice with Kras-driven NSCLCs. In PC9 cells, loss of GATA6 and/or HOPX did not alter cell growth, whereas reduction of GATA2 and EGFR inhibited cell viability as previously reported. 1.8
15 GENRE BIOSSES 1997 3 BIOSSES BIOSSES More recently, IDH mutations and resultant 2-hydroxyglutarate (2HG) production in leukemia cells were reported to induce global DNA hypermethylation through impaired TET2 catalytic function. It has also been recently reported that mutations of the isocitrate dehydrogenase genes IDH1 and IDH2 can lead to the aberrant production of 2-hydroxyglutarate (2-HG), a metabolite that inhibits TET2 enzymatic activity, resulting in a hypermethylated promoter phenotype in acute myeloid leukemia (AML) tumors carrying IDH1/2 mutations. 3.2
Binary file added blue_plus/rowdata/biosses/pairs.xls
Binary file not shown.
Binary file added blue_plus/rowdata/biosses/scores.xls
Binary file not shown.
15 changes: 15 additions & 0 deletions blue_plus/rowdata/biosses/test.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
genre filename year old_index source1 source2 sentence1 sentence2 score
0 GENRE BIOSSES 1997 66 BIOSSES BIOSSES This oxidative branch activity is elevated in comparison to many cancer cell lines, where the oxidative branch is typically reduced and accounts for <20% of the carbon flow through PPP. The Downward laboratory went all the way from identifying GATA2 as a novel synthetic lethal gene to validating it using Kras-driven GEM models. 0.0
1 GENRE BIOSSES 1997 83 BIOSSES BIOSSES We found no obvious effect of LATS2-depletion on the Aurora-A kinase activity when monitored by phosphorylation state of Thr288 on Aurora-A [18] (Fig. S2), suggesting that LATS2 may be a downstream of Aurora-A as mentioned in a previous report [8] Among them, miR-143, miR-145 and miR-34a have been shown to inhibit cell proliferation, and miR-146a and miR-21 to increase cell growth 0.2
2 GENRE BIOSSES 1997 14 BIOSSES BIOSSES Several computational target prediction approaches, such as TargetScan, PicTar, miRanda, PITA, DIANA-microT and RNAhybrid, have been developed to predict target genes. Three programs, PicTar, miRanda, and TargetScan , were used to predict the targets of miR-21. 2.4
3 GENRE BIOSSES 1997 74 BIOSSES BIOSSES They identified that some genes involved in RHO-related signaling pathways were occupied by GATA2 in KRAS mutant but not wild-type tumor cells. The researchers combined available inhibitors selective for two of the pathways regulated by GATA2 to treat mice with Kras-driven NSCLCs 2.0
4 GENRE BIOSSES 1997 63 BIOSSES BIOSSES Expression of an activated form of Ras proteins can induce senescence in some primary fibroblasts. The senescent state has been observed to be inducible in certain cultured cells in response to high level expression of genes such as the activated ras oncogene. 3.6
5 GENRE BIOSSES 1997 22 BIOSSES BIOSSES Furthermore, a very recent study demonstrated the mechanism that specifies myeloid expression of miR-223 and proposed a unique “minicircuitry” comprised of miR-223 and transcription factors, NFI-A and CAAT enhancer-binding protein α (C/EBPα) miR-223 regulates granulopoiesis by a feedback mechanism and is modulated competitively by the transcription factors nuclear factor I/A (NFI-A) and CCAAT/enhancer binding protein-α (C/EBPα). 3.4
6 GENRE BIOSSES 1997 38 BIOSSES BIOSSES In eukaryotic cells, small G-proteins are critically regulated by Guanine nucleotide Exchange Factors (GEFs) and GTPase Activating Proteins (GAPs). Eukaryotic small G-proteins are often controlled through the balancing actions of GAPs and GEFs. 4.0
7 GENRE BIOSSES 1997 12 BIOSSES BIOSSES Furthermore, transiently expressed exogenous LATS2 is localized to centrosomes. LATS1 and LATS2 have been detected on interphase and mitotic centrosomes 3.0
8 GENRE BIOSSES 1997 59 BIOSSES BIOSSES A few studies have reported the control of APC/C by genotoxic stress in mammalian cells, Ionizing radiation was shown to activate the APC/C to degrade cyclin D1, which triggers an immediate p53-independent G1 arrest. In fact, genotoxic stresses such as ionizing radiation have been demonstrated to trigger rapid proteolysis of cyclin D1, leading to p53-independent G1 arrest. 3.8
9 GENRE BIOSSES 1997 18 BIOSSES BIOSSES Recently, miR-126 was identified as a metastasis suppressing miRNA that is downregulated in relapsing breast cancer, leukemia, and cervical cancer. Subsequent reports showed that miR-126 targeted the oncogene IRS-1 (insulin receptor substrate-1) in breast cancer cells and miR-126 was downregulated in cervical cancer. 3.2
10 GENRE BIOSSES 1997 33 BIOSSES BIOSSES The oncogenic activity of mutant Kras appears dependent on functional Craf, but not on Braf Notably, c-Raf has recently been found essential for development of K-Ras-driven NSCLCs 3.0
11 GENRE BIOSSES 1997 68 BIOSSES BIOSSES Considerable evidence indicates that cancer cells develop dependencies on normal functions of certain genes that can potentially be exploited to improve therapeutic strategies. In the case of cell response to stress, cyclin D1 can be degraded through its binding to the anaphase-promoting complex and a RXXL sequence located in the NH2-terminal part of the protein. 0.0
12 GENRE BIOSSES 1997 70 BIOSSES BIOSSES A recent study identified the importance of the GATA2 transcriptional network in RAS oncogene-driven NSCLC and suggested effective combinations targeting the proteasome together with IL-1 and Rho-signalling. Alternatively, the anaphase promoting complex (APC) is responsible for the rapid degradation of cyclin D1 in cells irradiated with ionizing radiation. 0.2
13 GENRE BIOSSES 1997 79 BIOSSES BIOSSES GATA2 is also of considerable interest; genetic ablation leads to tumor regression in mouse models of adenocarcinoma of the lung, and whereas this transcription factor may appear to be the least druggable of targets, its role in regulating the proteasome suggested therapeutic approaches that appear very promising The GATA2 transcription factor, which is essential for oncogenic K-ras-dependent lung tumor development, binds to the TRAF6 promoter and enhances its expression. 1.8
19 changes: 19 additions & 0 deletions blue_plus/rowdata/biosses/test_dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pandas as pd

# Scratch script: builds a five-row frame, shuffles it, prints it, resets the
# row labels, prints each value of the 'index' column, and writes the frame
# to sssss.tsv as tab-separated text.
index = [2, 4, 8, 9, 10]
col = ['index', 'B']

da = pd.DataFrame(data=None, columns=col)
da['index'] = index

# Shuffle all rows; the original row labels travel with their rows.
da = da.reindex().sample(frac=1.0)
print(da)

# Replace the shuffled labels with a fresh 0..n-1 index.
da = da.reset_index(drop=True)

# One value per line, in the shuffled order.
print(*da['index'], sep='\n')

da.to_csv('sssss.tsv', sep='\t')
Loading