-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataclass.py
executable file
·75 lines (72 loc) · 2.92 KB
/
dataclass.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
# encoding: utf-8
# Benjamin Imlay
import pandas as pd
from pathlib import Path
class expressionData():
    """
    Container for a raw count matrix and its sample/subject metadata.

    Attributes set by the constructor:
        rawCounts:  DataFrame returned by readRawCounts (one row per sample).
        subjMeta:   DataFrame returned by readMeta for the subject file.
        sampleMeta: sample metadata, ordered to match the rows of rawCounts
                    and left-merged with subjMeta on SUBJID.
    """

    def __init__(self, data_dir, manifest):
        """
        Load the count matrix and both metadata tables, then align them.
        Input:
            data_dir: pathlib.Path of the directory holding the input files.
            manifest: dict with the keys "data", "sample_meta" and
                      "subject_meta", each naming a file inside data_dir.
        """
        self.rawCounts = self.readRawCounts(data_dir / manifest["data"])
        self.subjMeta = self.readMeta(data_dir / manifest["subject_meta"])
        self.sampleMeta = self.readMeta(data_dir / manifest["sample_meta"])
        # Order matters: the metadata is first aligned to the count matrix,
        # then the subject columns are attached to the aligned table.
        self.sampleMeta = self.orderMeta()
        self.sampleMeta = self.mergeSubjSamp()

    def orderMeta(self):
        """
        This function trims and orders the sample metadata so that its rows
        align with the rows of the raw count matrix.
        Input:
            self.rawCounts (its index holds the sample ids)
            self.sampleMeta (must contain a SAMPID column)
        Output:
            DataFrame of ordered sample metadata. Samples without a metadata
            entry are kept, with missing values in the metadata columns.
        """
        sample_ids = pd.DataFrame({'SAMPID': pd.Series(self.rawCounts.index)})
        # A left merge keeps the key order of the left frame, i.e. the row
        # order of the count matrix, and drops metadata-only samples.
        meta = sample_ids.merge(self.sampleMeta, how='left', on='SAMPID')
        print("Trimmed and Ordered metadata to dimensions: ", meta.shape)
        return meta

    def mergeSubjSamp(self):
        """
        This function derives the subject id of every sample from its sample
        id and merges the subject metadata into the sample metadata.
        Input:
            self.sampleMeta
            self.subjMeta (must contain a SUBJID column)
        Output:
            DataFrame of sample metadata, in the same row order, with a
            SUBJID column and the subject metadata columns appended.
            self.sampleMeta itself is left unmodified.
        """
        sample_meta = self.sampleMeta.copy()
        if "SAMPID" in sample_meta.columns:
            # The frame produced by orderMeta carries a positional integer
            # index, so the sample ids have to come from the SAMPID column.
            sample_ids = sample_meta["SAMPID"]
        else:
            # Fallback for metadata that is indexed by sample id.
            sample_ids = pd.Series(sample_meta.index, index=sample_meta.index)
        sample_meta["SUBJID"] = self._subject_ids(sample_ids)
        return sample_meta.merge(self.subjMeta, on="SUBJID", how="left")

    @staticmethod
    def _subject_ids(sample_ids):
        """
        Return the subject prefix (first two dash-separated fields) of each
        sample id, e.g. "GTEX-AAAA-0226-SM-11111" -> "GTEX-AAAA".
        Ids with fewer than two fields yield a missing value.
        """
        # Anchoring on the leading fields keeps the prefix correct no matter
        # how many trailing fields a sample id carries.
        # NOTE(review): assumes subject ids are always exactly two fields --
        # confirm against the subject metadata file.
        return sample_ids.str.extract(r"^([^-]+-[^-]+)", expand=False)

    def readMeta(self, data_path):
        """
        This function reads a raw GTEx metadata file in tsv format.
        Input:
            Path (or str) to the metadata file.
        Output:
            Pandas dataframe of the metadata.
        """
        data = pd.read_csv(data_path, sep="\t")
        print("Imported metadata of dimensions", data.shape)
        return data

    def readRawCounts(self, data_path):
        """
        This function reads raw counts in the gct format, which has two lines
        of metadata, and two columns for gene name.
        Input:
            Path (or str) to the .gct file.
        Output:
            Pandas dataframe of the transposed count matrix: one row per
            remaining file column, one column per (first, second) gene-name
            pair, held as a MultiIndex.
        """
        data = pd.read_csv(data_path, sep="\t", skiprows=2, index_col=[0, 1])
        # Transpose once and reuse it for both the report and the result.
        counts = data.T
        print("Imported raw count data of dimensions", counts.shape)
        return counts
def main():
    """Placeholder entry point; it performs no work and returns None."""
    return None
if __name__ == '__main__':
    # Input files are looked up in ./data, relative to the working directory.
    data_dir = Path("data")
    manifest = dict(
        data="All_Tissue_Site_Details.combined.reads.gct",
        sample_meta="GTEx_v7_Annotations_SampleAttributesDS.txt",
        subject_meta="GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
    )
    data = expressionData(data_dir, manifest)
    # Write the merged sample/subject metadata next to the inputs as a tsv.
    merged_meta_path = data_dir.joinpath("merged_meta.tsv")
    data.sampleMeta.to_csv(merged_meta_path, sep="\t")