-
Notifications
You must be signed in to change notification settings - Fork 0
/
kmer_counts.py
41 lines (32 loc) · 1 KB
/
kmer_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 18 15:29:54 2019
@author: YudongCai
@Email: [email protected]
"""
import click
import khmer
import pandas as pd
from itertools import product
@click.command()
@click.option('--infile', help='input fasta or fastq file')
@click.option('--k', help='[k]mer', type=int)
@click.option('--outfile', help='output file')
def main(infile, k, outfile):
kmerlist = []
for kmer in product('ACGT', repeat=k):
kmerlist.append(''.join(kmer))
df = []
seqids = []
for seq in khmer.ReadParser(infile):
seqids.append(seq.name)
df.append([])
counts = khmer.Counttable(k, 1e6, 4)
counts.set_use_bigcount(True)
counts.consume(seq.sequence.upper())
for kmer in kmerlist:
df[-1].append(counts.get(kmer))
df = pd.DataFrame(df, columns=kmerlist, index=seqids)
df.T.reset_index().to_csv(outfile, index=False, sep='\t')
if __name__ == '__main__':
main()