DaehwanKimLab · chbe-helix · Feb 6, 2020 · Feb 6, 2020 · Feb 14, 2020 · Feb 17, 2020
diff --git a/Makefile b/Makefile
@@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \
 GENERAL_LIST = $(wildcard scripts/*.sh) \
 	$(wildcard scripts/*.pl) \
 	$(wildcard *.py) \
-	$(wildcard hisatgenotype_modules/*.py) \
-	$(wildcard hisatgenotype_scripts/*.py) \
 	$(wildcard example/index/*.ht2) \
 	$(wildcard example/reads/*.fa) \
 	example/reference/22_20-21M.fa \

diff --git a/hisat2_extract_exons.py b/hisat2_extract_exons.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <[email protected]>
@@ -19,8 +19,6 @@
 # along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-from __future__ import print_function
-
 from sys import stderr, exit
 from collections import defaultdict as dd, Counter
 from argparse import ArgumentParser, FileType

diff --git a/hisat2_extract_snps_haplotypes_UCSC.py b/hisat2_extract_snps_haplotypes_UCSC.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <[email protected]>
@@ -23,6 +23,7 @@
 import sys, subprocess
 import re
 from argparse import ArgumentParser, FileType
+from functools import cmp_to_key
 
 
 """
@@ -80,8 +81,8 @@ def compare_vars(a, b):
 
     # daehwan - for debugging purposes
     if a_chr != b_chr:
-        print a
-        print b
+        print(a)
+        print(b)
 
     assert a_chr == b_chr
     if a_pos != b_pos:
@@ -129,7 +130,7 @@ def generate_haplotypes(snp_file,
     assert len(vars) > 0
 
     # Sort variants and remove redundant variants
-    vars = sorted(vars, cmp=compare_vars)
+    vars = sorted(vars, key=cmp_to_key(compare_vars))
     tmp_vars = []
     v = 0
     while v < len(vars):
@@ -223,8 +224,8 @@ def generate_haplotypes(snp_file,
         else:
             assert type == 'I'
             type = "insertion"
-        print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \
-            (varID, type, chr, pos, data)
+        print("%s\t%s\t%s\t%s\t%s" % (varID, type, chr, pos, data),
+                file=snp_file)
 
     # genotypes_list looks like
     #    Var0: 0
@@ -270,7 +271,7 @@ def split_haplotypes(haplotypes):
                     split_haplotypes.add('#'.join(haplotype[prev_s:s]))
         return split_haplotypes
 
-    haplotypes2 = split_haplotypes(haplotypes)
+    haplotypes2 = sorted(list(split_haplotypes(haplotypes)))
 
     def cmp_haplotype(a, b):
         a = a.split('#')
@@ -288,8 +289,8 @@ def cmp_haplotype(a, b):
         if a_begin != b_begin:
             return a_begin - b_begin
         return a_end - b_end
-    
-    haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype)
+
+    haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype))
 
     # Write haplotypes
     for h_i in range(len(haplotypes)):
@@ -317,8 +318,8 @@ def cmp_haplotype(a, b):
         for id in h:
             var_dic = vars[int(id)][4]
             h_add.append(var_dic["id2"])
-        print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add))
+        print("ht%d\t%s\t%d\t%d\t%s" % (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)),
+                file=haplotype_file)
         num_haplotypes += 1
 
     return num_haplotypes
@@ -352,6 +353,7 @@ def main(genome_file,
     else:
         snp_cmd = ["cat", snp_fname]
     snp_proc = subprocess.Popen(snp_cmd,
+                                text=True,
                                 stdout=subprocess.PIPE,
                                 stderr=open("/dev/null", 'w'))
     ids_seen = set()
@@ -447,10 +449,10 @@ def main(genome_file,
                 if testset:
                     ref_seq = chr_seq[start-50:start+50]
                     alt_seq = chr_seq[start-50:start] + allele + chr_seq[start+1:start+50]
-                    print >> ref_testset_file, ">%s_single_%d" % (rs_id, start - 50)
-                    print >> ref_testset_file, ref_seq
-                    print >> alt_testset_file, ">%s_single_%d_%s" % (rs_id, start - 50, ref_seq)
-                    print >> alt_testset_file, alt_seq
+                    print(">%s_single_%d" % (rs_id, start - 50), file=ref_testset_file)
+                    print(ref_seq, file=ref_testset_file)
+                    print(">%s_single_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                    print(alt_seq, file=alt_testset_file)
 
         elif classType == "deletion":
             if start > 0:
@@ -475,10 +477,10 @@ def main(genome_file,
             if testset and delLen > 0 and delLen <= 10:
                 ref_seq = chr_seq[start-50:start+50]
                 alt_seq = chr_seq[start-50:start] + chr_seq[start+delLen:start+50+delLen]
-                print >> ref_testset_file, ">%s_deletion_%d" % (rs_id, start - 50)
-                print >> ref_testset_file, ref_seq
-                print >> alt_testset_file, ">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq)
-                print >> alt_testset_file, alt_seq
+                print(">%s_deletion_%d" % (rs_id, start - 50), file=ref_testset_file)
+                print(ref_seq, file=ref_testset_file)
+                print(">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                print(alt_seq, file=alt_testset_file)
         else:
             assert classType == "insertion"
             if start > 0:
@@ -497,10 +499,10 @@ def main(genome_file,
                     if testset and insLen > 0 and insLen <= 10:
                         ref_seq = chr_seq[start-50:start+50]
                         alt_seq = chr_seq[start-50:start] + allele + chr_seq[start:start+50-insLen]
-                        print >> ref_testset_file, ">%s_insertion_%d" % (rs_id, start - 50)
-                        print >> ref_testset_file, ref_seq
-                        print >> alt_testset_file, ">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq)
-                        print >> alt_testset_file, alt_seq
+                        print(">%s_insertion_%d" % (rs_id, start - 50), file=ref_testset_file)
+                        print(ref_seq, file=ref_testset_file)
+                        print(">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                        print(alt_seq, file=alt_testset_file)
 
         if curr_right < end:
             curr_right = end

diff --git a/hisat2_extract_snps_haplotypes_VCF.py b/hisat2_extract_snps_haplotypes_VCF.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Copyright 2016, Daehwan Kim <[email protected]>
 #
@@ -21,6 +21,7 @@
 
 import sys, os, subprocess
 from argparse import ArgumentParser, FileType
+from functools import cmp_to_key
 
 digit2str = [str(i) for i in range(10)]
 
@@ -100,11 +101,11 @@ def extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID):
         ref_allele2, pos2 = ref_allele, pos
 
         if chr_seq[pos:pos+len(ref_allele)] != ref_allele:
-            print >> sys.stderr, "Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele)
+            print("Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele), file=sys.stderr)
 
         def warning_msg():
-            print >> sys.stderr, "Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \
-                (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1)
+            print("Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \
+                (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1), file=sys.stderr)
 
         min_len = min(len(ref_allele2), len(alt_allele2))
         if min_len >= 2:
@@ -170,7 +171,7 @@ def generate_haplotypes(snp_file,
     assert len(vars) > 0
 
     # Sort variants and remove redundant variants
-    vars = sorted(vars, cmp=compare_vars)
+    vars = sorted(vars, key=cmp_to_key(compare_vars))
     tmp_vars = []
     v = 0
     while v < len(vars):
@@ -203,8 +204,8 @@ def generate_haplotypes(snp_file,
         else:
             assert type == 'I'
             type = "insertion"
-        print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \
-            (varID, type, chr, pos, data)
+        print("%s\t%s\t%s\t%s\t%s" % \
+            (varID, type, chr, pos, data), file=snp_file)
 
     # variant compatibility
     vars_cmpt = [-1 for i in range(len(vars))]
@@ -363,7 +364,7 @@ def split_haplotypes(haplotypes):
                     split_haplotypes.add('#'.join(haplotype[prev_s:s]))
         return split_haplotypes
 
-    haplotypes2 = split_haplotypes(haplotypes)
+    haplotypes2 = sorted(list(split_haplotypes(haplotypes)))
 
     def cmp_haplotype(a, b):
         a = a.split('#')
@@ -382,7 +383,7 @@ def cmp_haplotype(a, b):
             return a_begin - b_begin
         return a_end - b_end
 
-    haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype)
+    haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype))
 
     # daehwan - for debugging purposes
     """
@@ -424,8 +425,8 @@ def cmp_haplotype(a, b):
         for id in h:
             var_dic = vars[int(id)][4]
             h_add.append(var_dic["id2"])
-        print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add))
+        print("ht%d\t%s\t%d\t%d\t%s" % \
+            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)), file=haplotype_file)
         num_haplotypes += 1
 
     return num_haplotypes
@@ -464,6 +465,7 @@ def main(genome_file,
         else:
             vcf_cmd = ["cat", genotype_vcf]
         vcf_proc = subprocess.Popen(vcf_cmd,
+                                    text=True,
                                     stdout=subprocess.PIPE,
                                     stderr=open("/dev/null", 'w'))
         for line in vcf_proc.stdout:
@@ -525,17 +527,17 @@ def main(genome_file,
 
                 var_set.add(var_str)
 
-        print >> sys.stderr, "Number of variants in %s is:" % (genotype_vcf)
+        print("Number of variants in %s is:" % (genotype_vcf), file=sys.stderr)
         for chr, vars in genotype_var_list.items():
-            vars = sorted(vars, cmp=compare_vars)
+            vars = sorted(vars, key=cmp_to_key(compare_vars))
-            print >> sys.stderr, "\tChromosome %s: %d variants" % (chr, len(vars))
+            print("\tChromosome %s: %d variants" % (chr, len(vars)), file=sys.stderr)
 
         for chr, gene_ranges in genotype_ranges.items():
             for gene, value in gene_ranges.items():
                 gene_ranges[gene] = [value[0] - 100, value[1] + 100]
                 value = genotype_ranges[chr][gene]
                 if verbose:
-                    print >> sys.stderr, "%s\t%s\t%d-%d" % (chr, gene, value[0], value[1])
+                    print("%s\t%s\t%d-%d" % (chr, gene, value[0], value[1]), file=sys.stderr)
 
         if extra_files or True:
             clnsig_file = open("%s.clnsig" % base_fname, 'w')
@@ -544,7 +546,7 @@ def main(genome_file,
                     varID = var[4]["id2"]
                     CLNSIG = var[4]["CLNSIG"]
                     gene = var[4]["gene"]
-                    print >> clnsig_file, "%s\t%s\t%s" % (varID, gene, CLNSIG)
+                    print("%s\t%s\t%s" % (varID, gene, CLNSIG), file=clnsig_file)
             clnsig_file.close()
 
     SNP_file = open("%s.snp" % base_fname, 'w')
@@ -558,7 +560,7 @@ def main(genome_file,
                 left, right = value
                 if reference_type == "gene":
                     left, right = 0, right - left
-                print >> ref_file, "%s\t%s\t%d\t%d" % (gene, chr, left, right)
+                print("%s\t%s\t%d\t%d" % (gene, chr, left, right), file=ref_file)
         ref_file.close()
 
         if reference_type == "gene":
@@ -567,10 +569,10 @@ def main(genome_file,
                 for gene, value in gene_ranges.items():
                     left, right = value
                     left, right = 0, right - left
-                    print >> backbone_file, ">%s" % (gene)
+                    print(">%s" % (gene), file=backbone_file)
                     backbone_seq = chr_dic[chr][value[0]:value[1]+1]
                     for s in range(0, len(backbone_seq), 60):
-                        print >> backbone_file, backbone_seq[s:s+60]
+                        print(backbone_seq[s:s+60], file=backbone_file)
             backbone_file.close()
         elif reference_type == "chromosome":
             first = True
@@ -604,6 +606,7 @@ def main(genome_file,
             else:
                 vcf_cmd = ["cat", VCF_fname]
             vcf_proc = subprocess.Popen(vcf_cmd,
+                                        text=True,
                                         stdout=subprocess.PIPE,
                                         stderr=open("/dev/null", 'w'))
 
@@ -665,7 +668,7 @@ def main(genome_file,
                 offset = 0
                 gene = None
                 if num_lines % 10000 == 1:
-                    print >> sys.stderr, "\t%s:%d\r" % (chr, pos),
+                    print("\t%s:%d\r" % (chr, pos), end="", file=sys.stderr)
 
                 if chr_genotype_ranges:
                     skip = True
@@ -883,6 +886,7 @@ def add_vars(pos,
             else:
                 vcf_cmd = ["cat", args.genotype_vcf]
             vcf_proc = subprocess.Popen(vcf_cmd,
+                                        text=True,
                                         stdout=subprocess.PIPE,
                                         stderr=open("/dev/null", 'w'))
             for line in vcf_proc.stdout:
@@ -900,7 +904,7 @@ def add_vars(pos,
             args.genotype_gene_list = args.genotype_gene_list.split(',')
 
         if len(args.genotype_gene_list) == 0:
-            print >> sys.stderr, "Error: please specify --genotype-gene-list."
+            print("Error: please specify --genotype-gene-list.", file=sys.stderr)
             sys.exit(1)
 
     else:

diff --git a/hisat2_extract_splice_sites.py b/hisat2_extract_splice_sites.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <[email protected]>
@@ -19,8 +19,6 @@
 # along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-from __future__ import print_function
-
 from sys import stderr, exit
 from collections import defaultdict as dd, Counter
 from argparse import ArgumentParser, FileType
@@ -105,18 +103,18 @@ def extract_splice_sites(gtf_file, verbose=False):
                 len(genes), sum(len(v) > 1 for v in genes.values())),
               file=stderr)
         print('transcripts: {}, transcript avg. length: {:.0f}'.format(
-                len(trans), sum(trans_lengths.elements())/len(trans)),
+                len(trans), sum(trans_lengths.elements())//len(trans)),
               file=stderr)
         print('exons: {}, exon avg. length: {:.0f}'.format(
                 sum(exon_lengths.values()),
-                sum(exon_lengths.elements())/sum(exon_lengths.values())),
+                sum(exon_lengths.elements())//sum(exon_lengths.values())),
               file=stderr)
         print('introns: {}, intron avg. length: {:.0f}'.format(
                 sum(intron_lengths.values()),
-                sum(intron_lengths.elements())/sum(intron_lengths.values())),
+                sum(intron_lengths.elements())//sum(intron_lengths.values())),
               file=stderr)
         print('average number of exons per transcript: {:.0f}'.format(
-                sum(exon_lengths.values())/len(trans)),
+                sum(exon_lengths.values())//len(trans)),
               file=stderr)