naobservatory · simonleandergrimm · Nov 29, 2023 · Nov 29, 2023 · Nov 29, 2023 · Nov 29, 2023
diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
@@ -0,0 +1,12 @@
+name: Presubmit  # workflow that runs the repository checks
+on: [push]  # the only trigger configured is the push event
+jobs:
+  Presubmit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/setup-python@v4  # provide the Python version pinned below
+        with:
+            python-version: '3.11'
+      - uses: actions/checkout@v3  # fetch the repository so the next steps can read its files
+      - run: python3 -m pip install -r requirements-dev.txt  # install the dev tools listed in requirements-dev.txt
+      - run: ./check.sh  # run the repo's check script (its contents are not part of this patch)
diff --git a/bioprojects/PRJEB13833/metadata/prepare-metadata.py b/bioprojects/PRJEB13833/metadata/prepare-metadata.py
@@ -6,12 +6,16 @@
             if line.startswith("run"):
                 continue
 
-            run_accession, sample_accession, sample_alias, sample_title = line.split(
-                "\t"
-            )
+            (
+                run_accession,
+                sample_accession,
+                sample_alias,
+                sample_title,
+            ) = line.split("\t")
 
             _, yyyy, mm, dd, site = sample_title.split("_")
 
             outf.write(
-                "%s\t%s-%s-%s\tCluster %s\n" % (run_accession, yyyy, mm, dd, site)
+                "%s\t%s-%s-%s\tCluster %s\n"
+                % (run_accession, yyyy, mm, dd, site)
             )
diff --git a/bioprojects/PRJEB49260/metadata/prepare_metadata.py b/bioprojects/PRJEB49260/metadata/prepare_metadata.py
@@ -26,4 +26,6 @@
             dd = date[2:4]
             yy = date[4:6]
 
-            outf.write("%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location))
+            outf.write(
+                "%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location)
+            )
diff --git a/bioprojects/PRJNA729801/metadata/parse_metadata.py b/bioprojects/PRJNA729801/metadata/parse_metadata.py
@@ -39,7 +39,9 @@ def start(raw_metadata_in, parsed_metadata_out):
     data.sort()
 
     with open(parsed_metadata_out, "w") as outf:
-        outf.write("\t".join(["filename", "date", "plant", "is_enriched"]) + "\n")
+        outf.write(
+            "\t".join(["filename", "date", "plant", "is_enriched"]) + "\n"
+        )
         for plant, date, filename, is_enriched in data:
             outf.write("\t".join([filename, date, plant, is_enriched]) + "\n")
 

diff --git a/bioprojects/PRJNA812772/metadata/prepare_metadata.py b/bioprojects/PRJNA812772/metadata/prepare_metadata.py
@@ -16,11 +16,17 @@
 with open("raw_metadata.tsv") as inf:
     with open("metadata.tsv", "w") as outf:
         for line in inf:
-            sample_accession, run_accession, sample_alias = line.strip().split("\t")
+            sample_accession, run_accession, sample_alias = line.strip().split(
+                "\t"
+            )
 
             _, strategy = sample_alias.split("_")
             if strategy == "sarscov2":
                 continue
 
-            collection_date = sample_accession_to_collection_date[sample_accession]
-            outf.write("%s\t%s\t%s\n" % (run_accession, strategy, collection_date))
+            collection_date = sample_accession_to_collection_date[
+                sample_accession
+            ]
+            outf.write(
+                "%s\t%s\t%s\n" % (run_accession, strategy, collection_date)
+            )
diff --git a/build_bowtie2_db.py b/build_bowtie2_db.py
@@ -116,7 +116,9 @@ def combine_genomes(combined_genomes_fname):
                 outf.writelines(inf.readlines())
 
 
-def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
+def mask_low_complexity_sequences(
+    combined_genomes_fname, masked_genomes_fname
+):
     if os.path.exists(masked_genomes_fname):
         return
     print("Masking low complexity sequences...")
@@ -149,7 +151,9 @@ def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
     #
     # This regexp replaces all lowercase letters that aren't on lines beginning
     # with '>', which in FASTA means everywhere except in the sequence IDs.
-    subprocess.check_call(["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname])
+    subprocess.check_call(
+        ["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname]
+    )
 
 
 def build_db(bowtie_db_prefix, genomes_fname):

diff --git a/count_clades.py b/count_clades.py
@@ -10,7 +10,9 @@
 parents = {}  # child_taxid -> parent_taxid
 with open("dashboard/nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         parents[child_taxid] = parent_taxid

diff --git a/dashboard/determine_comparison_species.py b/dashboard/determine_comparison_species.py
@@ -8,7 +8,9 @@
 children = defaultdict(list)  # parent_taxid -> [children]
 with open("nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         if child_taxid != parent_taxid:

diff --git a/dashboard/determine_key_clades.py b/dashboard/determine_key_clades.py
@@ -10,7 +10,9 @@
 children = defaultdict(set)  # parent_taxid -> [children]
 with open("nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         if child_taxid != parent_taxid:

diff --git a/dashboard/prepare-dashboard-data.py b/dashboard/prepare-dashboard-data.py
@@ -39,7 +39,9 @@
 parents = {}  # child_taxid -> parent_taxid
 with open("%s/nodes.dmp" % DASHBOARD_DIR) as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
         parents[child_taxid] = parent_taxid
@@ -51,7 +53,9 @@
 
 # project -> sample -> n_reads
 project_sample_reads = defaultdict(dict)
-for metadata_fname in glob.glob("%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR):
+for metadata_fname in glob.glob(
+    "%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR
+):
     project = metadata_fname.split("/")[-3]
     if project in ["PRJEB30546", "PRJNA691135"]:
         # didn't finish importing this one, and the dashboard chokes on papers
@@ -137,7 +141,9 @@
 # paper -> {link, samples, projects, na_type, subset}
 papers = {}
 for project in projects:
-    with open("%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)) as inf:
+    with open(
+        "%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)
+    ) as inf:
         paper_name = inf.read().strip()
         if paper_name not in papers:
             papers[paper_name] = {}
@@ -169,7 +175,8 @@
 
 def rc(s):
     return "".join(
-        {"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x] for x in reversed(s)
+        {"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x]
+        for x in reversed(s)
     )
 
 
@@ -289,7 +296,9 @@ def count_dups(hvr_fname):
 taxonomic_names = defaultdict(list)
 with open("%s/names.dmp" % DASHBOARD_DIR) as inf:
     for line in inf:
-        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
+        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         taxid = int(taxid)
 
         if taxid in mentioned_taxids or taxid in comparison_sample_counts:
@@ -305,19 +314,26 @@ def count_dups(hvr_fname):
 sample_metadata = defaultdict(dict)
 
 for project in projects:
-    with open("%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)) as inf:
+    with open(
+        "%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)
+    ) as inf:
         for line in inf:
             if not line.strip():
                 continue
             line = line[:-1]  # drop trailing newline
 
-            sample, sample_metadata_dict = sample_metadata_classifier.interpret(
+            (
+                sample,
+                sample_metadata_dict,
+            ) = sample_metadata_classifier.interpret(
                 project, papers, line.split("\t")
             )
             sample_metadata[sample] = sample_metadata_dict
 
     for sample in project_sample_reads[project]:
-        sample_metadata[sample]["reads"] = project_sample_reads[project][sample]
+        sample_metadata[sample]["reads"] = project_sample_reads[project][
+            sample
+        ]
 
         rf_fname = "ribofrac/%s.ribofrac.txt" % sample
         try:
@@ -349,7 +365,10 @@ def count_dups(hvr_fname):
 ]:
     with open(DASHBOARD_DIR + name + ".json", "w") as outf:
         json.dump(
-            val, outf, sort_keys=True, indent=None if val is human_virus_tree else 2
+            val,
+            outf,
+            sort_keys=True,
+            indent=None if val is human_virus_tree else 2,
         )
 
 # To make the dashboard load faster, divide counts by bioproject and don't load

diff --git a/dashboard/sample_metadata_classifier.py b/dashboard/sample_metadata_classifier.py
@@ -61,12 +61,18 @@ def interpret(project, papers, bits):
     elif project in papers["Bengtsson-Palme 2016"]["projects"]:
         sample, location, site = bits
         return sample, dict(
-            date="2012-09", country="Sweden", location=location, fine_location=site
+            date="2012-09",
+            country="Sweden",
+            location=location,
+            fine_location=site,
         )
     elif project in papers["Brinch 2020"]["projects"]:
         sample, loc, date = bits
         return sample, dict(
-            date=date, country="Denmark", location="Copenhagen", fine_location=loc
+            date=date,
+            country="Denmark",
+            location="Copenhagen",
+            fine_location=loc,
         )
     elif project in papers["Spurbeck 2023"]["projects"]:
         sample, loc, date = bits
@@ -158,7 +164,10 @@ def interpret(project, papers, bits):
     elif project in papers["Hendriksen 2019"]["projects"]:
         sample, date, cluster = bits
         return sample, dict(
-            country="Kenya", location="Kibera", fine_location=cluster, date=date
+            country="Kenya",
+            location="Kibera",
+            fine_location=cluster,
+            date=date,
         )
     elif project in papers["Yang 2020"]["projects"]:
         sample, city = bits
@@ -168,7 +177,10 @@ def interpret(project, papers, bits):
     elif project in papers["Wang 2022"]["projects"]:
         sample, date, hospital = bits
         return sample, dict(
-            country="Saudi Arabia", location="Jeddah", date=date, fine_location=hospital
+            country="Saudi Arabia",
+            location="Jeddah",
+            date=date,
+            fine_location=hospital,
         )
     elif project in papers["Cui 2023"]["projects"]:
         (sample,) = bits
@@ -259,7 +271,11 @@ def interpret(project, papers, bits):
         sample, _, enrichment, loc, city_state, date, flow = bits
         city, state = city_state.split(", ")
         record = dict(
-            country="United States", city=city, state="Texas", location=loc, date=date
+            country="United States",
+            city=city,
+            state="Texas",
+            location=loc,
+            date=date,
         )
         if enrichment == "1":
             record["enrichment"] = "panel"

diff --git a/expand-human-viruses.py b/expand-human-viruses.py
@@ -16,7 +16,9 @@
 children = {}
 with open("dashboard/nodes.dmp") as inf:
     for line in inf:
-        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
+        child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         child_taxid = int(child_taxid)
         parent_taxid = int(parent_taxid)
 
@@ -42,7 +44,9 @@ def add_children(taxid):
 taxonomic_names = {}
 with open("dashboard/names.dmp") as inf:
     for line in inf:
-        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
+        taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
+            "\t|\t"
+        )
         taxid = int(taxid)
 
         if taxid in hv:

diff --git a/papers/Brinch2020/prepare_metadata.py b/papers/Brinch2020/prepare_metadata.py
@@ -16,8 +16,12 @@
         sample_to_details[bits[7]] = bits[15], bits[17]
 
 for project in ["PRJEB34633", "PRJEB13832"]:
-    with open("../../bioprojects/%s/metadata/metadata_raw.tsv" % project) as inf:
-        with open("../../bioprojects/%s/metadata/metadata.tsv" % project, "w") as outf:
+    with open(
+        "../../bioprojects/%s/metadata/metadata_raw.tsv" % project
+    ) as inf:
+        with open(
+            "../../bioprojects/%s/metadata/metadata.tsv" % project, "w"
+        ) as outf:
             for line in inf:
                 sample = line.strip()
                 if sample not in sample_to_details:

diff --git a/papers/Munk2022/prepare-metadata.py b/papers/Munk2022/prepare-metadata.py
@@ -115,7 +115,9 @@ def clean_date(x):
 
 bioproject_dir = os.path.join(root, "bioprojects")
 for bioproject in os.listdir(bioproject_dir):
-    with open(os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")) as inf:
+    with open(
+        os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")
+    ) as inf:
         if inf.read().strip() != "Munk 2022":
             continue
 

diff --git a/pipeline-operation/screen-summary.py b/pipeline-operation/screen-summary.py
@@ -41,7 +41,9 @@ def start():
         with tempfile.TemporaryDirectory() as workdir:
             tmpfname = os.path.join(workdir, "tmp.txt")
 
-            subprocess.check_call(["screen", "-S", screen, "-X", "hardcopy", tmpfname])
+            subprocess.check_call(
+                ["screen", "-S", screen, "-X", "hardcopy", tmpfname]
+            )
 
             # wait for screen to dump like we asked
             while not os.path.exists(tmpfname):

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[tool.black]
+line-length = 79  # must stay equal to [tool.isort] line_length below
+
+[tool.isort]
+profile = "black"  # isort's black-compatible profile, so the two tools agree on import layout
+line_length = 79  # must stay equal to [tool.black] line-length above
diff --git a/reprocess-bioprojects.py b/reprocess-bioprojects.py
@@ -28,7 +28,9 @@
 restricted_bioprojects = []
 restricted_dir = os.path.join("..", "mgs-restricted")
 if os.path.exists(restricted_dir):
-    restricted_bioprojects = os.listdir(os.path.join(restricted_dir, "bioprojects"))
+    restricted_bioprojects = os.listdir(
+        os.path.join(restricted_dir, "bioprojects")
+    )
 
 
 def prepare_job(bioproject, log_prefix, run_args):
@@ -85,7 +87,8 @@ def start():
         help="Log prefix, for storing this run under log/",
     )
     parser.add_argument(
-        "--bioprojects", help="The IDs of the bioproject to process, comma separated"
+        "--bioprojects",
+        help="The IDs of the bioproject to process, comma separated",
     )
     args = parser.parse_args(our_args)
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1 @@
+black  # formatter configured under [tool.black] in pyproject.toml; NOTE(review): pyproject.toml also configures isort, which is not listed here - confirm whether check.sh needs it