Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added check.sh to the repository. Checks if all Python scripts are formatted with black. #51

Merged
Merged 5 commits on Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/presubmit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Presubmit workflow: runs the repository's check script on every push.
name: Presubmit
on: [push]
jobs:
  Presubmit:
    runs-on: ubuntu-latest
    steps:
      # Pin the interpreter so the check runs under a known Python version.
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - uses: actions/checkout@v3
      # Install the development tooling listed in requirements-dev.txt.
      - run: python3 -m pip install -r requirements-dev.txt
      # check.sh checks that all Python scripts are formatted with black.
      - run: ./check.sh
12 changes: 8 additions & 4 deletions bioprojects/PRJEB13833/metadata/prepare-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
if line.startswith("run"):
continue

run_accession, sample_accession, sample_alias, sample_title = line.split(
"\t"
)
(
run_accession,
sample_accession,
sample_alias,
sample_title,
) = line.split("\t")

_, yyyy, mm, dd, site = sample_title.split("_")

outf.write(
"%s\t%s-%s-%s\tCluster %s\n" % (run_accession, yyyy, mm, dd, site)
"%s\t%s-%s-%s\tCluster %s\n"
% (run_accession, yyyy, mm, dd, site)
)
4 changes: 3 additions & 1 deletion bioprojects/PRJEB49260/metadata/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,6 @@
dd = date[2:4]
yy = date[4:6]

outf.write("%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location))
outf.write(
"%s\t20%s-%s-%s\t%s\n" % (run_accession, yy, mm, dd, location)
)
4 changes: 3 additions & 1 deletion bioprojects/PRJNA729801/metadata/parse_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def start(raw_metadata_in, parsed_metadata_out):
data.sort()

with open(parsed_metadata_out, "w") as outf:
outf.write("\t".join(["filename", "date", "plant", "is_enriched"]) + "\n")
outf.write(
"\t".join(["filename", "date", "plant", "is_enriched"]) + "\n"
)
for plant, date, filename, is_enriched in data:
outf.write("\t".join([filename, date, plant, is_enriched]) + "\n")

Expand Down
12 changes: 9 additions & 3 deletions bioprojects/PRJNA812772/metadata/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@
with open("raw_metadata.tsv") as inf:
with open("metadata.tsv", "w") as outf:
for line in inf:
sample_accession, run_accession, sample_alias = line.strip().split("\t")
sample_accession, run_accession, sample_alias = line.strip().split(
"\t"
)

_, strategy = sample_alias.split("_")
if strategy == "sarscov2":
continue

collection_date = sample_accession_to_collection_date[sample_accession]
outf.write("%s\t%s\t%s\n" % (run_accession, strategy, collection_date))
collection_date = sample_accession_to_collection_date[
sample_accession
]
outf.write(
"%s\t%s\t%s\n" % (run_accession, strategy, collection_date)
)
8 changes: 6 additions & 2 deletions build_bowtie2_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,9 @@ def combine_genomes(combined_genomes_fname):
outf.writelines(inf.readlines())


def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
def mask_low_complexity_sequences(
combined_genomes_fname, masked_genomes_fname
):
if os.path.exists(masked_genomes_fname):
return
print("Masking low complexity sequences...")
Expand Down Expand Up @@ -149,7 +151,9 @@ def mask_low_complexity_sequences(combined_genomes_fname, masked_genomes_fname):
#
# This regexp replaces all lowercase letters that aren't on lines beginning
# with '>', which in FASTA means everywhere except in the sequence IDs.
subprocess.check_call(["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname])
subprocess.check_call(
["sed", "/^>/!s/[a-z]/x/g", "-i", masked_genomes_fname]
)


def build_db(bowtie_db_prefix, genomes_fname):
Expand Down
4 changes: 3 additions & 1 deletion count_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
parents = {} # child_taxid -> parent_taxid
with open("dashboard/nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
Expand Down
4 changes: 3 additions & 1 deletion dashboard/determine_comparison_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
children = defaultdict(list) # parent_taxid -> [children]
with open("nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
if child_taxid != parent_taxid:
Expand Down
4 changes: 3 additions & 1 deletion dashboard/determine_key_clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
children = defaultdict(set) # parent_taxid -> [children]
with open("nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
if child_taxid != parent_taxid:
Expand Down
37 changes: 28 additions & 9 deletions dashboard/prepare-dashboard-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
parents = {} # child_taxid -> parent_taxid
with open("%s/nodes.dmp" % DASHBOARD_DIR) as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)
parents[child_taxid] = parent_taxid
Expand All @@ -51,7 +53,9 @@

# project -> sample -> n_reads
project_sample_reads = defaultdict(dict)
for metadata_fname in glob.glob("%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR):
for metadata_fname in glob.glob(
"%s/bioprojects/*/metadata/metadata.tsv" % ROOT_DIR
):
project = metadata_fname.split("/")[-3]
if project in ["PRJEB30546", "PRJNA691135"]:
# didn't finish importing this one, and the dashboard chokes on papers
Expand Down Expand Up @@ -137,7 +141,9 @@
# paper -> {link, samples, projects, na_type, subset}
papers = {}
for project in projects:
with open("%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)) as inf:
with open(
"%s/bioprojects/%s/metadata/name.txt" % (ROOT_DIR, project)
) as inf:
paper_name = inf.read().strip()
if paper_name not in papers:
papers[paper_name] = {}
Expand Down Expand Up @@ -169,7 +175,8 @@

def rc(s):
return "".join(
{"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x] for x in reversed(s)
{"T": "A", "G": "C", "A": "T", "C": "G", "N": "N"}[x]
for x in reversed(s)
)


Expand Down Expand Up @@ -289,7 +296,9 @@ def count_dups(hvr_fname):
taxonomic_names = defaultdict(list)
with open("%s/names.dmp" % DASHBOARD_DIR) as inf:
for line in inf:
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
"\t|\t"
)
taxid = int(taxid)

if taxid in mentioned_taxids or taxid in comparison_sample_counts:
Expand All @@ -305,19 +314,26 @@ def count_dups(hvr_fname):
sample_metadata = defaultdict(dict)

for project in projects:
with open("%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)) as inf:
with open(
"%s/bioprojects/%s/metadata/metadata.tsv" % (ROOT_DIR, project)
) as inf:
for line in inf:
if not line.strip():
continue
line = line[:-1] # drop trailing newline

sample, sample_metadata_dict = sample_metadata_classifier.interpret(
(
sample,
sample_metadata_dict,
) = sample_metadata_classifier.interpret(
project, papers, line.split("\t")
)
sample_metadata[sample] = sample_metadata_dict

for sample in project_sample_reads[project]:
sample_metadata[sample]["reads"] = project_sample_reads[project][sample]
sample_metadata[sample]["reads"] = project_sample_reads[project][
sample
]

rf_fname = "ribofrac/%s.ribofrac.txt" % sample
try:
Expand Down Expand Up @@ -349,7 +365,10 @@ def count_dups(hvr_fname):
]:
with open(DASHBOARD_DIR + name + ".json", "w") as outf:
json.dump(
val, outf, sort_keys=True, indent=None if val is human_virus_tree else 2
val,
outf,
sort_keys=True,
indent=None if val is human_virus_tree else 2,
)

# To make the dashboard load faster, divide counts by bioproject and don't load
Expand Down
26 changes: 21 additions & 5 deletions dashboard/sample_metadata_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,18 @@ def interpret(project, papers, bits):
elif project in papers["Bengtsson-Palme 2016"]["projects"]:
sample, location, site = bits
return sample, dict(
date="2012-09", country="Sweden", location=location, fine_location=site
date="2012-09",
country="Sweden",
location=location,
fine_location=site,
)
elif project in papers["Brinch 2020"]["projects"]:
sample, loc, date = bits
return sample, dict(
date=date, country="Denmark", location="Copenhagen", fine_location=loc
date=date,
country="Denmark",
location="Copenhagen",
fine_location=loc,
)
elif project in papers["Spurbeck 2023"]["projects"]:
sample, loc, date = bits
Expand Down Expand Up @@ -158,7 +164,10 @@ def interpret(project, papers, bits):
elif project in papers["Hendriksen 2019"]["projects"]:
sample, date, cluster = bits
return sample, dict(
country="Kenya", location="Kibera", fine_location=cluster, date=date
country="Kenya",
location="Kibera",
fine_location=cluster,
date=date,
)
elif project in papers["Yang 2020"]["projects"]:
sample, city = bits
Expand All @@ -168,7 +177,10 @@ def interpret(project, papers, bits):
elif project in papers["Wang 2022"]["projects"]:
sample, date, hospital = bits
return sample, dict(
country="Saudi Arabia", location="Jeddah", date=date, fine_location=hospital
country="Saudi Arabia",
location="Jeddah",
date=date,
fine_location=hospital,
)
elif project in papers["Cui 2023"]["projects"]:
(sample,) = bits
Expand Down Expand Up @@ -259,7 +271,11 @@ def interpret(project, papers, bits):
sample, _, enrichment, loc, city_state, date, flow = bits
city, state = city_state.split(", ")
record = dict(
country="United States", city=city, state="Texas", location=loc, date=date
country="United States",
city=city,
state="Texas",
location=loc,
date=date,
)
if enrichment == "1":
record["enrichment"] = "panel"
Expand Down
8 changes: 6 additions & 2 deletions expand-human-viruses.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
children = {}
with open("dashboard/nodes.dmp") as inf:
for line in inf:
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split("\t|\t")
child_taxid, parent_taxid, rank, *_ = line.replace("\t|\n", "").split(
"\t|\t"
)
child_taxid = int(child_taxid)
parent_taxid = int(parent_taxid)

Expand All @@ -42,7 +44,9 @@ def add_children(taxid):
taxonomic_names = {}
with open("dashboard/names.dmp") as inf:
for line in inf:
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split("\t|\t")
taxid, name, unique_name, name_class = line.replace("\t|\n", "").split(
"\t|\t"
)
taxid = int(taxid)

if taxid in hv:
Expand Down
8 changes: 6 additions & 2 deletions papers/Brinch2020/prepare_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
sample_to_details[bits[7]] = bits[15], bits[17]

for project in ["PRJEB34633", "PRJEB13832"]:
with open("../../bioprojects/%s/metadata/metadata_raw.tsv" % project) as inf:
with open("../../bioprojects/%s/metadata/metadata.tsv" % project, "w") as outf:
with open(
"../../bioprojects/%s/metadata/metadata_raw.tsv" % project
) as inf:
with open(
"../../bioprojects/%s/metadata/metadata.tsv" % project, "w"
) as outf:
for line in inf:
sample = line.strip()
if sample not in sample_to_details:
Expand Down
4 changes: 3 additions & 1 deletion papers/Munk2022/prepare-metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def clean_date(x):

bioproject_dir = os.path.join(root, "bioprojects")
for bioproject in os.listdir(bioproject_dir):
with open(os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")) as inf:
with open(
os.path.join(bioproject_dir, bioproject, "metadata", "name.txt")
) as inf:
if inf.read().strip() != "Munk 2022":
continue

Expand Down
4 changes: 3 additions & 1 deletion pipeline-operation/screen-summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def start():
with tempfile.TemporaryDirectory() as workdir:
tmpfname = os.path.join(workdir, "tmp.txt")

subprocess.check_call(["screen", "-S", screen, "-X", "hardcopy", tmpfname])
subprocess.check_call(
["screen", "-S", screen, "-X", "hardcopy", tmpfname]
)

# wait for screen to dump like we asked
while not os.path.exists(tmpfname):
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# black: wrap Python code at 79 columns.
[tool.black]
line-length = 79

# isort: use the black-compatible profile with the same 79-column limit.
# Note the key spelling differs from black's (underscore here, hyphen above).
[tool.isort]
profile = "black"
line_length = 79
7 changes: 5 additions & 2 deletions reprocess-bioprojects.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
restricted_bioprojects = []
restricted_dir = os.path.join("..", "mgs-restricted")
if os.path.exists(restricted_dir):
restricted_bioprojects = os.listdir(os.path.join(restricted_dir, "bioprojects"))
restricted_bioprojects = os.listdir(
os.path.join(restricted_dir, "bioprojects")
)


def prepare_job(bioproject, log_prefix, run_args):
Expand Down Expand Up @@ -85,7 +87,8 @@ def start():
help="Log prefix, for storing this run under log/",
)
parser.add_argument(
"--bioprojects", help="The IDs of the bioproject to process, comma separated"
"--bioprojects",
help="The IDs of the bioproject to process, comma separated",
)
args = parser.parse_args(our_args)

Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Development-only tooling, installed by the Presubmit workflow before ./check.sh runs.
black
Loading