Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge Camerian's changes from A la carte into master #32

Open
wants to merge 31 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ee64cce
Debugging the webdriver. Chrome seems to have an issue, trying firefo…
colemathis Sep 30, 2022
41871d6
Chromedriver working!!
colemathis Sep 30, 2022
076d9d3
Updating docs to reflect Chromedriver success
colemathis Sep 30, 2022
556c3de
Updating JGI to not take so long, replacing explicit time.sleep with …
colemathis Oct 7, 2022
49552d6
Remove egg-info from repo tracking
colemathis Oct 7, 2022
7bae1d4
Update the find_elements_by_tag_name to work with newer versions of s…
colemathis Oct 8, 2022
2847bc3
General Debugging and work flow issues
colemathis Oct 12, 2022
c2d86ee
tacked on fxn at end, not yet integrated into Kegg class, for catalog…
camerianm Nov 8, 2022
a0fafc9
Merge branch 'driver_debug' of https://github.com/ELIFE-ASU/ecg into …
camerianm Nov 8, 2022
e1fa5d5
streamlined EC transfer steps and x-referencing of EC-RN ties in KEGG
camerianm Mar 24, 2023
94161e6
Deleted Untitled.ipynb
camerianm Mar 24, 2023
5e4aac5
removed excess content from enzyme-info scraping; removed metadata sc…
camerianm Apr 19, 2023
b2ae9c1
notebook demonstrating a la carte taxon-ID enzyme count download
camerianm Apr 19, 2023
2ddf85f
KO edit of jgi data pull
camerianm Jul 19, 2023
1a0e596
KO file URLs from KO count and gene counts in metadata CSV file downl…
camerianm Jul 19, 2023
6fd74fd
scrape_urls_unsafe_alacarte added to enable use of JGI metadata bulk …
camerianm Jul 27, 2023
41b61b4
notebook demonstrating a la carte download of EC/KO data when presenc…
camerianm Jul 27, 2023
e2a653e
download progress indicator
camerianm Nov 14, 2023
87cc059
removed excess content
camerianm Dec 14, 2023
022cfe9
removed excess file
camerianm Dec 14, 2023
7d2bb96
working on JGI ECG docs
colemathis Dec 14, 2023
7b53955
Merge branch 'alacarte' of https://github.com/ELIFE-ASU/ecg into alac…
colemathis Dec 14, 2023
bcfdeed
added IMG/M metadata for Trembath-Reichert et al 2019
camerianm Dec 14, 2023
c3ee6c3
typos in readme
colemathis Dec 14, 2023
c022731
screenshots of IMG/M process
camerianm Dec 14, 2023
53f54e6
Merge branch 'alacarte' of https://github.com/ELIFE-ASU/ecg into alac…
camerianm Dec 14, 2023
73447d3
add images to readme
colemathis Dec 14, 2023
cb5cf4b
filters metadata file to identify eligible samples
camerianm Dec 14, 2023
b06de34
Merge branch 'alacarte' of https://github.com/ELIFE-ASU/ecg into alac…
camerianm Dec 14, 2023
dc21449
expected output from running FormForDownloads.ipynb on example folder
camerianm Dec 14, 2023
867fe2c
modified FormForDownloads to specify metatranscriptomes or metagenomes
camerianm Dec 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions FormForDownloads.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "4d3dc956",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from ecg import jgi_ko_edit as jgi\n",
"\n",
"#######################################################\n",
"\n",
"# Folder that holds the inputs and outputs; it must contain the metadata file.\n",
"folder_name = 'example'\n",
"# Name of the metadata file inside that folder.\n",
"metadata_file = 'imgm_metadata.txt'\n",
"# Data to download: 'ecs' (uses the Enzyme Count column) or 'kos' (uses the KO Count column).\n",
"data_needed ='kos'\n",
"# How often to write the output CSV, at minimum (writing files takes time).\n",
"output_frequency = 50\n",
"# Subset of samples to keep, written as a pd.DataFrame.query expression (see its docs for format rules).\n",
"query = 'Ecosystem==\"Environmental\"'\n",
"#analysis type options: 'Metatranscriptome' or 'Metagenome'\n",
"analysis_type = 'Metatranscriptome'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5c6ebd0f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6 samples, ETA: 0.02 hours\n"
]
}
],
"source": [
"def available_samples(metadata_file:str, folder_name:str, data_needed:str):\n",
" data = {'ecs': '`Enzyme Count`', 'kos': '`KO Count`'}\n",
" path = folder_name+'/'+metadata_file\n",
" if not os.path.exists(folder_name+'/'+metadata_file):\n",
" raise ValueError('Please ensure folder exists & contains JGI metadata file.')\n",
" meta_df = pd.read_csv(path, sep='\\t', header=0, index_col=0)\n",
" meta_df = meta_df.drop(columns=[i for i in meta_df.columns if 'Unnamed' in i])# or meta_df[i].nunique()<=1])\n",
" meta_df.columns = [i.split('*')[0].strip() for i in meta_df.columns]\n",
" try:\n",
" meta_df['Add Date'] = pd.to_datetime(meta_df['Add Date'])\n",
" except:\n",
" pass\n",
" if data.get(data_needed, ' ').strip('`') in meta_df.columns:\n",
" meta_df = meta_df.query(data[data_needed]+'>0 & `Gene Count`>0').dropna(axis=1, how='all')\n",
" else:\n",
" raise ValueError(data.get(data_needed, 'Attribute count')+' not in columns. Please re-download. \\nCount columns present: '+\n",
" ', '.join(meta_df.columns[meta_df.columns.str.contains('Count')])+\n",
" '\\nMTs:\\thttps://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonList&page=taxonListAlpha2&domain=Metatranscriptome'+\n",
" '\\nMGs:\\thttps://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonList&page=taxonListAlpha2&domain=*Microbiome')\n",
" return(meta_df)\n",
"\n",
"def write_data_urls(meta_df, data_needed, folder_name):\n",
" oids = meta_df['IMG Genome ID']\n",
" data = {'kos':'ko', 'ecs':'enzymes'}\n",
" cols = {'kos': 'KO Count', 'ecs': 'Enzyme Count'}\n",
" prefix = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=MetaDetail&taxon_oid='\n",
" page, gcount, fcount = '&page=', '&data_type=assembled&total_genome_gene_count=', '&total_gene_count='\n",
" urls = (prefix + meta_df['IMG Genome ID'].astype(str) + page + data[data_needed] + gcount + \n",
" meta_df['Gene Count'].astype(str) + fcount + meta_df[cols[data_needed]].astype(str))\n",
" urls.to_csv(folder_name+'/'+data_needed+'_urls.csv', encoding='utf-8')\n",
" return(urls)\n",
"\n",
"metadata = available_samples(metadata_file, folder_name, data_needed=data_needed)\n",
"my_metadata = metadata[metadata['GOLD Analysis Project Type'].str.contains(analysis_type)]\n",
"my_metadata = my_metadata[my_metadata['Domain']=='*Microbiome']\n",
"my_metadata = my_metadata.query(query).dropna(axis=1, how='all')\n",
"print(len(my_metadata), 'samples, ETA:', np.round((12*len(my_metadata)/60)/60,2), 'hours')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ad3a0ee5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done! 6\n"
]
}
],
"source": [
"J = jgi.Jgi()\n",
"data_urls = write_data_urls(my_metadata, data_needed, folder_name).to_dict()\n",
"counts, status = J._scrape_urls_unsafe_alacarte(path=folder_name, domain='*Microbiome',\n",
" data_urls=data_urls, data_needed=data_needed, output_frequency = output_frequency)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading