Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add opf2eset nb #107

Merged
merged 2 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ dcicwrangling
Change Log
----------

2.4.0
=====

`PR:107 add useful notebook #15 to add opf collections to esets <https://github.com/4dn-dcic/dcicwrangling/pull/107>`_

* added a new useful notebook that allows you to use a lab submitted processed file sheet to link replicate sets to other processed files collections for that set

2.3.0
=====
Expand Down
127 changes: 127 additions & 0 deletions notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Uses information from a submitter PF spreadsheet to add files to the appropriate place in indicated linked items\n",
"### Currently for ExperimentSets opfs but should be extended to Experiments, Pubs and possibly pages\n",
"\n",
"#### Setup\n",
"\n",
"- Provide a title and description to use for the opf section.\n",
"- indicate path to the workbook"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dcicutils import ff_utils\n",
"from functions.notebook_functions import *\n",
"from functions.wfr import *\n",
"\n",
"# get key from keypairs.json\n",
"my_key = get_key('andyprod')\n",
"\n",
"# type, title and (optional) description used for the new opf collection\n",
"opf_type = 'supplementary'\n",
"opf_title = \"Analysis results provided by the data submitters - the Diao Lab\"\n",
"opf_desc = \"The results were generated by the Diao lab using the code available at https://github.com/jianhong/hicar/releases/tag/2.0.0rc\"\n",
"\n",
"# location of excel processed file sheet\n",
"xcel_file = '/Users/andrew/Documents/work/4DN_Metadata/Diao_Yarui_lab/HiCAR_RNAseq_myoblast_diff/230830_hicar_processed_results_ajs_upd.xlsx'\n",
"xcel, sheets = digest_xlsx(xcel_file)\n",
"xcel_data = reader(xcel)\n",
"\n",
"# the first row of the sheet holds the field names\n",
"fieldnames = next(xcel_data)\n",
"\n",
"# create a dict mapping each linked dataset to its list of processed file aliases\n",
"dset2opfs = {}\n",
"for row in xcel_data:\n",
"    # rows whose first cell starts with '#' are sheet comments, not data\n",
"    if row[0].startswith('#'):\n",
"        continue\n",
"    data = dict(zip(fieldnames, row))\n",
"    # drop empty cells so missing values read as None below\n",
"    data = {k: v for k, v in data.items() if v}\n",
"    file_alias = data.get('aliases')\n",
"    dataset = data.get('# linked_datasets')\n",
"    # a row without both values would add a None dataset key or a None file\n",
"    # to the mapping, which cannot be looked up or patched later - skip it\n",
"    if not file_alias or not dataset:\n",
"        print('WARNING: skipping row missing aliases or linked dataset - {}'.format(row))\n",
"        continue\n",
"    dset2opfs.setdefault(dataset, []).append(file_alias)\n",
"\n",
"print(dset2opfs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# go through the dictionary and create a patch for each dataset\n",
"# existing other_processed_files collections are checked and kept\n",
"\n",
"# if True do the action, if False just report what would be patched\n",
"action = True\n",
"\n",
"for ds, opfs in dset2opfs.items():\n",
"    # the new collection to add to this dataset\n",
"    opf_obj = {'type': opf_type, 'title': opf_title, 'files': opfs}\n",
"    if opf_desc:\n",
"        opf_obj['description'] = opf_desc\n",
"\n",
"    dset = ff_utils.get_metadata(ds, my_key, add_on='frame=raw')\n",
"    curr_opfs = dset.get('other_processed_files', [])\n",
"    if curr_opfs:\n",
"        curr_titles = [i.get('title') for i in curr_opfs]\n",
"        # do not add a second collection with the same title\n",
"        if opf_title in curr_titles:\n",
"            print('ERROR: {} has been used as a title already for {} - NO GO!'.format(opf_title, ds))\n",
"            continue\n",
"\n",
"    # in this specific case we want to insert the new one into\n",
"    # the first position of the list; built at loop level so that sets\n",
"    # with no existing collections still get a patch body\n",
"    new_opf_grps = [opf_obj] + curr_opfs\n",
"\n",
"    patch_data = {'other_processed_files': new_opf_grps}\n",
"    if action:\n",
"        res = ff_utils.patch_metadata(patch_data, dset['uuid'], key=my_key)\n",
"        print(res)\n",
"    else:\n",
"        # dry run: show the target item and the patch that would be sent\n",
"        print(\"\\n\")\n",
"        print(\"{}\\t{}\".format(dset.get('uuid'), dset.get('accession')))\n",
"        print(opf_obj)\n",
"        print(\"\\n\\n\")\n",
"        print(patch_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicwrangling"
version = "2.3.0"
version = "2.4.0"
description = "Scripts and Jupyter notebooks for 4DN wrangling"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
Loading