4dn-dcic · aschroed · Sep 6, 2023 · Sep 6, 2023 · Sep 6, 2023
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,12 @@ dcicwrangling
 Change Log
 ----------
 
+2.4.0
+=====
+
+`PR:107 add useful notebook #15 to add opf collections to esets <https://github.com/4dn-dcic/dcicwrangling/pull/107>`_
+
+* added a new useful notebook that allows you to use a lab submitted processed file sheet to link replicate sets to other processed files collections for that set
 
 2.3.0
 =====

diff --git a/notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb b/notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Uses information from a submitter PF spreadsheet to add files to the appropriate place in indicated linked items\n",
+    "### Currently for ExperimentSets opfs but should be extended to Experiments, Pubs and possibly pages\n",
+    "\n",
+    "#### Setup\n",
+    "\n",
+    "- Provide a title and description to use for the opf section.\n",
+    "- indicate path to the workbook"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dcicutils import ff_utils\n",
+    "from functions.notebook_functions import *\n",
+    "from functions.wfr import *\n",
+    "\n",
+    "# get key from keypairs.json\n",
+    "my_key = get_key('andyprod')\n",
+    "\n",
+    "# set the type, title and description of the opf section here\n",
+    "opf_type = 'supplementary'\n",
+    "opf_title = \"Analysis results provided by the data submitters - the Diao Lab\"\n",
+    "opf_desc = \"The results were generated by the Diao lab using the code available at https://github.com/jianhong/hicar/releases/tag/2.0.0rc\"\n",
+    "\n",
+    "# location of excel processed file sheet\n",
+    "xcel_file = '/Users/andrew/Documents/work/4DN_Metadata/Diao_Yarui_lab/HiCAR_RNAseq_myoblast_diff/230830_hicar_processed_results_ajs_upd.xlsx'\n",
+    "xcel, sheets = digest_xlsx(xcel_file)\n",
+    "xcel_data = reader(xcel)\n",
+    "\n",
+    "# the first row of the sheet is used as the field names for every later row\n",
+    "fieldnames = next(xcel_data)\n",
+    "\n",
+    "# create a dict mapping each dataset to the aliases of its processed files\n",
+    "dset2opfs = {}\n",
+    "for row in xcel_data:\n",
+    "    # skip rows whose first cell starts with '#'\n",
+    "    # (presumably comment/instruction rows of the submission sheet - confirm)\n",
+    "    if row[0].startswith('#'):\n",
+    "        continue\n",
+    "    data = dict(zip(fieldnames, row))\n",
+    "    # drop empty cells so .get() returns None for anything not filled in\n",
+    "    data = {k: v for k, v in data.items() if v}\n",
+    "    file_alias = data.get('aliases')\n",
+    "    dataset = data.get('# linked_datasets')\n",
+    "    if not file_alias or not dataset:\n",
+    "        # without this guard the row would be filed under a None dataset and/or\n",
+    "        # add a None file, which only fails later when the set is fetched or patched\n",
+    "        print('WARNING: skipping row without alias or linked dataset: {}'.format(row))\n",
+    "        continue\n",
+    "    dset2opfs.setdefault(dataset, []).append(file_alias)\n",
+    "\n",
+    "print(dset2opfs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# go through the dictionary and create a patch for each dataset that puts the\n",
+    "# new opf group in front of the groups the dataset already has\n",
+    "\n",
+    "# if True do the action, if False just report\n",
+    "action = True\n",
+    "\n",
+    "for ds, opfs in dset2opfs.items():\n",
+    "    opf_obj = {'type': opf_type, 'title': opf_title, 'files': opfs}\n",
+    "    if opf_desc:\n",
+    "        opf_obj['description'] = opf_desc\n",
+    "\n",
+    "    dset = ff_utils.get_metadata(ds, my_key, add_on='frame=raw')\n",
+    "    curr_opfs = dset.get('other_processed_files', [])\n",
+    "    # do not add a second group with a title that is already in use on this set\n",
+    "    if any(grp.get('title') == opf_title for grp in curr_opfs):\n",
+    "        print('ERROR: {} has been used as a title already for {} - NO GO!'.format(opf_title, ds))\n",
+    "        continue\n",
+    "    # in this specific case we want to insert the new one into\n",
+    "    # the first position of the list\n",
+    "    new_opf_grps = [opf_obj] + curr_opfs\n",
+    "\n",
+    "    # the whole list is sent, so the existing groups are kept alongside the new one\n",
+    "    patch_data = {'other_processed_files': new_opf_grps}\n",
+    "    if action:\n",
+    "        res = ff_utils.patch_metadata(patch_data, dset['uuid'], key=my_key)\n",
+    "        print(res)\n",
+    "    else:\n",
+    "        # dry run: show which set would be patched and with what\n",
+    "        print(\"\\n\")\n",
+    "        print(\"{}\\t{}\".format(dset.get('uuid'), dset.get('accession')))\n",
+    "        print(opf_obj)\n",
+    "        print(\"\\n\\n\")\n",
+    "        print(patch_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicwrangling"
-version = "2.3.0"
+version = "2.4.0"
 description = "Scripts and Jupyter notebooks for 4DN wrangling"
 authors = ["4DN-DCIC Team <[email protected]>"]
 license = "MIT"