Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh: transfer notebooks from mriqc and implement api client #6

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 333 additions & 0 deletions docs/notebooks/MRIQC Web API.ipynb

Large diffs are not rendered by default.

499 changes: 499 additions & 0 deletions docs/notebooks/MRIQC-workbook.ipynb

Large diffs are not rendered by default.

331 changes: 331 additions & 0 deletions docs/notebooks/Paper-v1.0.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2\n",
"import matplotlib.pyplot as plt\n",
"import os.path as op\n",
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sn\n",
"\n",
"sn.set(style=\"whitegrid\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from mriqc.classifier import data as mcd\n",
"\n",
"# Print one LaTeX table row per acquisition site: the median image matrix size and\n",
"# voxel spacing, each followed by the largest absolute deviation from that median.\n",
"# NOTE(review): x_path and y_path are not defined in the cells shown here --\n",
"# they must be assigned before this cell is run.\n",
"abide, _ = mcd.read_dataset(x_path, y_path, rate_label='rater_1')\n",
"sites = sorted(set(abide.site.values.ravel()))\n",
"\n",
"geometry_cols = ['size_x', 'size_y', 'size_z', 'spacing_x', 'spacing_y', 'spacing_z']\n",
"\n",
"# Every axis is paired with its own range entry (x -> 0, y -> 1, z -> 2).\n",
"fmt = r'{site} & \\pixmat{{{size[0]:d}$\\pm${sr[0]:d}}}{{{size[1]:d}$\\pm${sr[1]:d}}}{{{size[2]:d}$\\pm${sr[2]:d}}}'\n",
"fmt += r'& \\pixmat[mm]{{{sp[0]:.2f}$\\pm${spr[0]:.2f}}}{{{sp[1]:.2f}$\\pm${spr[1]:.2f}}}{{{sp[2]:.2f}$\\pm${spr[2]:.2f}}}'\n",
"\n",
"\n",
"for site in sites:\n",
"    # Exact match, so a site whose name is contained in another site's name\n",
"    # is not pooled with it.\n",
"    site_geometry = abide.loc[abide.site == site, geometry_cols].values\n",
"\n",
"    medians = np.median(site_geometry, axis=0)\n",
"\n",
"    # Largest absolute deviation from the median per column; this is the larger of\n",
"    # |median - min| and |median - max|.\n",
"    ranges = np.abs(site_geometry - medians).max(axis=0)\n",
"\n",
"    print(\n",
"        fmt.format(\n",
"            site=site,\n",
"            size=tuple(medians[:3].astype(int)),\n",
"            sr=tuple(ranges[:3].astype(int)),\n",
"            sp=tuple(medians[3:]),\n",
"            spr=tuple(ranges[3:]),\n",
"        )\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Folder holding the cross-validation result tables; figures are written back to it.\n",
"# Alternative location kept for reference: '/home/oesteban/Google Drive/mriqc'\n",
"data_path = '/home/oesteban/tmp/mriqc-ml-tests-2/'\n",
"out_path = data_path\n",
"\n",
"# Read the four result tables in one pass (inner tables first, then outer ones).\n",
"loso, kfold, kfold_outer, loso_outer = (\n",
"    pd.read_csv(op.join(data_path, fname), index_col=False)\n",
"    for fname in ('cv_loso_inner.csv', 'cv_kfold_inner.csv',\n",
"                  'cv_kfold_outer.csv', 'cv_loso_outer.csv')\n",
")\n",
"\n",
"def gen_newparams(dataframe):\n",
"    \"\"\"Return a copy of ``dataframe`` with a model tag prepended to ``params``.\n",
"\n",
"    The new ``params`` value is ``<clf>-zs <params>`` for rows whose ``zscored``\n",
"    equals 1 and ``<clf>-nzs <params>`` for every other row. The input frame is\n",
"    not modified.\n",
"    \"\"\"\n",
"    tagged = dataframe.copy()\n",
"    zs_label = tagged['zscored'].eq(1).map({True: 'zs', False: 'nzs'})\n",
"    tagged['params'] = tagged['clf'] + '-' + zs_label + ' ' + tagged['params']\n",
"    return tagged\n",
"\n",
"loso = gen_newparams(loso)\n",
"kfold = gen_newparams(kfold)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Distinct parameter strings per split scheme (not used further in this cell).\n",
"loso_models_list = list(set(loso.params.values.ravel().tolist()))\n",
"kfold_models_list = list(set(kfold.params.values.ravel().tolist()))\n",
"\n",
"best_param = {}\n",
"\n",
"spstr = ['LoSo', '10-fold']\n",
"model_tags = ['svc_linear-nzs', 'svc_rbf-nzs', 'rfc-nzs', 'svc_linear-zs', 'svc_rbf-zs', 'rfc-zs']\n",
"\n",
"# For each split scheme and model tag, keep the parameter string with the highest\n",
"# mean_auc. Ties are broken by the average mean_auc over all rows sharing the\n",
"# parameter string; if that is still tied, the first candidate is kept.\n",
"best_models = {}\n",
"for split_name, split_cv in zip(spstr, [loso, kfold]):\n",
"    best_models[split_name] = {}\n",
"    for clf in model_tags:\n",
"        # regex=False: the tags and parameter strings are literal text, not patterns.\n",
"        thismodeldf = split_cv.loc[split_cv.params.str.contains(clf, regex=False)]\n",
"        best = thismodeldf.loc[thismodeldf.mean_auc >= thismodeldf.mean_auc.max()]\n",
"        best_list = best.params.values.ravel().tolist()\n",
"\n",
"        if not best_list:\n",
"            raise ValueError(\n",
"                'No scored rows found for model %s in the %s table' % (clf, split_name))\n",
"\n",
"        chosen = best_list[0]\n",
"        if len(best_list) > 1:\n",
"            overall_means = [\n",
"                thismodeldf.loc[thismodeldf.params.str.contains(pset, regex=False), 'mean_auc'].mean()\n",
"                for pset in best_list]\n",
"            overall_max = np.max(overall_means)\n",
"            # Only switch candidates when exactly one of them reaches the maximum.\n",
"            if sum(val >= overall_max for val in overall_means) == 1:\n",
"                chosen = best_list[int(np.argmax(overall_means))]\n",
"\n",
"        best_models[split_name][clf] = chosen\n",
"\n",
" \n",
"# Long-format table of inner-loop AUC values for the selected parameter set of\n",
"# every model, one row per score, labelled by classifier and split scheme.\n",
"newdict = {'AUC': [], 'Classifier': [], 'Split scheme': []}\n",
"\n",
"modelnames = {'rfc-nzs': 'RFC-nzs', 'rfc-zs': 'RFC-zs',\n",
"              'svc_linear-nzs': 'SVC_lin-nzs', 'svc_linear-zs': 'SVC_lin-zs',\n",
"              'svc_rbf-nzs': 'SVC_rbf-nzs', 'svc_rbf-zs': 'SVC_rbf-zs'}\n",
"\n",
"# NOTE(review): the LoSo label says 16 folds here, while the outer-loop panels in\n",
"# the figure cell are labelled 'LoSo (17 folds)' -- confirm which count is right.\n",
"split_tables = [('LoSo', loso, 'LoSo (16 folds)'),\n",
"                ('10-fold', kfold, '10-fold')]\n",
"\n",
"for split_name, split_cv, split_label in split_tables:\n",
"    for key, val in list(best_models[split_name].items()):\n",
"        # regex=False: val is a literal parameter string, not a pattern.\n",
"        selected = split_cv.params.str.contains(val, regex=False)\n",
"        scores = split_cv.loc[selected, 'mean_auc'].values.ravel().tolist()\n",
"        nscores = len(scores)\n",
"\n",
"        newdict['AUC'] += scores\n",
"        newdict['Classifier'] += [modelnames[key]] * nscores\n",
"        newdict['Split scheme'] += [split_label] * nscores\n",
"\n",
"newdf = pd.DataFrame(newdict).sort_values(by=['Split scheme', 'Classifier'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def plot_cv_outer(data, score='auc', zscored=0, ax=None, ds030_score=None,\n",
"                  split_type='LoSo', color='dodgerblue'):\n",
"    \"\"\"Plot the distribution of a score and annotate its mean and spread.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    data : pandas.DataFrame\n",
"        Table with a ``zscored`` column and a column named by ``score``.\n",
"    score : str\n",
"        Name of the column to plot; rows where it is null are dropped.\n",
"    zscored : int\n",
"        Only rows whose ``zscored`` value equals this are used.\n",
"    ax : matplotlib axes, optional\n",
"        Axes to draw on; defaults to ``plt.gca()``.\n",
"    ds030_score : float, optional\n",
"        If given, an extra marker labelled 'DS030' is drawn at this score.\n",
"    split_type : str\n",
"        Legend label of the distribution.\n",
"    color : str\n",
"        Color of the distribution and of its annotations.\n",
"    \"\"\"\n",
"    if ax is None:\n",
"        ax = plt.gca()\n",
"\n",
"    outer_score = data.loc[data[score].notnull(), [score, 'zscored']]\n",
"    selected = outer_score.loc[outer_score.zscored == zscored, score]\n",
"\n",
"    # NOTE(review): distplot is deprecated in recent seaborn releases. It is kept\n",
"    # because draw_line reads the curve it adds to the axes (ax.lines[0]).\n",
"    sn.distplot(selected, hist=True, norm_hist=True, ax=ax, color=color, label=split_type)\n",
"    ax.set_xlim([0.4, 1.0])\n",
"    ax.grid(False)\n",
"    ax.set_yticklabels([])\n",
"\n",
"    mean = selected.mean()\n",
"    std = selected.std()\n",
"\n",
"    mean_coord = draw_line(mean, ax=ax, color=color, lw=2.0, marker='o', extend=True)\n",
"\n",
"    ymax = ax.get_ylim()[1]\n",
"    draw_line(mean - std, ax=ax, color=color, extend=True)\n",
"    draw_line(mean + std, ax=ax, color=color, extend=True)\n",
"\n",
"    # Raw strings keep the TeX backslashes from being read as escape sequences.\n",
"    ax.annotate(\n",
"        r'$\\mu$=%0.3f' % mean, xy=(mean_coord[0], 0.75*ymax), xytext=(-35, 30),\n",
"        textcoords='offset points', va='center', color='w', size=14,\n",
"        bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),\n",
"        arrowprops=dict(\n",
"            arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,\n",
"            fc=color, ec='none', relpos=(0.5, 0.5)))\n",
"    sigmay = 0.70*ymax\n",
"    # Double-headed arrow spanning mean - std to mean + std. The empty label is\n",
"    # passed positionally because its keyword name differs across matplotlib versions.\n",
"    ax.annotate('', xy=(mean - std, sigmay), xytext=(mean + std, sigmay),\n",
"                arrowprops=dict(arrowstyle='<->'))\n",
"    ax.annotate(\n",
"        r'$2\\sigma$=%0.3f' % (2 * std), xy=(mean_coord[0], sigmay), xytext=(-25, -12),\n",
"        textcoords='offset points', va='center', color='k', size=12,\n",
"        bbox=dict(boxstyle='round', fc='w', ec='none', color='none', alpha=.7, lw=0))\n",
"\n",
"    if ds030_score is not None:\n",
"        ds030_coord = draw_line(ds030_score, ax=ax, color='k', marker='o')\n",
"        ax.annotate(\n",
"            'DS030', xy=ds030_coord, xytext=(-100, 0),\n",
"            textcoords='offset points', va='center', color='w', size=16,\n",
"            bbox=dict(boxstyle='round', fc=color, ec='none', color='none', lw=0),\n",
"            arrowprops=dict(\n",
"                arrowstyle='wedge,tail_width=0.8', lw=0, patchA=None, patchB=None,\n",
"                fc=color, ec='none', relpos=(0.5, 0.5)))\n",
" \n",
" \n",
"def draw_line(score, ax=None, color='k', marker=None, lw=.7, extend=False):\n",
"    \"\"\"Draw a vertical line at ``score`` reaching the first curve plotted on ``ax``.\n",
"\n",
"    The curve height is interpolated from ``ax.lines[0]``, so a line must already\n",
"    exist on the axes. Scores above 1.0 are clipped to 1.0.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    score : float\n",
"        x position of the line.\n",
"    ax : matplotlib axes, optional\n",
"        Axes to draw on; defaults to ``plt.gca()``.\n",
"    color, marker, lw\n",
"        Style of the line drawn between the x-axis and the curve.\n",
"    extend : bool\n",
"        If true, also draw a thin gray line from the curve up to 75% of the\n",
"        axes height.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        ``[x, y]``: the (clipped) score and the interpolated curve height.\n",
"    \"\"\"\n",
"    if ax is None:\n",
"        ax = plt.gca()\n",
"\n",
"    score = min(score, 1.0)\n",
"\n",
"    curve_x, curve_y = ax.lines[0].get_data()\n",
"    # Interpolate with a scalar so the height is a plain float: axvline and the\n",
"    # returned coordinates then never mix a 1-element array with scalars.\n",
"    height = float(np.interp(score, curve_x, curve_y))\n",
"    coords = [score, height]\n",
"\n",
"    # axvline expects ymin/ymax as fractions of the axes height.\n",
"    # NOTE(review): dividing by the upper y-limit assumes the lower limit is 0.\n",
"    height_frac = height / ax.get_ylim()[1]\n",
"\n",
"    if extend:\n",
"        ax.axvline(score, ymin=height_frac, ymax=0.75, color='gray', lw=.7)\n",
"\n",
"    # markevery=2 places the marker on the first vertex only, i.e. on the curve.\n",
"    ax.axvline(score, ymin=height_frac, ymax=0, color=color, marker=marker, markevery=2,\n",
"               markeredgewidth=1.5, markerfacecolor='w', markeredgecolor=color, lw=lw)\n",
"\n",
"    return coords"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sn.set(style=\"whitegrid\")\n",
"\n",
"fig = plt.figure(figsize=(20, 8)) \n",
"ax1 = plt.subplot2grid((2,4), (0,0), colspan=2, rowspan=2)\n",
"\n",
"sn.violinplot(x='Classifier', y='AUC', hue='Split scheme', data=newdf, split=True,\n",
" palette=['dodgerblue', 'darkorange'], ax=ax1)\n",
"ax1.set_ylim([0.70, 1.0])\n",
"ax1.set_ylabel('AUC')\n",
"ax1.set_xlabel('Model')\n",
"ax1.set_title('Model selection - Inner loop of nested cross-validation')\n",
"\n",
"ax2 = plt.subplot2grid((2,4), (0, 2))\n",
"plot_cv_outer(kfold_outer, zscored=0, score='auc', ax=ax2, ds030_score=0.695, split_type='10-fold')\n",
"ax2.set_xlabel('')\n",
"ax2.legend()\n",
"ax2.set_title('Evaluation - Outer loop of nested cross-validation')\n",
"ax2.title.set_position([1.1, 1.0])\n",
"\n",
"ax3 = plt.subplot2grid((2,4), (1, 2))\n",
"plot_cv_outer(loso_outer, zscored=0, score='auc', ax=ax3, ds030_score=0.695, color='darkorange', split_type='LoSo (17 folds)')\n",
"ax3.legend()\n",
"ax3.set_xlabel('AUC')\n",
"\n",
"ax4 = plt.subplot2grid((2,4), (0, 3))\n",
"plot_cv_outer(kfold_outer, zscored=0, score='acc', ax=ax4, ds030_score=0.7283, split_type='10-fold')\n",
"ax4.set_xlabel('')\n",
"ax4.legend()\n",
"\n",
"ax5 = plt.subplot2grid((2,4), (1, 3))\n",
"plot_cv_outer(loso_outer, zscored=0, score='acc', ax=ax5, ds030_score=0.7283, color='darkorange', split_type='LoSo (17 folds)')\n",
"ax5.legend()\n",
"ax5.set_xlabel('Accuracy')\n",
"\n",
"\n",
"fig.savefig(op.join(out_path, 'crossvalidation.pdf'),\n",
" bbox_inches='tight', pad_inches=0, dpi=300)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Pairwise AUC/accuracy plots of the LoSo outer table, colored by site.\n",
"# Despite its name, zscoreddf holds the rows with zscored == 0.\n",
"zscoreddf = loso_outer.loc[loso_outer.zscored == 0, ['auc', 'acc', 'site']]\n",
"n_sites = len(set(zscoreddf.site))\n",
"palette = sn.color_palette(\"cubehelix\", n_sites)\n",
"# zscoreddf already holds exactly the three plotted columns, so only rows are filtered.\n",
"sn.pairplot(zscoreddf[zscoreddf.auc.notnull()], hue='site', palette=palette)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Overlaid accuracy histograms of the LoSo outer table (zscored == 0), one per site.\n",
"# .values.ravel() matches how the other cells flatten a column; calling ravel()\n",
"# directly on a Series is deprecated in pandas.\n",
"sites = sorted(set(loso_outer.site.values.ravel().tolist()))\n",
"palette = sn.color_palette(\"husl\", len(sites))\n",
"fig = plt.figure()\n",
"ax = fig.gca()\n",
"for i, site in enumerate(sites):\n",
"    site_rows = (loso_outer.site == site) & (loso_outer.zscored == 0)\n",
"    site_acc = loso_outer.loc[site_rows, 'acc']\n",
"    # NOTE(review): distplot is deprecated in recent seaborn releases.\n",
"    sn.distplot(site_acc.values.ravel(), bins=20, kde=False, label=site, color=palette[i], ax=ax)\n",
"\n",
"ax.legend()\n",
"ax.set_xlim([0.5, 1.0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading