diff --git a/docs/conf.py b/docs/conf.py index 517695f4..588e2876 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,11 +30,14 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', +extensions = [ + 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.autosummary', - 'sphinx.ext.napoleon'] + 'sphinx.ext.napoleon', + 'nbsphinx' + ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -176,3 +179,5 @@ # generate stub pages where directed with autosummary #autosummary_generate = True + +nbsphinx_allow_errors = True diff --git a/docs/examples.rst b/docs/examples.rst index ce68152a..2e99c894 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -1,3 +1,9 @@ +.. _examples: + Examples ======== + +.. toctree:: + :maxdepth: 1 + diff --git a/docs/index.rst b/docs/index.rst index da8ef91e..5a9939da 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -52,11 +52,20 @@ Contributions are very welcome. If you have bug reports or feature requests or q .. _`Issue Tracker`: https://github.com/alchemistry/alchemlyb/issues .. _`alchemistry/alchemlyb`: https://github.com/alchemistry/alchemlyb + .. toctree:: :maxdepth: 1 - :caption: User Documentation + :caption: Getting Started install + tutorials + examples + +.. toctree:: + :maxdepth: 1 + :caption: Package Documentation + + overview parsing preprocessing estimators diff --git a/docs/overview.rst b/docs/overview.rst new file mode 100644 index 00000000..c56bb398 --- /dev/null +++ b/docs/overview.rst @@ -0,0 +1,6 @@ +.. _overview: + +Overview of the alchemlyb library +================================= + +Something diff --git a/docs/tutorials.rst b/docs/tutorials.rst new file mode 100644 index 00000000..cb4a684d --- /dev/null +++ b/docs/tutorials.rst @@ -0,0 +1,17 @@ +Tutorials +========= + +Welcome to alchemlyb! +If you are new to the library, this content is designed to get you up and running with the library as quickly as possible. + +The tutorials included here are intended to be run through in the order given below. +They go to great lengths to explain what is being done, but may gloss over some technical details in order to prioritize pedagogical value. + +After learning the usage patterns in alchemlyb, you will likely find the :ref:`examples` useful for learning about more detailed use cases. +If you desire more technical detail on the underlying components of the library, we advise you visit the :ref:`overview`. + +.. toctree:: + :maxdepth: 1 + + tutorials/10-minutes-to-alchemlyb.ipynb + tutorials/Pandas-Fu.ipynb diff --git a/docs/tutorials/Pandas-Fu.ipynb b/docs/tutorials/Pandas-Fu.ipynb new file mode 100644 index 00000000..33318fcc --- /dev/null +++ b/docs/tutorials/Pandas-Fu.ipynb @@ -0,0 +1,816 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Pandas basics for using `alchemlyb`\n", + "\n", + "This how-to assumes only working knowlege of Python.\n", + "You will learn how to effectively use the [pandas](https://pandas.pydata.org/) Python library, on which [alchemlyb](https://alchemlyb.readthedocs.io/en/latest/) depends.\n", + "This knowledge will serve you well in using `alchemlyb`, but also generalizes well beyond it.\n", + "\n", + "You will learn how to do the following:\n", + "1. Parsing a dataset into a `pandas.DataFrame`.\n", + "2. 
Subselecting components of a DataFrame.\n", + "3. Obtaining descriptive statistics.\n", + "4. Modifying a DataFrame." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Parsing a dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll begin by choosing an existing dataset in [alchemtest](https://alchemtest.readthedocs.io/en/latest/).\n", + "`alchemtest` features sample datasets from a variety of software packages, and is used as the test set for `alchemlyb`.\n", + "You will need to [install alchemtest](https://alchemtest.readthedocs.io/en/latest/install.html) if it is not already present in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from alchemtest.gmx import load_expanded_ensemble_case_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = load_expanded_ensemble_case_2()\n", + "print(dataset.DESCR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`alchemtest` datasets have a `DESCR` attribute that gives metadata details about the dataset. This is important for interpreting the data files themselves correctly. We see from this description that this dataset features 2 \"legs\" for its alchemical pathway, with 20 windows for the Coulomb change and 12 windows for the VDW change.\n", + "\n", + "The `data` attribute gives us paths to the dataset files as they are installed on our machine; if you are running this notebook on your own machines, these paths will likely differ:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(dataset.data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will load only one of these files to get started. To do this, we will need to use a parser function from `alchemlyb`. Since this is a dataset generated with [Gromacs](http://www.gromacs.org/), we will use a parser specific to Gromacs file outputs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "datafile = dataset.data['AllStates'][0]\n", + "datafile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from alchemlyb.parsing import gmx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl = gmx.extract_dHdl(datafile, T=300)\n", + "dHdl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "type(dHdl)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The `extract_dHdl` parser gives us a `pandas.DataFrame`, in `alchemlyb` [standard form for a dHdl](https://alchemlyb.readthedocs.io/en/latest/parsing.html#dhdl-standard-form) dataset. We will use this DataFrame for the rest of the lesson.\n", + "\n", + "`pandas` is a library that provides special data structures for doing fast numerical operations on tabular data. Internally, ``pandas`` uses [``numpy``](http://www.numpy.org/) to do the heavy lifting. 
We will see in this lesson how `pandas` affords us both the flexibility and structure needed to effectively deal with alchemical data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The anatomy of a DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We saw earlier that our DataFrame looks essentially like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It features 25001 rows, and 4 columns ('fep', 'coul', 'vdw', 'restraint').\n", + "\n", + "Each row is labeled with an *index* value, which for this dataset has 5 components ('time', 'fep-lambda', 'coul-lambda', 'vdw-lambda', 'restraint-lambda'), giving the state of the system at which the row was sampled.\n", + "Each column then gives the $\\frac{\\partial H}{\\partial \\lambda}$ corresponding to each of the lambda values being varied." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with columns and rows" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "A ``DataFrame`` allows us to get at individual components of our tabular data. We can get single columns like:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl['coul']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Or multiple columns with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[['coul', 'restraint']]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Slicing can be used to get back subsets of rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Python indices are 0-based, meaning counting goes as 0, 1, 2, 3...; this means that the first row is row 0, the second row is row 1, etc. It's best to refer to row 0 as the \"zeroth row\" to avoid confusion.\n", + "\n", + "This slice should be read as \"get the 0th element up to and not including the 5th element\". The \"not including\" is important, and the cause of much initial frustration. It does take some getting used to." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "What if we want a single row?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "For a DataFrame, this is ambiguous, since a single value is interpreted as a column name. 
We can only get at rows by slicing at the top level:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[1:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Or we could be more explicit and use `.iloc`, which allows one to select rows (and columns) using 0-based indexes as one would do with e.g. `numpy` arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "# select row 1, assuming a 0-based index\n", + "dHdl.iloc[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Getting a single row in this way returns a `pandas.Series`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "type(dHdl.iloc[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "A `Series` is a 1-D column of values, having all the same datatype. Since each of the datatypes of our columns were floating-point numbers, we got a `Series` with dtype `float64` this time. If we had columns with, e.g. strings, then we'd get back dtype `object`, which is a catchall for ``pandas``." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can also get the data in our ``Series`` as a raw ``numpy`` array:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "type(dHdl.iloc[1].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Pandas recently made its 1.0 release, making it a stable base on which to build functionality such as that found in `alchemlyb`. What's more, it's built on top of the venerable ``numpy`` array, which makes it possible to do fast numerical work in Python. A `Series` is basically a 1-D ``numpy`` array with the ability to select by labeled indices." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also use `iloc` to select out individual elements of the `DataFrame` by specifying a column index:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl.iloc[1, 2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that this gets us the `'vdw'` value from the Series we saw above." 
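+    ,
+    "\n",
+    "\n",
+    "For completeness, `.loc` is the label-based counterpart to `.iloc`: it selects rows by index labels and columns by name instead of by integer position. A minimal sketch (not run as part of this tutorial), reusing the same `dHdl` DataFrame:\n",
+    "\n",
+    "```python\n",
+    "# select every row of the 'coul' column by label\n",
+    "coul = dHdl.loc[:, 'coul']\n",
+    "\n",
+    "# .loc also accepts boolean masks, previewing the subsetting section below\n",
+    "subset = dHdl.loc[dHdl['coul'] > 50, ['coul', 'vdw']]\n",
+    "```"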
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Earlier we saw that we can select single columns from a `DataFrame`; doing this also returns a `Series` object:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl['coul']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The columns of a DataFrame are effectively a set of `Series` objects bound together with a common index. Each column can have a different `dtype`, but all elements within a column must be of a single `dtype`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl['coul'].dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dHdl.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case, all our columns consist of floats, so we don't have to think too much about `dtype`s here. This is also true in general when working with `DataFrame`s produced by `alchemlyb` parsers." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Subsetting data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Beyond slicing on an index, we can use boolean indexing to subselect our data. Say we want only data for which the `coul` column is greater than 50?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[dHdl['coul'] > 50]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "There's no magic here; we get a boolean index directly from a comparison:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "gt_50 = dHdl['coul'] > 50\n", + "gt_50" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "And using this `Series` of bools will then give only the rows for which the `Series` had `True`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl[gt_50]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is the same behavior as ``numpy`` for arrays: most binary operators, such as ``+``, ``*``, ``>``, ``&``, work element-wise. With a single value on one side (such as ``50``), we get the result of the operation for each element." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "A ``DataFrame`` is an *object*, and objects have **methods**. These are functions that are *part of* the object itself, often doing operations on the object's data. 
One of these is ``DataFrame.mean``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We get back the mean value of each column as a single ``Series``. There's more like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl.max()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "There's also ``DataFrame.describe``, which gives common descriptive statistics of the whole `DataFrame`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This is, itself, a ``DataFrame``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl.describe()['coul']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Documentation is a key part of Python's design. In the notebook, you can get a quick look at the docs for a given Python function or method with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dHdl.mean?" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Or more generally (built-in Python behavior):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "help(dHdl.mean)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "--------------\n", + "### Challenge: obtain the standard deviation of `coul` $\\frac{\\partial H}{\\partial \\lambda}$ for samples in which `vdw` $\\frac{\\partial H}{\\partial \\lambda}$ is greater than 20" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "One way we could do this is to first grab the ``\"coul\"`` column, then use a fancy index obtained from comparing the ``\"vdw\"`` column to ``20``. 
We can then call the ``std`` method of the resulting ``Series``:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "jupyter": { + "outputs_hidden": false + } + }, + "outputs": [], + "source": [ + "dHdl['coul'][dHdl['vdw'] > 20].std()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note that this is a key part of the power of ``pandas`` objects: operations for subsetting and calculating descriptive statistics can often be stacked to great effect.\n", + "\n", + "-----------------------" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}