diff --git a/examples/1_feature_overview.ipynb b/examples/1_feature_overview.ipynb index 6d8791257..1ac039f0c 100644 --- a/examples/1_feature_overview.ipynb +++ b/examples/1_feature_overview.ipynb @@ -4,9 +4,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# PySINDy Package Feature Overview\n", + "# Feature Overview\n", "\n", - "This notebook provides a simple overview of the basic functionality of the PySINDy software package. In addition to demonstrating the basic usage for fitting a SINDy model, we demonstrate several means of customizing the SINDy fitting procedure. These include different forms of input data, different optimization methods, different differentiation methods, and custom feature libraries.\n", + "This notebook provides a simple overview of the basic functionality of PySINDy. In addition to demonstrating the basic usage for fitting a SINDy model, we demonstrate several means of customizing the SINDy fitting procedure. These include different forms of input data, different optimization methods, different differentiation methods, and custom feature libraries.\n", "\n", "An interactive version of this notebook is available on binder." 
] @@ -28,8 +28,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:50.823922Z", - "start_time": "2020-05-04T23:52:49.266665Z" + "end_time": "2020-07-14T20:56:41.027530Z", + "start_time": "2020-07-14T20:56:39.054503Z" } }, "outputs": [], @@ -48,8 +48,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:50.840779Z", - "start_time": "2020-05-04T23:52:50.829437Z" + "end_time": "2020-07-14T20:56:41.062713Z", + "start_time": "2020-07-14T20:56:41.049703Z" } }, "outputs": [], @@ -70,8 +70,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:50.900119Z", - "start_time": "2020-05-04T23:52:50.845423Z" + "end_time": "2020-07-14T20:56:41.125233Z", + "start_time": "2020-07-14T20:56:41.082481Z" } }, "outputs": [], @@ -96,8 +96,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:51.032707Z", - "start_time": "2020-05-04T23:52:50.904277Z" + "end_time": "2020-07-14T20:56:41.213788Z", + "start_time": "2020-07-14T20:56:41.129976Z" } }, "outputs": [], @@ -115,8 +115,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:51.209903Z", - "start_time": "2020-05-04T23:52:51.038409Z" + "end_time": "2020-07-14T20:56:41.363608Z", + "start_time": "2020-07-14T20:56:41.237452Z" } }, "outputs": [ @@ -149,8 +149,8 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:51.308167Z", - "start_time": "2020-05-04T23:52:51.216095Z" + "end_time": "2020-07-14T20:56:41.509540Z", + "start_time": "2020-07-14T20:56:41.369516Z" } }, "outputs": [ @@ -184,8 +184,8 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:53.906824Z", - "start_time": "2020-05-04T23:52:51.312243Z" + "end_time": "2020-07-14T20:56:44.426259Z", + "start_time": "2020-07-14T20:56:41.523337Z" } }, "outputs": [ @@ -232,8 +232,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": 
"2020-05-04T23:52:58.473379Z", - "start_time": "2020-05-04T23:52:53.914137Z" + "end_time": "2020-07-14T20:56:50.920517Z", + "start_time": "2020-07-14T20:56:44.432856Z" } }, "outputs": [ @@ -308,8 +308,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:58.540961Z", - "start_time": "2020-05-04T23:52:58.478919Z" + "end_time": "2020-07-14T20:56:51.102624Z", + "start_time": "2020-07-14T20:56:50.931502Z" } }, "outputs": [ @@ -333,7 +333,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Single trajectory, pass in pre-computed derivatives" + "### Single trajectory, set default timestep\n", + "Since we used a uniform timestep when defining `t_train` we can set a default timestep to be used whenever `t` isn't passed in." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:58.699544Z", - "start_time": "2020-05-04T23:52:58.546614Z" + "end_time": "2020-07-14T20:56:51.237724Z", + "start_time": "2020-07-14T20:56:51.109888Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x0' = -9.999 x0 + 9.999 x1\n", + "x1' = 27.992 x0 + -0.999 x1 + -1.000 x0 x2\n", + "x2' = -2.666 x2 + 1.000 x0 x1\n" + ] + } + ], + "source": [ + "model = ps.SINDy(t_default=dt)\n", + "model.fit(x_train)\n", + "model.print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single trajectory, pass in pre-computed derivatives" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T20:56:51.463725Z", + "start_time": "2020-07-14T20:56:51.244281Z" } }, "outputs": [ @@ -366,6 +400,25 @@ "model.print()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T20:56:51.474980Z", + "start_time": "2020-07-14T20:56:51.470074Z" + } + }, + 
"outputs": [], + "source": [ + "model = ps.SINDy()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -376,11 +429,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:59.544166Z", - "start_time": "2020-05-04T23:52:58.704343Z" + "end_time": "2020-07-14T20:56:52.735821Z", + "start_time": "2020-07-14T20:56:51.479625Z" } }, "outputs": [ @@ -418,11 +471,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:59.765358Z", - "start_time": "2020-05-04T23:52:59.548838Z" + "end_time": "2020-07-14T20:56:53.023200Z", + "start_time": "2020-07-14T20:56:52.742662Z" } }, "outputs": [ @@ -463,11 +516,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:52:59.818594Z", - "start_time": "2020-05-04T23:52:59.771831Z" + "end_time": "2020-07-14T20:56:53.057049Z", + "start_time": "2020-07-14T20:56:53.028120Z" } }, "outputs": [ @@ -505,11 +558,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.601087Z", - "start_time": "2020-05-04T23:52:59.822916Z" + "end_time": "2020-07-14T20:56:54.204425Z", + "start_time": "2020-07-14T20:56:53.061055Z" } }, "outputs": [ @@ -555,11 +608,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.699959Z", - "start_time": "2020-05-04T23:53:00.605441Z" + "end_time": "2020-07-14T20:56:54.346188Z", + "start_time": "2020-07-14T20:56:54.220097Z" } }, "outputs": [ @@ -590,11 +643,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.739137Z", - "start_time": "2020-05-04T23:53:00.715138Z" + "end_time": 
"2020-07-14T20:56:54.433989Z", + "start_time": "2020-07-14T20:56:54.365098Z" } }, "outputs": [ @@ -626,11 +679,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.823017Z", - "start_time": "2020-05-04T23:53:00.745372Z" + "end_time": "2020-07-14T20:56:55.162260Z", + "start_time": "2020-07-14T20:56:54.440397Z" } }, "outputs": [ @@ -638,14 +691,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "x0' = -0.310 x0 x2 + 0.342 x1 x2 + -0.002 x2^2\n", - "x1' = 15.952 x1 + 0.009 x0 x1 + -0.219 x0 x2 + -0.474 x1 x2 + 0.007 x2^2\n", - "x2' = 0.711 x0^2 + 0.533 x0 x1 + -0.005 x1 x2 + -0.119 x2^2\n" + "x0' = -10.005 x0 + 10.003 x1\n", + "x1' = 27.990 x0 + -0.997 x1 + -1.000 x0 x2\n", + "x2' = -2.665 x2 + 1.000 x0 x1\n" ] } ], "source": [ - "lasso_optimizer = Lasso(alpha=100, fit_intercept=False)\n", + "lasso_optimizer = Lasso(alpha=2, max_iter=2000, fit_intercept=False)\n", "\n", "model = ps.SINDy(optimizer=lasso_optimizer)\n", "model.fit(x_train, t=dt)\n", @@ -668,11 +721,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 20, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.895121Z", - "start_time": "2020-05-04T23:53:00.827305Z" + "end_time": "2020-07-14T20:56:55.291443Z", + "start_time": "2020-07-14T20:56:55.168358Z" } }, "outputs": [ @@ -703,11 +756,11 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:00.984647Z", - "start_time": "2020-05-04T23:53:00.899744Z" + "end_time": "2020-07-14T20:56:55.439961Z", + "start_time": "2020-07-14T20:56:55.309785Z" } }, "outputs": [ @@ -738,11 +791,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.048121Z", - "start_time": "2020-05-04T23:53:00.988405Z" + "end_time": "2020-07-14T20:56:55.563403Z", + 
"start_time": "2020-07-14T20:56:55.452308Z" } }, "outputs": [ @@ -780,11 +833,11 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.108527Z", - "start_time": "2020-05-04T23:53:01.052259Z" + "end_time": "2020-07-14T20:56:55.625166Z", + "start_time": "2020-07-14T20:56:55.572102Z" } }, "outputs": [ @@ -814,11 +867,11 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.182768Z", - "start_time": "2020-05-04T23:53:01.111848Z" + "end_time": "2020-07-14T20:56:55.731818Z", + "start_time": "2020-07-14T20:56:55.629200Z" } }, "outputs": [ @@ -847,11 +900,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.255141Z", - "start_time": "2020-05-04T23:53:01.187317Z" + "end_time": "2020-07-14T20:56:55.884658Z", + "start_time": "2020-07-14T20:56:55.768825Z" } }, "outputs": [ @@ -882,11 +935,11 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.338318Z", - "start_time": "2020-05-04T23:53:01.259734Z" + "end_time": "2020-07-14T20:56:56.065575Z", + "start_time": "2020-07-14T20:56:55.899119Z" } }, "outputs": [ @@ -918,11 +971,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.440676Z", - "start_time": "2020-05-04T23:53:01.341842Z" + "end_time": "2020-07-14T20:56:56.280764Z", + "start_time": "2020-07-14T20:56:56.081442Z" } }, "outputs": [ @@ -968,11 +1021,11 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.548264Z", - "start_time": "2020-05-04T23:53:01.443951Z" + "end_time": "2020-07-14T20:56:56.399316Z", + "start_time": 
"2020-07-14T20:56:56.286131Z" } }, "outputs": [ @@ -1009,11 +1062,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.594422Z", - "start_time": "2020-05-04T23:53:01.552310Z" + "end_time": "2020-07-14T20:56:56.468305Z", + "start_time": "2020-07-14T20:56:56.403688Z" } }, "outputs": [ @@ -1044,11 +1097,11 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.698338Z", - "start_time": "2020-05-04T23:53:01.598764Z" + "end_time": "2020-07-14T20:56:56.543797Z", + "start_time": "2020-07-14T20:56:56.484558Z" } }, "outputs": [ @@ -1081,11 +1134,11 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:01.728992Z", - "start_time": "2020-05-04T23:53:01.702116Z" + "end_time": "2020-07-14T20:56:56.565980Z", + "start_time": "2020-07-14T20:56:56.548517Z" } }, "outputs": [], @@ -1113,11 +1166,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:02.011213Z", - "start_time": "2020-05-04T23:53:01.732790Z" + "end_time": "2020-07-14T20:56:56.803086Z", + "start_time": "2020-07-14T20:56:56.573845Z" } }, "outputs": [], @@ -1133,11 +1186,11 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:02.091134Z", - "start_time": "2020-05-04T23:53:02.017047Z" + "end_time": "2020-07-14T20:56:56.984774Z", + "start_time": "2020-07-14T20:56:56.818167Z" } }, "outputs": [ @@ -1167,11 +1220,11 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 34, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:02.429489Z", - "start_time": "2020-05-04T23:53:02.095920Z" + "end_time": "2020-07-14T20:56:57.377089Z", + "start_time": 
"2020-07-14T20:56:57.009250Z" } }, "outputs": [ @@ -1203,11 +1256,11 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 35, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:04.466855Z", - "start_time": "2020-05-04T23:53:02.433423Z" + "end_time": "2020-07-14T20:56:59.933485Z", + "start_time": "2020-07-14T20:56:57.389767Z" } }, "outputs": [ @@ -1252,11 +1305,11 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 36, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:10.641341Z", - "start_time": "2020-05-04T23:53:04.473447Z" + "end_time": "2020-07-14T20:57:10.131247Z", + "start_time": "2020-07-14T20:56:59.940940Z" } }, "outputs": [ @@ -1320,11 +1373,11 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 37, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:10.660918Z", - "start_time": "2020-05-04T23:53:10.646583Z" + "end_time": "2020-07-14T20:57:10.175317Z", + "start_time": "2020-07-14T20:57:10.138496Z" } }, "outputs": [], @@ -1337,11 +1390,11 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 38, "metadata": { "ExecuteTime": { - "end_time": "2020-05-04T23:53:17.453546Z", - "start_time": "2020-05-04T23:53:10.666713Z" + "end_time": "2020-07-14T20:57:21.575626Z", + "start_time": "2020-07-14T20:57:10.181650Z" } }, "outputs": [ @@ -1425,7 +1478,12 @@ "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, - "toc_position": {}, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "296.475px" + }, "toc_section_display": true, "toc_window_display": true }, diff --git a/examples/4_scikit_learn_compatibility.ipynb b/examples/4_scikit_learn_compatibility.ipynb new file mode 100644 index 000000000..cb10f1f9f --- /dev/null +++ b/examples/4_scikit_learn_compatibility.ipynb @@ -0,0 +1,380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 
Working with Scikit-learn\n", + "This notebook shows how PySINDy objects interface with some useful tools from [Scikit-learn](https://scikit-learn.org/stable/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:35.177225Z", + "start_time": "2020-07-14T21:07:33.573951Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from scipy.integrate import odeint\n", + "\n", + "import pysindy as ps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's generate some training data from the [Lorenz system](https://en.wikipedia.org/wiki/Lorenz_system) with which to experiment." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:35.285747Z", + "start_time": "2020-07-14T21:07:35.184412Z" + } + }, + "outputs": [], + "source": [ + "def lorenz(z, t):\n", + " return [\n", + " 10 * (z[1] - z[0]),\n", + " z[0] * (28 - z[2]) - z[1],\n", + " z[0] * z[1] - (8 / 3) * z[2]\n", + " ]\n", + "\n", + "# Generate training data\n", + "dt = .002\n", + "\n", + "t_train = np.arange(0, 10, dt)\n", + "x0_train = [-8, 8, 27]\n", + "x_train = odeint(lorenz, x0_train, t_train)\n", + "\n", + "# Evolve the Lorenz equations in time using a different initial condition\n", + "t_test = np.arange(0, 15, dt)\n", + "x0_test = np.array([8, 7, 15])\n", + "x_test = odeint(lorenz, x0_test, t_test) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cross-validation\n", + "PySINDy supports Scikit-learn-type cross-validation with a few caveats.\n", + "\n", + "1. We must use **uniform timesteps** using the `t_default` parameter. This is because the `fit` and `score` methods of `SINDy` differ from those used in Scikit-learn in the sense that they both have an optional `t` parameter. 
Setting `t_default` is a workaround.\n", + "2. We have to be careful about the way we split up testing and training data during cross-validation. Because the `SINDy` object needs to differentiate the data, we need the training and test data to consist of sequential intervals of time. If we randomly sample the data, then the computed derivatives will be horribly inaccurate. Luckily, Scikit-learn has a `TimeSeriesSplit` object for such situations. If we really want to randomly sample the data during cross-validation, there is a way to do so. However, it's more complicated.\n", + "\n", + "Note that we need to prepend `optimizer__`, `feature_library__`, or `differentiation_method__` to the parameter names." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation with TimeSeriesSplit" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:44.101945Z", + "start_time": "2020-07-14T21:07:35.293352Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters: {'differentiation_method__order': 2, 'feature_library': PolynomialLibrary(), 'optimizer__alpha': 0.01, 'optimizer__threshold': 0.01}\n", + "x0' = -9.999 x0 + 9.999 x1\n", + "x1' = 27.992 x0 + -0.999 x1 + -1.000 x0 x2\n", + "x2' = -2.666 x2 + 1.000 x0 x1\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import TimeSeriesSplit\n", + "\n", + "model = ps.SINDy(t_default=dt)\n", + "\n", + "param_grid = {\n", + " \"optimizer__threshold\": [0.001, 0.01, 0.1],\n", + " \"optimizer__alpha\": [0.01, 0.05, 0.1],\n", + " \"feature_library\": [ps.PolynomialLibrary(), ps.FourierLibrary()],\n", + " \"differentiation_method__order\": [1, 2]\n", + "}\n", + "\n", + "search = GridSearchCV(\n", + " model,\n", + " param_grid,\n", + " cv=TimeSeriesSplit(n_splits=5)\n", + ")\n", + "search.fit(x_train)\n", + "\n", + 
"print(\"Best parameters:\", search.best_params_)\n", + "search.best_estimator_.print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-validation without TimeSeriesSplit\n", + "If we want to use another cross-validation splitter, we'll need to (a) define a wrapper class which uses the argument \"y\" instead of \"x_dot\" and (b) precompute the derivatives. Note that (b) means that we will not be able to perform cross-validation on the parameters of the differentiation method." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:44.133790Z", + "start_time": "2020-07-14T21:07:44.116536Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.metrics import r2_score\n", + "\n", + "class SINDyCV(ps.SINDy):\n", + " def __init__(\n", + " self,\n", + " optimizer=None,\n", + " feature_library=None,\n", + " differentiation_method=None,\n", + " feature_names=None,\n", + " t_default=1,\n", + " discrete_time=False,\n", + " n_jobs=1\n", + " ):\n", + " super(SINDyCV, self).__init__(\n", + " optimizer=optimizer,\n", + " feature_library=feature_library,\n", + " differentiation_method=differentiation_method,\n", + " feature_names=feature_names,\n", + " t_default=t_default,\n", + " discrete_time=discrete_time,\n", + " n_jobs=n_jobs\n", + " )\n", + "\n", + " def fit(self, x, y, **kwargs):\n", + " return super(SINDyCV, self).fit(x, x_dot=y, **kwargs)\n", + " \n", + " def score(\n", + " self,\n", + " x,\n", + " y,\n", + " t=None,\n", + " u=None,\n", + " multiple_trajectories=False,\n", + " metric=r2_score,\n", + " **metric_kws\n", + " ):\n", + " return super(SINDyCV, self).score(\n", + " x,\n", + " x_dot=y,\n", + " t=t,\n", + " u=u,\n", + " multiple_trajectories=multiple_trajectories,\n", + " metric=metric,\n", + " **metric_kws\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": 
"2020-07-14T21:07:50.098150Z", + "start_time": "2020-07-14T21:07:44.137586Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best parameters: {'feature_library__degree': 2, 'optimizer__alpha': 0.01, 'optimizer__threshold': 0.002}\n", + "x0' = -9.999 x0 + 9.999 x1\n", + "x1' = 27.992 x0 + -0.999 x1 + -1.000 x0 x2\n", + "x2' = -2.666 x2 + 1.000 x0 x1\n" + ] + } + ], + "source": [ + "from sklearn.model_selection import ShuffleSplit\n", + "\n", + "model = SINDyCV()\n", + "x_dot = model.differentiate(x_train, t=t_train)\n", + "\n", + "param_grid = {\n", + " \"optimizer__threshold\": [0.002, 0.01, 0.1],\n", + " \"optimizer__alpha\": [0.01, 0.05, 0.1],\n", + " \"feature_library__degree\": [1, 2, 3],\n", + "}\n", + "\n", + "search = GridSearchCV(\n", + " model,\n", + " param_grid,\n", + " cv=ShuffleSplit(n_splits=3, test_size=0.25)\n", + ")\n", + "search.fit(x_train, y=x_dot)\n", + "print(\"Best parameters:\", search.best_params_)\n", + "search.best_estimator_.print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sparse optimizers\n", + "Any of Scikit-learn's [linear models ](https://scikit-learn.org/stable/modules/linear_model.html) can be used for the `optimizer` parameter of a `SINDy` object, though we only recommend using those designed for sparse regression.\n", + "\n", + "In the examples below we set `fit_intercept` to `False` since the default feature library (polynomials of degree up to two) already includes constant functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:50.552501Z", + "start_time": "2020-07-14T21:07:50.107361Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x0' = -10.005 x0 + 10.003 x1\n", + "x1' = 27.990 x0 + -0.997 x1 + -1.000 x0 x2\n", + "x2' = -2.665 x2 + 0.001 x0^2 + 0.999 x0 x1\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import ElasticNet\n", + "\n", + "model = ps.SINDy(optimizer=ElasticNet(l1_ratio=0.9, fit_intercept=False), t_default=dt)\n", + "model.fit(x_train)\n", + "model.print()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-07-14T21:07:50.660764Z", + "start_time": "2020-07-14T21:07:50.577603Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x0' = -10.005 x0 + 10.003 x1\n", + "x1' = 27.990 x0 + -0.997 x1 + -1.000 x0 x2\n", + "x2' = -2.665 x2 + 0.001 x0^2 + 0.999 x0 x1\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import OrthogonalMatchingPursuit\n", + "\n", + "model = ps.SINDy(\n", + " optimizer=OrthogonalMatchingPursuit(n_nonzero_coefs=8, fit_intercept=False),\n", + " t_default=dt\n", + ")\n", + "model.fit(x_train)\n", + "model.print()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": 
false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pysindy/differentiation/base.py b/pysindy/differentiation/base.py index d5e5baca5..b9c2f336a 100644 --- a/pysindy/differentiation/base.py +++ b/pysindy/differentiation/base.py @@ -3,10 +3,12 @@ """ import abc +from sklearn.base import BaseEstimator + from pysindy.utils.base import validate_input -class BaseDifferentiation: +class BaseDifferentiation(BaseEstimator): """ Base class for differentiation methods. diff --git a/pysindy/pysindy.py b/pysindy/pysindy.py index 7f95aee6f..b356efd0b 100644 --- a/pysindy/pysindy.py +++ b/pysindy/pysindy.py @@ -13,10 +13,10 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.metrics import r2_score from sklearn.pipeline import Pipeline -from sklearn.preprocessing import PolynomialFeatures from sklearn.utils.validation import check_is_fitted from pysindy.differentiation import FiniteDifference +from pysindy.feature_library import PolynomialLibrary from pysindy.optimizers import SINDyOptimizer from pysindy.optimizers import STLSQ from pysindy.utils.base import drop_nan_rows @@ -51,6 +51,9 @@ class SINDy(BaseEstimator): Names for the input features (e.g. ``['x', 'y', 'z']``). If None, will use ``['x0', 'x1', ...]``. + t_default : float, optional (default 1) + Default value for the time step. + discrete_time : boolean, optional (default False) If True, dynamical system is treated as a map. 
Rather than predicting derivatives, the right hand side functions step the system forward by @@ -141,6 +144,7 @@ def __init__( feature_library=None, differentiation_method=None, feature_names=None, + t_default=1, discrete_time=False, n_jobs=1, ): @@ -148,11 +152,17 @@ def __init__( optimizer = STLSQ() self.optimizer = optimizer if feature_library is None: - feature_library = PolynomialFeatures() + feature_library = PolynomialLibrary() self.feature_library = feature_library if differentiation_method is None: differentiation_method = FiniteDifference() self.differentiation_method = differentiation_method + if not isinstance(t_default, float) and not isinstance(t_default, int): + raise ValueError("t_default must be a positive number") + elif t_default <= 0: + raise ValueError("t_default must be a positive number") + else: + self.t_default = t_default self.feature_names = feature_names self.discrete_time = discrete_time self.n_jobs = n_jobs @@ -160,7 +170,7 @@ def __init__( def fit( self, x, - t=1, + t=None, x_dot=None, u=None, multiple_trajectories=False, @@ -178,7 +188,7 @@ def fit( trajectories may contain different numbers of samples. t: float, numpy array of shape [n_samples], or list of numpy arrays, optional \ - (default 1) + (default None) If t is a float, it specifies the timestep between each sample. If array-like, it specifies the time at which each sample was collected. @@ -186,7 +196,7 @@ def fit( In the case of multi-trajectory training data, t may also be a list of arrays containing the collection times for each individual trajectory. - Default value is a timestep of 1 between samples. + If None, the default time step ``t_default`` will be used. 
x_dot: array-like or list of array-like, shape (n_samples, n_input_features), \ optional (default None) @@ -229,6 +239,8 @@ def fit( ------- self: returns an instance of self """ + if t is None: + t = self.t_default if u is None: self.n_control_features_ = 0 else: @@ -242,7 +254,7 @@ def fit( self.n_control_features_ = u.shape[1] if multiple_trajectories: - x, x_dot = self.process_multiple_trajectories(x, t, x_dot) + x, x_dot = self._process_multiple_trajectories(x, t, x_dot) else: x = validate_input(x, t) @@ -393,7 +405,7 @@ def print(self, lhs=None, precision=3): def score( self, x, - t=1, + t=None, x_dot=None, u=None, multiple_trajectories=False, @@ -409,10 +421,11 @@ def score( Samples t: float, numpy array of shape [n_samples], or list of numpy arrays, optional \ - (default 1) + (default None) Time step between samples or array of collection times. Optional, used to compute the time derivatives of the samples if x_dot is not provided. + If None, the default time step ``t_default`` will be used. x_dot: array-like or list of array-like, shape (n_samples, n_input_features), \ optional (default None) @@ -443,6 +456,8 @@ def score( score: float Metric function value for the model prediction of x_dot. """ + if t is None: + t = self.t_default if u is None or self.n_control_features_ == 0: if self.n_control_features_ > 0: raise TypeError( @@ -463,7 +478,7 @@ def score( ) if multiple_trajectories: - x, x_dot = self.process_multiple_trajectories( + x, x_dot = self._process_multiple_trajectories( x, t, x_dot, return_array=True ) else: @@ -488,7 +503,7 @@ def score( x_dot_predict = self.model.predict(x) return metric(x_dot, x_dot_predict, **metric_kws) - def process_multiple_trajectories(self, x, t, x_dot, return_array=True): + def _process_multiple_trajectories(self, x, t, x_dot, return_array=True): """ Handle input data that contains multiple trajectories by doing the necessary validation, reshaping, and computation of derivatives. 
@@ -536,7 +551,7 @@ def process_multiple_trajectories(self, x, t, x_dot, return_array=True): else: return x, x_dot - def differentiate(self, x, t=1, multiple_trajectories=False): + def differentiate(self, x, t=None, multiple_trajectories=False): """ Apply the model's differentiation method to data. @@ -546,9 +561,9 @@ def differentiate(self, x, t=1, multiple_trajectories=False): Data to be differentiated. t: int, numpy array of shape [n_samples], or list of numpy arrays, optional \ - (default 1) - Time step between samples or array of collection times. Default is - a time step of 1 between samples. + (default None) + Time step between samples or array of collection times. + If None, the default time step ``t_default`` will be used. multiple_trajectories: boolean, optional (default False) If True, x contains multiple trajectories and must be a list of @@ -560,11 +575,15 @@ def differentiate(self, x, t=1, multiple_trajectories=False): Time derivatives computed by using the model's differentiation method """ + if t is None: + t = self.t_default if self.discrete_time: raise RuntimeError("No differentiation implemented for discrete time model") if multiple_trajectories: - return self.process_multiple_trajectories(x, t, None, return_array=False)[1] + return self._process_multiple_trajectories(x, t, None, return_array=False)[ + 1 + ] else: x = validate_input(x, t) return self.differentiation_method(x, t) @@ -629,7 +648,7 @@ def simulate( raise TypeError("Model was fit using control variables, so u is required") if self.discrete_time: - if not isinstance(t, int): + if not isinstance(t, int) or t <= 0: raise ValueError( "For discrete time model, t must be an integer (indicating" "the number of steps to predict)" diff --git a/pysindy/utils/base.py b/pysindy/utils/base.py index 882c5d03c..0174b50fa 100644 --- a/pysindy/utils/base.py +++ b/pysindy/utils/base.py @@ -24,7 +24,7 @@ def validate_input(x, t=T_DEFAULT): if t is None: raise ValueError("t must be a scalar or 
array-like.") # Apply this check if t is a scalar - elif np.ndim(t) == 0: + elif np.ndim(t) == 0 and (isinstance(t, int) or isinstance(t, float)): if t <= 0: raise ValueError("t must be positive") # Only apply these tests if t is array-like diff --git a/test/test_pysindy.py b/test/test_pysindy.py index dd8646cb4..7942f8f14 100644 --- a/test/test_pysindy.py +++ b/test/test_pysindy.py @@ -115,9 +115,9 @@ def test_bad_t(data): x, t = data model = SINDy() - # No t + # Wrong type with pytest.raises(ValueError): - model.fit(x, t=None) + model.fit(x, t="1") # Invalid value of t with pytest.raises(ValueError): @@ -143,6 +143,31 @@ def test_bad_t(data): model.fit(x, t) +@pytest.mark.parametrize( + "data", [pytest.lazy_fixture("data_1d"), pytest.lazy_fixture("data_lorenz")] +) +def test_t_default(data): + x, t = data + dt = t[1] - t[0] + + with pytest.raises(ValueError): + model = SINDy(t_default=0) + with pytest.raises(ValueError): + model = SINDy(t_default="1") + + model = SINDy() + model.fit(x, dt) + + model_t_default = SINDy(t_default=dt) + model_t_default.fit(x) + + np.testing.assert_allclose(model.coefficients(), model_t_default.coefficients()) + np.testing.assert_almost_equal(model.score(x, t=dt), model_t_default.score(x)) + np.testing.assert_almost_equal( + model.differentiate(x, t=dt), model_t_default.differentiate(x) + ) + + @pytest.mark.parametrize( "data, optimizer", [ @@ -254,6 +279,11 @@ def test_fit_multiple_trajectores(data_multiple_trajctories): model.fit(x, t=t, x_dot=x, multiple_trajectories=True) check_is_fitted(model) + # Test validate_input + t[0] = None + with pytest.raises(ValueError): + model.fit(x, t=t, multiple_trajectories=True) + def test_predict_multiple_trajectories(data_multiple_trajctories): x, t = data_multiple_trajctories @@ -460,17 +490,17 @@ def test_multiple_trajectories_errors(data_multiple_trajctories, data_discrete_t model = SINDy() with pytest.raises(TypeError): - model.process_multiple_trajectories(np.array(x), t, x) + 
model._process_multiple_trajectories(np.array(x), t, x) with pytest.raises(TypeError): - model.process_multiple_trajectories(x, t, np.array(x)) + model._process_multiple_trajectories(x, t, np.array(x)) # Test an option that doesn't get tested elsewhere - model.process_multiple_trajectories(x, t, x, return_array=False) + model._process_multiple_trajectories(x, t, x, return_array=False) x = data_discrete_time model = SINDy(discrete_time=True) with pytest.raises(TypeError): - model.process_multiple_trajectories(x, t, np.array(x)) + model._process_multiple_trajectories(x, t, np.array(x)) def test_simulate_errors(data_lorenz): diff --git a/test/test_sindyc.py b/test/test_sindyc.py index 75001c8d5..4bdd7f0f4 100644 --- a/test/test_sindyc.py +++ b/test/test_sindyc.py @@ -181,9 +181,9 @@ def test_bad_t(data): x, t, u, _ = data model = SINDy() - # No t + # Wrong type with pytest.raises(ValueError): - model.fit(x, u=u, t=None) + model.fit(x, u=u, t="1") # Invalid value of t with pytest.raises(ValueError): @@ -209,6 +209,26 @@ def test_bad_t(data): model.fit(x, u=u, t=t) +@pytest.mark.parametrize( + "data", + [pytest.lazy_fixture("data_lorenz_c_1d"), pytest.lazy_fixture("data_lorenz_c_2d")], +) +def test_t_default(data): + x, t, u, _ = data + dt = t[1] - t[0] + + model = SINDy() + model.fit(x, u=u, t=dt) + + model_t_default = SINDy(t_default=dt) + model_t_default.fit(x, u=u) + + np.testing.assert_allclose(model.coefficients(), model_t_default.coefficients()) + np.testing.assert_almost_equal( + model.score(x, u=u, t=dt), model_t_default.score(x, u=u) + ) + + @pytest.mark.parametrize( "data, optimizer", [