Bugfix/drive downloads (#42)

* Black changes. * Black changes * Add gdown dependency, and missing file overwriting ignore. * Update test import. * Remove unnecessary import. * Update original model with new trained weights, add tested notebooks from clean state, save classes file after fitting model. * Remove import of removed dependency. * Check if the zip file is present, to avoid confusion with symlink dir. * Change data download, remove symlink dir.
mitmedialab · Feb 22, 2022 · 5b3ac69 · 5b3ac69
1 parent 6c7e847
commit 5b3ac69
Show file tree

Hide file tree

Showing 14 changed files with 841 additions and 758 deletions.
diff --git a/data/data b/data/data
diff --git a/model_files/classes_sherlock.npy b/model_files/classes_sherlock.npy
diff --git a/model_files/sherlock_weights.h5 b/model_files/sherlock_weights.h5
diff --git a/notebooks/00-use-sherlock-out-of-the-box.ipynb b/notebooks/00-use-sherlock-out-of-the-box.ipynb
@@ -8,7 +8,8 @@
     "# Using Sherlock out-of-the-box\n",
     "This notebook shows how to predict a semantic type for a given table column.\n",
     "The steps are basically:\n",
-    "- Extract features from a column.\n",
+    "- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.\n",
+    "- Extract features from table columns.\n",
     "- Initialize Sherlock.\n",
     "- Make a prediction for the feature representation of the column."
    ]
@@ -44,11 +45,14 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "UsageError: Environment does not have key: PYTHONHASHSEED\n"
-     ]
+     "data": {
+      "text/plain": [
+       "'13'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
@@ -57,20 +61,10 @@
   },
   {
    "cell_type": "markdown",
-   "id": "2b3b7967",
+   "id": "f1101303",
    "metadata": {},
    "source": [
-    "## Extract features"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "164f74ff",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# helpers.download_data()"
+    "## Initialize feature extraction models"
    ]
   },
   {
@@ -93,9 +87,9 @@
       "        \n",
       "All files for extracting word and paragraph embeddings are present.\n",
       "Initialising word embeddings\n",
-      "Initialise Word Embeddings process took 0:00:05.607905 seconds.\n",
-      "Initialise Doc2Vec Model, 400 dim, process took 0:00:02.443327 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
-      "Initialised NLTK, process took 0:00:00.181374 seconds.\n"
+      "Initialise Word Embeddings process took 0:00:05.513540 seconds.\n",
+      "Initialise Doc2Vec Model, 400 dim, process took 0:00:04.191875 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
+      "Initialised NLTK, process took 0:00:00.209930 seconds.\n"
      ]
     },
     {
@@ -117,9 +111,17 @@
     "initialise_nltk()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "2b3b7967",
+   "metadata": {},
+   "source": [
+    "## Extract features"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 4,
    "id": "db04ccf9",
    "metadata": {},
    "outputs": [],
@@ -128,26 +130,28 @@
     "    [\n",
     "        [\"Jane Smith\", \"Lute Ahorn\", \"Anna James\"],\n",
     "        [\"Amsterdam\", \"Haarlem\", \"Zwolle\"],\n",
+    "        [\"Chabot Street 19\", \"1200 fifth Avenue\", \"Binnenkant 22, 1011BH\"]\n",
     "    ],\n",
     "    name=\"values\"\n",
     ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 5,
    "id": "4875f6c7",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0    [Jane Smith, Lute Ahorn, Anna James]\n",
-       "1            [Amsterdam, Haarlem, Zwolle]\n",
+       "0                 [Jane Smith, Lute Ahorn, Anna James]\n",
+       "1                         [Amsterdam, Haarlem, Zwolle]\n",
+       "2    [Chabot Street 19, 1200 fifth Avenue, Binnenka...\n",
        "Name: values, dtype: object"
       ]
      },
-     "execution_count": 36,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -158,15 +162,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 8,
    "id": "f7f2c846",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Extracting Features: 100%|██████████| 2/2 [00:00<00:00, 62.37it/s]\n"
+      "Extracting Features: 100%|██████████| 3/3 [00:00<00:00, 167.51it/s]"
      ]
     },
     {
@@ -175,19 +179,26 @@
      "text": [
       "Exporting 1588 column features\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
     }
    ],
    "source": [
     "extract_features(\n",
     "    \"../temporary.csv\",\n",
     "    data\n",
     ")\n",
-    "feature_vector = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
+    "feature_vectors = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 9,
    "id": "0c42ce71",
    "metadata": {},
    "outputs": [
@@ -241,95 +252,116 @@
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>-3.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>...</td>\n",
-       "      <td>-0.115819</td>\n",
-       "      <td>0.023961</td>\n",
-       "      <td>-0.130739</td>\n",
-       "      <td>0.006393</td>\n",
-       "      <td>-0.135118</td>\n",
-       "      <td>-0.071956</td>\n",
-       "      <td>-0.051051</td>\n",
-       "      <td>-0.068307</td>\n",
-       "      <td>0.087342</td>\n",
-       "      <td>-0.145716</td>\n",
+       "      <td>-0.116468</td>\n",
+       "      <td>0.023982</td>\n",
+       "      <td>-0.130867</td>\n",
+       "      <td>0.006825</td>\n",
+       "      <td>-0.135098</td>\n",
+       "      <td>-0.070616</td>\n",
+       "      <td>-0.052172</td>\n",
+       "      <td>-0.067250</td>\n",
+       "      <td>0.086256</td>\n",
+       "      <td>-0.144385</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
-       "      <td>0.0</td>\n",
+       "      <td>0.000000</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>-3.0</td>\n",
        "      <td>0.0</td>\n",
        "      <td>...</td>\n",
-       "      <td>-0.054351</td>\n",
-       "      <td>0.023650</td>\n",
-       "      <td>-0.165681</td>\n",
-       "      <td>-0.016137</td>\n",
-       "      <td>-0.059402</td>\n",
-       "      <td>0.008454</td>\n",
-       "      <td>-0.044624</td>\n",
-       "      <td>0.025160</td>\n",
-       "      <td>0.037831</td>\n",
-       "      <td>-0.086235</td>\n",
+       "      <td>-0.054949</td>\n",
+       "      <td>0.024502</td>\n",
+       "      <td>-0.166001</td>\n",
+       "      <td>-0.014375</td>\n",
+       "      <td>-0.058199</td>\n",
+       "      <td>0.009978</td>\n",
+       "      <td>-0.046423</td>\n",
+       "      <td>0.025163</td>\n",
+       "      <td>0.036946</td>\n",
+       "      <td>-0.086611</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.666667</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>-1.5</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-0.022804</td>\n",
+       "      <td>0.001741</td>\n",
+       "      <td>0.047479</td>\n",
+       "      <td>0.118293</td>\n",
+       "      <td>-0.093435</td>\n",
+       "      <td>0.036759</td>\n",
+       "      <td>-0.004508</td>\n",
+       "      <td>-0.087898</td>\n",
+       "      <td>-0.117796</td>\n",
+       "      <td>-0.191386</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
-       "<p>2 rows × 1588 columns</p>\n",
+       "<p>3 rows × 1588 columns</p>\n",
        "</div>"
       ],
       "text/plain": [
        "   n_[0]-agg-any  n_[0]-agg-all  n_[0]-agg-mean  n_[0]-agg-var  n_[0]-agg-min  \\\n",
-       "0            0.0            0.0             0.0            0.0            0.0   \n",
-       "1            0.0            0.0             0.0            0.0            0.0   \n",
+       "0            0.0            0.0             0.0       0.000000            0.0   \n",
+       "1            0.0            0.0             0.0       0.000000            0.0   \n",
+       "2            1.0            0.0             1.0       0.666667            0.0   \n",
        "\n",
        "   n_[0]-agg-max  n_[0]-agg-median  n_[0]-agg-sum  n_[0]-agg-kurtosis  \\\n",
        "0            0.0               0.0            0.0                -3.0   \n",
        "1            0.0               0.0            0.0                -3.0   \n",
+       "2            2.0               1.0            3.0                -1.5   \n",
        "\n",
        "   n_[0]-agg-skewness  ...  par_vec_390  par_vec_391  par_vec_392  \\\n",
-       "0                 0.0  ...    -0.115819     0.023961    -0.130739   \n",
-       "1                 0.0  ...    -0.054351     0.023650    -0.165681   \n",
+       "0                 0.0  ...    -0.116468     0.023982    -0.130867   \n",
+       "1                 0.0  ...    -0.054949     0.024502    -0.166001   \n",
+       "2                 0.0  ...    -0.022804     0.001741     0.047479   \n",
        "\n",
        "   par_vec_393  par_vec_394  par_vec_395  par_vec_396  par_vec_397  \\\n",
-       "0     0.006393    -0.135118    -0.071956    -0.051051    -0.068307   \n",
-       "1    -0.016137    -0.059402     0.008454    -0.044624     0.025160   \n",
+       "0     0.006825    -0.135098    -0.070616    -0.052172    -0.067250   \n",
+       "1    -0.014375    -0.058199     0.009978    -0.046423     0.025163   \n",
+       "2     0.118293    -0.093435     0.036759    -0.004508    -0.087898   \n",
        "\n",
        "   par_vec_398  par_vec_399  \n",
-       "0     0.087342    -0.145716  \n",
-       "1     0.037831    -0.086235  \n",
+       "0     0.086256    -0.144385  \n",
+       "1     0.036946    -0.086611  \n",
+       "2    -0.117796    -0.191386  \n",
        "\n",
-       "[2 rows x 1588 columns]"
+       "[3 rows x 1588 columns]"
       ]
      },
-     "execution_count": 38,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "feature_vector"
+    "feature_vectors"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "52047a6b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -343,18 +375,18 @@
    "id": "9027fa4a",
    "metadata": {},
    "source": [
-    "## Initialize Sherlock."
+    "## Initialize Sherlock"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 11,
    "id": "b9ec13ec",
    "metadata": {},
    "outputs": [],
    "source": [
     "model = SherlockModel();\n",
-    "model.initialize_model_from_json(with_weights=True);"
+    "model.initialize_model_from_json(with_weights=True, model_id=\"sherlock\");"
    ]
   },
   {
@@ -375,27 +407,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 12,
    "id": "fc079fa9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "predicted_labels = model.predict(feature_vector, \"sherlock\")"
+    "predicted_labels = model.predict(feature_vectors, \"sherlock\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 13,
    "id": "0feb9584",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "array(['creator', 'city'], dtype=object)"
+       "array(['person', 'city', 'address'], dtype=object)"
       ]
      },
-     "execution_count": 41,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }