Skip to content

Commit

Permalink
Bugfix/drive downloads (#42)
Browse files Browse the repository at this point in the history
* Black changes.

* Black changes

* Add gdown dependency, and missing file overwriting ignore.

* Update test import.

* Remove unnecessary import.

* Update original model with new trained weights, add tested notebooks from clean state, save classes file after fitting model.

* Remove import of a removed dependency.

* Check if the zip file is present, to avoid confusion with symlink dir.

* Change data download, remove symlink dir.
  • Loading branch information
madelonhulsebos authored Feb 22, 2022
1 parent 6c7e847 commit 5b3ac69
Show file tree
Hide file tree
Showing 14 changed files with 841 additions and 758 deletions.
1 change: 0 additions & 1 deletion data/data

This file was deleted.

Binary file added model_files/classes_sherlock.npy
Binary file not shown.
Binary file modified model_files/sherlock_weights.h5
Binary file not shown.
192 changes: 112 additions & 80 deletions notebooks/00-use-sherlock-out-of-the-box.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"# Using Sherlock out-of-the-box\n",
"This notebook shows how to predict a semantic type for a given table column.\n",
"The steps are basically:\n",
"- Extract features from a column.\n",
"- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.\n",
"- Extract features from table columns.\n",
"- Initialize Sherlock.\n",
"- Make a prediction for the feature representation of the column."
]
Expand Down Expand Up @@ -44,11 +45,14 @@
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"UsageError: Environment does not have key: PYTHONHASHSEED\n"
]
"data": {
"text/plain": [
"'13'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
Expand All @@ -57,20 +61,10 @@
},
{
"cell_type": "markdown",
"id": "2b3b7967",
"id": "f1101303",
"metadata": {},
"source": [
"## Extract features"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "164f74ff",
"metadata": {},
"outputs": [],
"source": [
"# helpers.download_data()"
"## Initialize feature extraction models"
]
},
{
Expand All @@ -93,9 +87,9 @@
" \n",
"All files for extracting word and paragraph embeddings are present.\n",
"Initialising word embeddings\n",
"Initialise Word Embeddings process took 0:00:05.607905 seconds.\n",
"Initialise Doc2Vec Model, 400 dim, process took 0:00:02.443327 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
"Initialised NLTK, process took 0:00:00.181374 seconds.\n"
"Initialise Word Embeddings process took 0:00:05.513540 seconds.\n",
"Initialise Doc2Vec Model, 400 dim, process took 0:00:04.191875 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
"Initialised NLTK, process took 0:00:00.209930 seconds.\n"
]
},
{
Expand All @@ -117,9 +111,17 @@
"initialise_nltk()"
]
},
{
"cell_type": "markdown",
"id": "2b3b7967",
"metadata": {},
"source": [
"## Extract features"
]
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 4,
"id": "db04ccf9",
"metadata": {},
"outputs": [],
Expand All @@ -128,26 +130,28 @@
" [\n",
" [\"Jane Smith\", \"Lute Ahorn\", \"Anna James\"],\n",
" [\"Amsterdam\", \"Haarlem\", \"Zwolle\"],\n",
" [\"Chabot Street 19\", \"1200 fifth Avenue\", \"Binnenkant 22, 1011BH\"]\n",
" ],\n",
" name=\"values\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 5,
"id": "4875f6c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [Jane Smith, Lute Ahorn, Anna James]\n",
"1 [Amsterdam, Haarlem, Zwolle]\n",
"0 [Jane Smith, Lute Ahorn, Anna James]\n",
"1 [Amsterdam, Haarlem, Zwolle]\n",
"2 [Chabot Street 19, 1200 fifth Avenue, Binnenka...\n",
"Name: values, dtype: object"
]
},
"execution_count": 36,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -158,15 +162,15 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 8,
"id": "f7f2c846",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Extracting Features: 100%|██████████| 2/2 [00:00<00:00, 62.37it/s]\n"
"Extracting Features: 100%|██████████| 3/3 [00:00<00:00, 167.51it/s]"
]
},
{
Expand All @@ -175,19 +179,26 @@
"text": [
"Exporting 1588 column features\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"extract_features(\n",
" \"../temporary.csv\",\n",
" data\n",
")\n",
"feature_vector = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
"feature_vectors = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 9,
"id": "0c42ce71",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -241,95 +252,116 @@
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-3.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>-0.115819</td>\n",
" <td>0.023961</td>\n",
" <td>-0.130739</td>\n",
" <td>0.006393</td>\n",
" <td>-0.135118</td>\n",
" <td>-0.071956</td>\n",
" <td>-0.051051</td>\n",
" <td>-0.068307</td>\n",
" <td>0.087342</td>\n",
" <td>-0.145716</td>\n",
" <td>-0.116468</td>\n",
" <td>0.023982</td>\n",
" <td>-0.130867</td>\n",
" <td>0.006825</td>\n",
" <td>-0.135098</td>\n",
" <td>-0.070616</td>\n",
" <td>-0.052172</td>\n",
" <td>-0.067250</td>\n",
" <td>0.086256</td>\n",
" <td>-0.144385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>-3.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>-0.054351</td>\n",
" <td>0.023650</td>\n",
" <td>-0.165681</td>\n",
" <td>-0.016137</td>\n",
" <td>-0.059402</td>\n",
" <td>0.008454</td>\n",
" <td>-0.044624</td>\n",
" <td>0.025160</td>\n",
" <td>0.037831</td>\n",
" <td>-0.086235</td>\n",
" <td>-0.054949</td>\n",
" <td>0.024502</td>\n",
" <td>-0.166001</td>\n",
" <td>-0.014375</td>\n",
" <td>-0.058199</td>\n",
" <td>0.009978</td>\n",
" <td>-0.046423</td>\n",
" <td>0.025163</td>\n",
" <td>0.036946</td>\n",
" <td>-0.086611</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.666667</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>-1.5</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>-0.022804</td>\n",
" <td>0.001741</td>\n",
" <td>0.047479</td>\n",
" <td>0.118293</td>\n",
" <td>-0.093435</td>\n",
" <td>0.036759</td>\n",
" <td>-0.004508</td>\n",
" <td>-0.087898</td>\n",
" <td>-0.117796</td>\n",
" <td>-0.191386</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 1588 columns</p>\n",
"<p>3 rows × 1588 columns</p>\n",
"</div>"
],
"text/plain": [
" n_[0]-agg-any n_[0]-agg-all n_[0]-agg-mean n_[0]-agg-var n_[0]-agg-min \\\n",
"0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 0.000000 0.0 \n",
"1 0.0 0.0 0.0 0.000000 0.0 \n",
"2 1.0 0.0 1.0 0.666667 0.0 \n",
"\n",
" n_[0]-agg-max n_[0]-agg-median n_[0]-agg-sum n_[0]-agg-kurtosis \\\n",
"0 0.0 0.0 0.0 -3.0 \n",
"1 0.0 0.0 0.0 -3.0 \n",
"2 2.0 1.0 3.0 -1.5 \n",
"\n",
" n_[0]-agg-skewness ... par_vec_390 par_vec_391 par_vec_392 \\\n",
"0 0.0 ... -0.115819 0.023961 -0.130739 \n",
"1 0.0 ... -0.054351 0.023650 -0.165681 \n",
"0 0.0 ... -0.116468 0.023982 -0.130867 \n",
"1 0.0 ... -0.054949 0.024502 -0.166001 \n",
"2 0.0 ... -0.022804 0.001741 0.047479 \n",
"\n",
" par_vec_393 par_vec_394 par_vec_395 par_vec_396 par_vec_397 \\\n",
"0 0.006393 -0.135118 -0.071956 -0.051051 -0.068307 \n",
"1 -0.016137 -0.059402 0.008454 -0.044624 0.025160 \n",
"0 0.006825 -0.135098 -0.070616 -0.052172 -0.067250 \n",
"1 -0.014375 -0.058199 0.009978 -0.046423 0.025163 \n",
"2 0.118293 -0.093435 0.036759 -0.004508 -0.087898 \n",
"\n",
" par_vec_398 par_vec_399 \n",
"0 0.087342 -0.145716 \n",
"1 0.037831 -0.086235 \n",
"0 0.086256 -0.144385 \n",
"1 0.036946 -0.086611 \n",
"2 -0.117796 -0.191386 \n",
"\n",
"[2 rows x 1588 columns]"
"[3 rows x 1588 columns]"
]
},
"execution_count": 38,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_vector"
"feature_vectors"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52047a6b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -343,18 +375,18 @@
"id": "9027fa4a",
"metadata": {},
"source": [
"## Initialize Sherlock."
"## Initialize Sherlock"
]
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 11,
"id": "b9ec13ec",
"metadata": {},
"outputs": [],
"source": [
"model = SherlockModel();\n",
"model.initialize_model_from_json(with_weights=True);"
"model.initialize_model_from_json(with_weights=True, model_id=\"sherlock\");"
]
},
{
Expand All @@ -375,27 +407,27 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 12,
"id": "fc079fa9",
"metadata": {},
"outputs": [],
"source": [
"predicted_labels = model.predict(feature_vector, \"sherlock\")"
"predicted_labels = model.predict(feature_vectors, \"sherlock\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 13,
"id": "0feb9584",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['creator', 'city'], dtype=object)"
"array(['person', 'city', 'address'], dtype=object)"
]
},
"execution_count": 41,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
Loading

0 comments on commit 5b3ac69

Please sign in to comment.