Skip to content

Commit

Permalink
notebook changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Andrey Vykhodtsev committed Feb 9, 2024
1 parent d764baa commit 4b1b6a8
Show file tree
Hide file tree
Showing 12 changed files with 1,241 additions and 979 deletions.
121 changes: 57 additions & 64 deletions 01-Load-Data-ACogSearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -255,51 +255,51 @@
" }\n",
" ]\n",
" },\n",
" {\n",
" \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
" \"context\": \"/document/pages/*\",\n",
" \"maxKeyPhraseCount\": 2,\n",
" \"defaultLanguageCode\": \"en\",\n",
" \"inputs\": [\n",
" {\n",
" \"name\": \"text\", \n",
" \"source\": \"/document/pages/*\"\n",
" }\n",
" ],\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"keyPhrases\",\n",
" \"targetName\": \"keyPhrases\"\n",
" }\n",
" ]\n",
" },\n",
" {\n",
" \"@odata.type\": \"#Microsoft.Skills.Text.V3.EntityRecognitionSkill\",\n",
" \"context\": \"/document/pages/*\",\n",
" \"categories\": [\"Person\", \"URL\", \"Email\"],\n",
" \"minimumPrecision\": 0.5, \n",
" \"defaultLanguageCode\": \"en\",\n",
" \"inputs\": [\n",
" {\n",
" \"name\": \"text\", \n",
" \"source\":\"/document/pages/*\"\n",
" }\n",
" ],\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"persons\", \n",
" \"targetName\": \"persons\"\n",
" },\n",
" {\n",
" \"name\": \"urls\", \n",
" \"targetName\": \"urls\"\n",
" },\n",
" {\n",
" \"name\": \"emails\", \n",
" \"targetName\": \"emails\"\n",
" }\n",
" ]\n",
" }\n",
" # {\n",
" # \"@odata.type\": \"#Microsoft.Skills.Text.KeyPhraseExtractionSkill\",\n",
" # \"context\": \"/document/pages/*\",\n",
" # \"maxKeyPhraseCount\": 2,\n",
" # \"defaultLanguageCode\": \"en\",\n",
" # \"inputs\": [\n",
" # {\n",
" # \"name\": \"text\", \n",
" # \"source\": \"/document/pages/*\"\n",
" # }\n",
" # ],\n",
" # \"outputs\": [\n",
" # {\n",
" # \"name\": \"keyPhrases\",\n",
" # \"targetName\": \"keyPhrases\"\n",
" # }\n",
" # ]\n",
" # },\n",
" # {\n",
" # \"@odata.type\": \"#Microsoft.Skills.Text.V3.EntityRecognitionSkill\",\n",
" # \"context\": \"/document/pages/*\",\n",
" # \"categories\": [\"Person\", \"URL\", \"Email\"],\n",
" # \"minimumPrecision\": 0.5, \n",
" # \"defaultLanguageCode\": \"en\",\n",
" # \"inputs\": [\n",
" # {\n",
" # \"name\": \"text\", \n",
" # \"source\":\"/document/pages/*\"\n",
" # }\n",
" # ],\n",
" # \"outputs\": [\n",
" # {\n",
" # \"name\": \"persons\", \n",
" # \"targetName\": \"persons\"\n",
" # },\n",
" # {\n",
" # \"name\": \"urls\", \n",
" # \"targetName\": \"urls\"\n",
" # },\n",
" # {\n",
" # \"name\": \"emails\", \n",
" # \"targetName\": \"emails\"\n",
" # }\n",
" # ]\n",
" # }\n",
" ],\n",
" \"cognitiveServices\": {\n",
" \"@odata.type\": \"#Microsoft.Azure.Search.CognitiveServicesByKey\",\n",
Expand Down Expand Up @@ -548,7 +548,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 19,
"metadata": {
"tags": []
},
Expand All @@ -557,10 +557,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"200\n",
"Status: inProgress\n",
"Items Processed: 400\n",
"True\n"
"{'status': 'inProgress', 'statusDetail': None, 'errorMessage': None, 'startTime': '2024-01-28T10:33:09.321Z', 'endTime': None, 'itemsProcessed': 4000, 'itemsFailed': 0, 'initialTrackingState': None, 'finalTrackingState': '{\\r\\n \"lastFullEnumerationStartTime\": \"0001-01-01T00:00:00Z\",\\r\\n \"lastAttemptedEnumerationStartTime\": \"2024-01-28T10:33:09.602Z\",\\r\\n \"nameHighWaterMark\": \"https://datasetsgptsmartsearch.blob.core.windows.net/arxivcs/pdf/0501/0501020v1.pdf\"\\r\\n}', 'mode': 'indexingAllDocs', 'errors': [], 'warnings': [{'key': 'localId=0402023v1.pdf&documentKey=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf', 'name': 'DocumentExtraction.azureblob.0402023v1.pdf', 'message': 'Could not extract content or metadata from your document. ', 'details': \"Document has unsupported content type 'application/x-gtar'. Blob metadata was indexed, but content extraction was skipped.\", 'documentationLink': 'https://go.microsoft.com/fwlink/?linkid=2104227'}, {'key': 'localId=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf&documentKey=https%3a%2f%2fdatasetsgptsmartsearch.blob.core.windows.net%2farxivcs%2fpdf%2f0402%2f0402023v1.pdf', 'name': 'Enrichment.SplitSkill.#3', 'message': 'Could not execute skill because one or more skill input was invalid.', 'details': \"Required skill input is missing or empty. Name: 'text', Source: '$(/document/merged_text)'.\", 'documentationLink': 'https://go.microsoft.com/fwlink/?linkid=2106385'}], 'metrics': None}\n"
]
}
],
Expand All @@ -570,10 +567,11 @@
" r = requests.get(os.environ['AZURE_SEARCH_ENDPOINT'] + \"/indexers/\" + indexer_name +\n",
" \"/status\", headers=headers, params=params)\n",
" # pprint(json.dumps(r.json(), indent=1))\n",
" print(r.status_code)\n",
" print(\"Status:\",r.json().get('lastResult').get('status'))\n",
" print(\"Items Processed:\",r.json().get('lastResult').get('itemsProcessed'))\n",
" print(r.ok)\n",
" #print(r.status_code)\n",
" #print(\"Status:\",r.json().get('lastResult').get('status'))\n",
" #print(\"Items Processed:\",r.json().get('lastResult').get('itemsProcessed'))\n",
" #print(r.ok)\n",
" print(r.json().get('lastResult'))\n",
" \n",
"except Exception as e:\n",
" print(\"Wait a few seconds until the process starts and run this cell again.\")"
Expand Down Expand Up @@ -613,7 +611,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -695,9 +693,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK v2",
"display_name": ".venv",
"language": "python",
"name": "python310-sdkv2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -709,12 +707,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"vscode": {
"interpreter": {
"hash": "9ff083f0c83558f9261023d47a77b9b3eb892c62cdbe066d046abcad1a5edb5c"
}
"version": "3.11.7"
}
},
"nbformat": 4,
Expand Down
115 changes: 64 additions & 51 deletions 02-LoadCSVOneToMany-ACogSearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"201\n",
"204\n",
"True\n"
]
}
Expand Down Expand Up @@ -123,7 +123,22 @@
"execution_count": 5,
"id": "2fbbbd0d-3015-4601-9ef1-7008ad168167",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/mf/1n_x1d_51fs2m4_6tj03p9jm0000gn/T/ipykernel_89913/3169803804.py:2: DeprecationWarning: \n",
"Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),\n",
"(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)\n",
"but was not found to be installed on your system.\n",
"If this would cause problems for you,\n",
"please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466\n",
" \n",
" import pandas as pd\n"
]
}
],
"source": [
"#Download the csv files to disk and inspect using pandas\n",
"import pandas as pd\n",
Expand All @@ -148,69 +163,69 @@
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_87464\">\n",
"<table id=\"T_18016\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
" <th id=\"T_87464_level0_col0\" class=\"col_heading level0 col0\" >cord_uid</th>\n",
" <th id=\"T_87464_level0_col1\" class=\"col_heading level0 col1\" >source_x</th>\n",
" <th id=\"T_87464_level0_col2\" class=\"col_heading level0 col2\" >title</th>\n",
" <th id=\"T_87464_level0_col3\" class=\"col_heading level0 col3\" >abstract</th>\n",
" <th id=\"T_87464_level0_col4\" class=\"col_heading level0 col4\" >authors</th>\n",
" <th id=\"T_87464_level0_col5\" class=\"col_heading level0 col5\" >url</th>\n",
" <th id=\"T_18016_level0_col0\" class=\"col_heading level0 col0\" >cord_uid</th>\n",
" <th id=\"T_18016_level0_col1\" class=\"col_heading level0 col1\" >source_x</th>\n",
" <th id=\"T_18016_level0_col2\" class=\"col_heading level0 col2\" >title</th>\n",
" <th id=\"T_18016_level0_col3\" class=\"col_heading level0 col3\" >abstract</th>\n",
" <th id=\"T_18016_level0_col4\" class=\"col_heading level0 col4\" >authors</th>\n",
" <th id=\"T_18016_level0_col5\" class=\"col_heading level0 col5\" >url</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_87464_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
" <td id=\"T_87464_row0_col0\" class=\"data row0 col0\" >ug7v899j</td>\n",
" <td id=\"T_87464_row0_col1\" class=\"data row0 col1\" >PMC</td>\n",
" <td id=\"T_87464_row0_col2\" class=\"data row0 col2\" >Clinical features of culture-p...</td>\n",
" <td id=\"T_87464_row0_col3\" class=\"data row0 col3\" >OBJECTIVE: This retrospective ...</td>\n",
" <td id=\"T_87464_row0_col4\" class=\"data row0 col4\" >Madani, Tariq A; Al-Ghamdi, Ai...</td>\n",
" <td id=\"T_87464_row0_col5\" class=\"data row0 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/</a></td>\n",
" <th id=\"T_18016_level0_row0\" class=\"row_heading level0 row0\" >0</th>\n",
" <td id=\"T_18016_row0_col0\" class=\"data row0 col0\" >ug7v899j</td>\n",
" <td id=\"T_18016_row0_col1\" class=\"data row0 col1\" >PMC</td>\n",
" <td id=\"T_18016_row0_col2\" class=\"data row0 col2\" >Clinical features of culture-p...</td>\n",
" <td id=\"T_18016_row0_col3\" class=\"data row0 col3\" >OBJECTIVE: This retrospective ...</td>\n",
" <td id=\"T_18016_row0_col4\" class=\"data row0 col4\" >Madani, Tariq A; Al-Ghamdi, Ai...</td>\n",
" <td id=\"T_18016_row0_col5\" class=\"data row0 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC35282/</a></td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_87464_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
" <td id=\"T_87464_row1_col0\" class=\"data row1 col0\" >02tnwd4m</td>\n",
" <td id=\"T_87464_row1_col1\" class=\"data row1 col1\" >PMC</td>\n",
" <td id=\"T_87464_row1_col2\" class=\"data row1 col2\" >Nitric oxide: a pro-inflammato...</td>\n",
" <td id=\"T_87464_row1_col3\" class=\"data row1 col3\" >Inflammatory diseases of the r...</td>\n",
" <td id=\"T_87464_row1_col4\" class=\"data row1 col4\" >Vliet, Albert van der; Eiseric...</td>\n",
" <td id=\"T_87464_row1_col5\" class=\"data row1 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/</a></td>\n",
" <th id=\"T_18016_level0_row1\" class=\"row_heading level0 row1\" >1</th>\n",
" <td id=\"T_18016_row1_col0\" class=\"data row1 col0\" >02tnwd4m</td>\n",
" <td id=\"T_18016_row1_col1\" class=\"data row1 col1\" >PMC</td>\n",
" <td id=\"T_18016_row1_col2\" class=\"data row1 col2\" >Nitric oxide: a pro-inflammato...</td>\n",
" <td id=\"T_18016_row1_col3\" class=\"data row1 col3\" >Inflammatory diseases of the r...</td>\n",
" <td id=\"T_18016_row1_col4\" class=\"data row1 col4\" >Vliet, Albert van der; Eiseric...</td>\n",
" <td id=\"T_18016_row1_col5\" class=\"data row1 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59543/</a></td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_87464_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
" <td id=\"T_87464_row2_col0\" class=\"data row2 col0\" >ejv2xln0</td>\n",
" <td id=\"T_87464_row2_col1\" class=\"data row2 col1\" >PMC</td>\n",
" <td id=\"T_87464_row2_col2\" class=\"data row2 col2\" >Surfactant protein-D and pulmo...</td>\n",
" <td id=\"T_87464_row2_col3\" class=\"data row2 col3\" >Surfactant protein-D (SP-D) pa...</td>\n",
" <td id=\"T_87464_row2_col4\" class=\"data row2 col4\" >Crouch, Erika C...</td>\n",
" <td id=\"T_87464_row2_col5\" class=\"data row2 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/</a></td>\n",
" <th id=\"T_18016_level0_row2\" class=\"row_heading level0 row2\" >2</th>\n",
" <td id=\"T_18016_row2_col0\" class=\"data row2 col0\" >ejv2xln0</td>\n",
" <td id=\"T_18016_row2_col1\" class=\"data row2 col1\" >PMC</td>\n",
" <td id=\"T_18016_row2_col2\" class=\"data row2 col2\" >Surfactant protein-D and pulmo...</td>\n",
" <td id=\"T_18016_row2_col3\" class=\"data row2 col3\" >Surfactant protein-D (SP-D) pa...</td>\n",
" <td id=\"T_18016_row2_col4\" class=\"data row2 col4\" >Crouch, Erika C...</td>\n",
" <td id=\"T_18016_row2_col5\" class=\"data row2 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59549/</a></td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_87464_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
" <td id=\"T_87464_row3_col0\" class=\"data row3 col0\" >2b73a28n</td>\n",
" <td id=\"T_87464_row3_col1\" class=\"data row3 col1\" >PMC</td>\n",
" <td id=\"T_87464_row3_col2\" class=\"data row3 col2\" >Role of endothelin-1 in lung d...</td>\n",
" <td id=\"T_87464_row3_col3\" class=\"data row3 col3\" >Endothelin-1 (ET-1) is a 21 am...</td>\n",
" <td id=\"T_87464_row3_col4\" class=\"data row3 col4\" >Fagan, Karen A; McMurtry, Ivan...</td>\n",
" <td id=\"T_87464_row3_col5\" class=\"data row3 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/</a></td>\n",
" <th id=\"T_18016_level0_row3\" class=\"row_heading level0 row3\" >3</th>\n",
" <td id=\"T_18016_row3_col0\" class=\"data row3 col0\" >2b73a28n</td>\n",
" <td id=\"T_18016_row3_col1\" class=\"data row3 col1\" >PMC</td>\n",
" <td id=\"T_18016_row3_col2\" class=\"data row3 col2\" >Role of endothelin-1 in lung d...</td>\n",
" <td id=\"T_18016_row3_col3\" class=\"data row3 col3\" >Endothelin-1 (ET-1) is a 21 am...</td>\n",
" <td id=\"T_18016_row3_col4\" class=\"data row3 col4\" >Fagan, Karen A; McMurtry, Ivan...</td>\n",
" <td id=\"T_18016_row3_col5\" class=\"data row3 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/</a></td>\n",
" </tr>\n",
" <tr>\n",
" <th id=\"T_87464_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
" <td id=\"T_87464_row4_col0\" class=\"data row4 col0\" >9785vg6d</td>\n",
" <td id=\"T_87464_row4_col1\" class=\"data row4 col1\" >PMC</td>\n",
" <td id=\"T_87464_row4_col2\" class=\"data row4 col2\" >Gene expression in epithelial ...</td>\n",
" <td id=\"T_87464_row4_col3\" class=\"data row4 col3\" >Respiratory syncytial virus (R...</td>\n",
" <td id=\"T_87464_row4_col4\" class=\"data row4 col4\" >Domachowske, Joseph B; Bonvill...</td>\n",
" <td id=\"T_87464_row4_col5\" class=\"data row4 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/</a></td>\n",
" <th id=\"T_18016_level0_row4\" class=\"row_heading level0 row4\" >4</th>\n",
" <td id=\"T_18016_row4_col0\" class=\"data row4 col0\" >9785vg6d</td>\n",
" <td id=\"T_18016_row4_col1\" class=\"data row4 col1\" >PMC</td>\n",
" <td id=\"T_18016_row4_col2\" class=\"data row4 col2\" >Gene expression in epithelial ...</td>\n",
" <td id=\"T_18016_row4_col3\" class=\"data row4 col3\" >Respiratory syncytial virus (R...</td>\n",
" <td id=\"T_18016_row4_col4\" class=\"data row4 col4\" >Domachowske, Joseph B; Bonvill...</td>\n",
" <td id=\"T_18016_row4_col5\" class=\"data row4 col5\" ><a href=\"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/\">https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59580/</a></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
],
"text/plain": [
"<pandas.io.formats.style.Styler at 0x7f36d4016ad0>"
"<pandas.io.formats.style.Styler at 0x10c683b90>"
]
},
"execution_count": 6,
Expand Down Expand Up @@ -452,9 +467,7 @@
"output_type": "stream",
"text": [
"200\n",
"Status: inProgress\n",
"Items Processed: 15000\n",
"True\n"
"Wait a few seconds until the process starts and run this cell again.\n"
]
}
],
Expand Down Expand Up @@ -582,9 +595,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10 - SDK v2",
"display_name": ".venv",
"language": "python",
"name": "python310-sdkv2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -596,7 +609,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 4b1b6a8

Please sign in to comment.