diff --git a/docs/docs/.gitignore b/docs/docs/.gitignore index 25a6e30a4b775..e586a74dfb131 100644 --- a/docs/docs/.gitignore +++ b/docs/docs/.gitignore @@ -4,4 +4,5 @@ node_modules/ .docusaurus .cache-loader -docs/api \ No newline at end of file +docs/api +example.sqlite diff --git a/docs/docs/how_to/document_loader_sql_database.ipynb b/docs/docs/how_to/document_loader_sql_database.ipynb new file mode 100644 index 0000000000000..9b3fe41df43fa --- /dev/null +++ b/docs/docs/how_to/document_loader_sql_database.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SQL Database\n", + "\n", + "## About\n", + "\n", + "The `SQLDatabaseLoader` loads records from any database supported by\n", + "[SQLAlchemy], see [SQLAlchemy dialects] for the whole list of supported\n", + "SQL databases and dialects.\n", + "\n", + "For talking to the database, the document loader uses the [SQLDatabase]\n", + "utility from the LangChain integration toolkit.\n", + "\n", + "You can either use plain SQL for querying, or use an SQLAlchemy `Select`\n", + "statement object, if you are using SQLAlchemy-Core or -ORM.\n", + "\n", + "You can select which columns to place into the document, which columns\n", + "to place into its metadata, which columns to use as a `source` attribute\n", + "in metadata, and whether to include the result row number and/or the SQL\n", + "query expression into the metadata.\n", + "\n", + "## What's inside\n", + "\n", + "This notebook covers how to load documents from an [SQLite] database,\n", + "using the [SQLAlchemy] document loader.\n", + "\n", + "It loads the result of a database query with one document per row.\n", + "\n", + "[SQLAlchemy]: https://www.sqlalchemy.org/\n", + "[SQLAlchemy dialects]: https://docs.sqlalchemy.org/en/latest/dialects/\n", + "[SQLDatabase]: https://python.langchain.com/docs/integrations/toolkits/sql_database\n", + "[SQLite]: https://sqlite.org/\n" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "collapsed": false + }, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!pip install langchain langchain-community sqlalchemy termsql" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Populate SQLite database with example input data." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nationals|81.34|98\r\n", + "Reds|82.2|97\r\n", + "Yankees|197.96|95\r\n", + "Giants|117.62|94\r\n", + "Braves|83.31|94\r\n", + "Athletics|55.37|94\r\n", + "Rangers|120.51|93\r\n", + "Orioles|81.43|93\r\n", + "Rays|64.17|90\r\n", + "Angels|154.49|89\r\n", + "Tigers|132.3|88\r\n", + "Cardinals|110.3|88\r\n", + "Dodgers|95.14|86\r\n", + "White Sox|96.92|85\r\n", + "Brewers|97.65|83\r\n", + "Phillies|174.54|81\r\n", + "Diamondbacks|74.28|81\r\n", + "Pirates|63.43|79\r\n", + "Padres|55.24|76\r\n", + "Mariners|81.97|75\r\n", + "Mets|93.35|74\r\n", + "Blue Jays|75.48|73\r\n", + "Royals|60.91|72\r\n", + "Marlins|118.07|69\r\n", + "Red Sox|173.18|69\r\n", + "Indians|78.43|68\r\n", + "Twins|94.08|66\r\n", + "Rockies|78.06|64\r\n", + "Cubs|88.19|61\r\n", + "Astros|60.65|55\r\n", + "||\r\n" + ] + } + ], + "source": [ + "!termsql --infile=./example_data/mlb_teams_2012.csv --head --csv --outfile=example.sqlite --table=payroll" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "## Basic usage" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "from langchain_community.document_loaders import SQLDatabaseLoader\n", + "\n", + "loader = SQLDatabaseLoader(\n", + " \"SELECT * FROM payroll LIMIT 2\",\n", + " 
url=\"sqlite:///example.sqlite\",\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nPayroll (millions): 81.34\\nWins: 98'),\n", + " Document(page_content='Team: Reds\\nPayroll (millions): 82.2\\nWins: 97')]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify which columns are content vs. metadata\n", + "\n", + "Use the `page_content_mapper` keyword argument to optionally customize how to derive\n", + "a page content string from an input database record / row. By default, all columns\n", + "will be used.\n", + "\n", + "Use the `metadata_mapper` keyword argument to optionally customize how to derive\n", + "a document metadata dictionary from an input database record / row. By default,\n", + "document metadata will be empty." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "import functools\n", + "\n", + "# Configure built-in page content mapper to include only specified columns.\n", + "row_to_content = functools.partial(\n", + " SQLDatabaseLoader.page_content_default_mapper, column_names=[\"Team\", \"Wins\"]\n", + ")\n", + "\n", + "# Configure built-in metadata dictionary mapper to include specified columns.\n", + "row_to_metadata = functools.partial(\n", + " SQLDatabaseLoader.metadata_default_mapper, column_names=[\"Payroll (millions)\"]\n", + ")\n", + "\n", + "loader = SQLDatabaseLoader(\n", + " \"SELECT * FROM payroll LIMIT 2\",\n", + " url=\"sqlite:///example.sqlite\",\n", + " page_content_mapper=row_to_content,\n", + " metadata_mapper=row_to_metadata,\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nWins: 98', metadata={'Payroll (millions)': 81.34}),\n", + " Document(page_content='Team: Reds\\nWins: 97', metadata={'Payroll (millions)': 82.2})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Instead of the built-in mappers, you can also pass your own functions to\n", + "define arbitrary mapping rules in Python code, for example:\n", + "```python\n", + "def page_content_mapper(row: sa.RowMapping, column_names: Optional[List[str]] = None) -> str:\n", + " return f\"Team: {row['Team']}\"\n", + "```\n", + "```python\n", + "def metadata_mapper(row: sa.RowMapping, column_names: Optional[List[str]] = None) -> Dict[str, Any]:\n", + " return {\"team\": row['Team']}\n", + "```" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Specify column(s) to identify the document source\n", + "\n", + "Use the `source_columns` option 
to specify the columns to use as a \"source\" for the\n", + "document created from each row. This is useful for identifying documents through\n", + "their metadata. Typically, you may use the primary key column(s) for that purpose." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "loader = SQLDatabaseLoader(\n", + " \"SELECT * FROM payroll LIMIT 2\",\n", + " url=\"sqlite:///example.sqlite\",\n", + " source_columns=[\"Team\"],\n", + ")\n", + "documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nPayroll (millions): 81.34\\nWins: 98', metadata={'source': 'Nationals'}),\n", + " Document(page_content='Team: Reds\\nPayroll (millions): 82.2\\nWins: 97', metadata={'source': 'Reds'})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Enrich metadata with row number and/or original SQL query\n", + "\n", + "Use the `include_rownum_into_metadata` and `include_query_into_metadata` options to\n", + "optionally populate the `metadata` dictionary with corresponding information.\n", + "\n", + "Having the `query` within metadata is useful when using documents loaded from\n", + "database tables for chains that answer questions using their origin queries." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 49, + "outputs": [], + "source": [ + "loader = SQLDatabaseLoader(\n", + " \"SELECT * FROM payroll LIMIT 2\",\n", + " url=\"sqlite:///example.sqlite\",\n", + " include_rownum_into_metadata=True,\n", + " include_query_into_metadata=True,\n", + ")\n", + "documents = loader.load()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 50, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Document(page_content='Team: Nationals\\nPayroll (millions): 81.34\\nWins: 98', metadata={'row': 0, 'query': 'SELECT * FROM payroll LIMIT 2'}),\n", + " Document(page_content='Team: Reds\\nPayroll (millions): 82.2\\nWins: 97', metadata={'row': 1, 'query': 'SELECT * FROM payroll LIMIT 2'})]\n" + ] + } + ], + "source": [ + "pprint(documents)" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/docs/how_to/document_loader_sql_database.mdx b/docs/docs/how_to/document_loader_sql_database.mdx new file mode 100644 index 0000000000000..1ecdeda75b307 --- /dev/null +++ b/docs/docs/how_to/document_loader_sql_database.mdx @@ -0,0 +1,165 @@ +# SQLDatabaseLoader + + +## About + +The `SQLDatabaseLoader` loads records from any database supported by +[SQLAlchemy], see [SQLAlchemy dialects] for the whole list of supported +SQL databases and dialects. + +You can either use plain SQL for querying, or use an SQLAlchemy `Select` +statement object, if you are using SQLAlchemy-Core or -ORM. 
+ +You can select which columns to place into the document, which columns +to place into its metadata, which columns to use as a `source` attribute +in metadata, and whether to include the result row number and/or the SQL +query expression in the metadata. + + +## Example + +This example uses PostgreSQL and the `psycopg2` driver. + + +### Prerequisites + +```shell +psql postgresql://postgres@localhost/ --command "CREATE DATABASE testdrive;" +psql postgresql://postgres@localhost/testdrive < ./libs/langchain/tests/integration_tests/examples/mlb_teams_2012.sql +``` + + +### Basic loading + +```python +from langchain_community.document_loaders.sql_database import SQLDatabaseLoader +from pprint import pprint + + +loader = SQLDatabaseLoader( + query="SELECT * FROM mlb_teams_2012 LIMIT 3;", + url="postgresql+psycopg2://postgres@localhost:5432/testdrive", +) +docs = loader.load() +``` + +```python +pprint(docs) +``` + + + +``` +[Document(page_content='Team: Nationals\nPayroll (millions): 81.34\nWins: 98', metadata={}), + Document(page_content='Team: Reds\nPayroll (millions): 82.2\nWins: 97', metadata={}), + Document(page_content='Team: Yankees\nPayroll (millions): 197.96\nWins: 95', metadata={})] +``` + + + + +## Enriching metadata + +Use the `include_rownum_into_metadata` and `include_query_into_metadata` options to +optionally populate the `metadata` dictionary with corresponding information. + +Having the `query` within metadata is useful when using documents loaded from +database tables for chains that answer questions using their origin queries. 
+ +```python +loader = SQLDatabaseLoader( + query="SELECT * FROM mlb_teams_2012 LIMIT 3;", + url="postgresql+psycopg2://postgres@localhost:5432/testdrive", + include_rownum_into_metadata=True, + include_query_into_metadata=True, +) +docs = loader.load() +``` + +```python +pprint(docs) +``` + + + +``` +[Document(page_content='Team: Nationals\nPayroll (millions): 81.34\nWins: 98', metadata={'row': 0, 'query': 'SELECT * FROM mlb_teams_2012 LIMIT 3;'}), + Document(page_content='Team: Reds\nPayroll (millions): 82.2\nWins: 97', metadata={'row': 1, 'query': 'SELECT * FROM mlb_teams_2012 LIMIT 3;'}), + Document(page_content='Team: Yankees\nPayroll (millions): 197.96\nWins: 95', metadata={'row': 2, 'query': 'SELECT * FROM mlb_teams_2012 LIMIT 3;'})] +``` + + + + +## Customizing metadata + +Use the `page_content_mapper` and `metadata_mapper` options to customize how the page +content and the `metadata` dictionary are derived from each row. By default, all columns +are used for the page content, and the metadata is empty. 
+ +```python +import functools + +row_to_content = functools.partial( + SQLDatabaseLoader.page_content_default_mapper, column_names=["Payroll (millions)", "Wins"] +) +row_to_metadata = functools.partial( + SQLDatabaseLoader.metadata_default_mapper, column_names=["Team"] +) + +loader = SQLDatabaseLoader( + query="SELECT * FROM mlb_teams_2012 LIMIT 3;", + url="postgresql+psycopg2://postgres@localhost:5432/testdrive", + page_content_mapper=row_to_content, + metadata_mapper=row_to_metadata, +) +docs = loader.load() +``` + +```python +pprint(docs) +``` + + + +``` +[Document(page_content='Payroll (millions): 81.34\nWins: 98', metadata={'Team': 'Nationals'}), + Document(page_content='Payroll (millions): 82.2\nWins: 97', metadata={'Team': 'Reds'}), + Document(page_content='Payroll (millions): 197.96\nWins: 95', metadata={'Team': 'Yankees'})] +``` + + + + +## Specify column(s) to identify the document source + +Use the `source_columns` option to specify the columns to use as a "source" for the +document created from each row. This is useful for identifying documents through +their metadata. Typically, you may use the primary key column(s) for that purpose. 
+ +```python +loader = SQLDatabaseLoader( + query="SELECT * FROM mlb_teams_2012 LIMIT 3;", + url="postgresql+psycopg2://postgres@localhost:5432/testdrive", + source_columns=["Team"], +) +docs = loader.load() +``` + +```python +pprint(docs) +``` + + + +``` +[Document(page_content='Team: Nationals\nPayroll (millions): 81.34\nWins: 98', metadata={'source': 'Nationals'}), + Document(page_content='Team: Reds\nPayroll (millions): 82.2\nWins: 97', metadata={'source': 'Reds'}), + Document(page_content='Team: Yankees\nPayroll (millions): 197.96\nWins: 95', metadata={'source': 'Yankees'})] +``` + + + + +[SQLAlchemy]: https://www.sqlalchemy.org/ +[SQLAlchemy dialects]: https://docs.sqlalchemy.org/en/20/dialects/ diff --git a/libs/community/tests/unit_tests/test_sql_database.py b/libs/community/tests/unit_tests/test_sql_database.py index 6acb734a54309..d7795d59badae 100644 --- a/libs/community/tests/unit_tests/test_sql_database.py +++ b/libs/community/tests/unit_tests/test_sql_database.py @@ -55,6 +55,12 @@ def db_lazy_reflection(engine: Engine) -> SQLDatabase: return SQLDatabase(engine, lazy_table_reflection=True) +@pytest.mark.xfail(is_sqlalchemy_v1, reason="SQLAlchemy 1.x issues") +def test_configure_mappers() -> None: + """Test that configuring table mappers works.""" + sa.orm.configure_mappers() + + @pytest.mark.xfail(is_sqlalchemy_v1, reason="SQLAlchemy 1.x issues") def test_table_info(db: SQLDatabase) -> None: """Test that table info is constructed properly."""