From 63439b3900a424c5991395065e1cc78824cea939 Mon Sep 17 00:00:00 2001
From: John Rouillard
Date: Thu, 19 Oct 2023 16:11:25 -0400
Subject: [PATCH] fix: out of memory error when importing under postgresql

If you try importing more than 20k items under postgresql you can run
out of memory:

  psycopg2.errors.OutOfMemory: out of shared memory
  HINT: You might need to increase max_locks_per_transaction.

Tuning memory may help, but that is unknown at this point.

This checkin forces a commit to the postgres database after 10,000
rows have been added. This clears out the savepoints for each row and
starts a new transaction.

back_postgresql.py: Implement commit mechanism in checkpoint_data().
  Add two class level attributes for tracking the number of
  savepoints and the limit when the commit should happen.

admin.py: Implement the pragma and dynamically create the config item
  RDBMS_SAVEPOINT_LIMIT used by checkpoint_data(). Also fix the
  formatting of descriptions when using 'pragma list' in verbose mode.

admin_guide.txt, upgrading.txt: Document the change and the use of
  'pragma savepoint_limit' in roundup-admin for changing the default
  of 10,000.

test/db_test_base.py: Add some more asserts. In the existing
  testAdminImportExport, set the savepoint limit to 5 to test the
  setting method and so that the commit code will be run by existing
  tests. This provides coverage, but does not actually test that the
  commit is done every 5 savepoints 8-(. The verification of every 5
  savepoints was done manually using a pdb breakpoint just before the
  commit.

acknowledgements.txt: Added a 2.4.0 section mentioning Norbert as he
  has done a ton of testing with much larger datasets than I can test
  with.
---
 CHANGES.txt                         |  3 ++
 doc/acknowledgements.txt            | 18 +++++++++
 doc/admin_guide.txt                 | 10 +++++
 doc/upgrading.txt                   | 14 +++++++
 roundup/admin.py                    | 44 +++++++++++++-------
 roundup/backends/back_postgresql.py | 62 ++++++++++++++++++++++++-----
 test/db_test_base.py                | 14 +++++++
 7 files changed, 140 insertions(+), 25 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 720efca1..a1f4e050 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -59,6 +59,9 @@ Fixed:
 - Fix error handling so failure during import of a non-user item
   doesn't cause a second traceback. (Found by Norbert Schlemmer, fix
   John Rouillard)
+- Handle out of memory error when importing large trackers in
+  PostgreSQL. (Found by Norbert Schlemmer, extensive testing by
+  Norbert, fix John Rouillard)
 
 Features:
 
diff --git a/doc/acknowledgements.txt b/doc/acknowledgements.txt
index e5e307d4..5277e90a 100644
--- a/doc/acknowledgements.txt
+++ b/doc/acknowledgements.txt
@@ -16,6 +16,24 @@ ideas and everything else that helped!
 
 .. _`Announcement with changelog for current release.`: announcement.html
 
+2.4
+---
+
+2.4.0
+~~~~~
+
+Maintainer: John Rouillard
+
+Release Manager: John Rouillard
+
+Developer activity by changesets::
+
+  TBD
+
+Other contributors
+
+Norbert Schlemmer
+
 2.3
 ---
 
diff --git a/doc/admin_guide.txt b/doc/admin_guide.txt
index 523b731a..9b06239a 100644
--- a/doc/admin_guide.txt
+++ b/doc/admin_guide.txt
@@ -962,6 +962,16 @@ Migrating Backends
    move the new tracker home into its place.
 9. Restart web and email frontends.
 
+If you are importing into PostgreSQL, Roundup commits the data every
+10,000 objects/rows by default. This can slow down importing, but it
+prevents an out of memory error caused by using a savepoint for each
+object. You can control the commit frequency by using::
+
+   pragma savepoint_limit=20000
+
+to set a higher or lower number in roundup-admin. In this example a
+commit will be done every 20,000 objects/rows. The pragma can also be
+set on the roundup-admin command line as described below.
 
 Moving a Tracker
 ----------------
diff --git a/doc/upgrading.txt b/doc/upgrading.txt
index 31e04632..cf225212 100644
--- a/doc/upgrading.txt
+++ b/doc/upgrading.txt
@@ -141,6 +141,20 @@
 It is unlikey that you will care unless you have done some expert
 level Roundup customization. If you have, use one of the imports
 above if you plan on running on Python 3.13 (expected in 2024) or
 newer.
 
+Fixing PostgreSQL Out of Memory Errors when Importing Tracker (info)
+--------------------------------------------------------------------
+
+Importing a tracker into PostgreSQL can run out of memory with the
+error::
+
+  psycopg2.errors.OutOfMemory: out of shared memory
+  HINT: You might need to increase max_locks_per_transaction.
+
+Before changing your PostgreSQL configuration, try changing the pragma
+``savepoint_limit`` to a lower value. By default it is set to
+``10000``. In some cases this may be too high. See the `administration
+guide`_ for further details.
+
 .. index:: Upgrading; 2.2.0 to 2.3.0
 
 Migrating from 2.2.0 to 2.3.0
diff --git a/roundup/admin.py b/roundup/admin.py
index 7902ed47..7d38db32 100644
--- a/roundup/admin.py
+++ b/roundup/admin.py
@@ -35,8 +35,9 @@ from roundup import date, hyperdb, init, password, token_r
 from roundup import __version__ as roundup_version
 import roundup.instance
-from roundup.configuration import (CoreConfig, NoConfigError, OptionUnsetError,
-                                   OptionValueError, ParsingOptionError, UserConfig)
+from roundup.configuration import (CoreConfig, NoConfigError, Option,
+                                   OptionUnsetError, OptionValueError,
+                                   ParsingOptionError, UserConfig)
 from roundup.i18n import _, get_translation
 from roundup.exceptions import UsageError
 from roundup.anypy.my_input import my_input
@@ -108,6 +109,7 @@ def __init__(self):
             'display_protected': False,
             'indexer_backend': "as set in config.ini",
             '_reopen_tracker': False,
+            'savepoint_limit': 10000,
             'show_retired': "no",
             '_retired_val': False,
             'verbose': False,
@@ -116,25 +118,29 @@
         }
         self.settings_help = {
             'display_header':
-                _("Have 'display designator[,designator*]' show header inside "
-                  " []'s before items. Includes retired/active status."),
+                _("Have 'display designator[,designator*]' show header inside\n"
+                  " []'s before items. Includes retired/active status.\n"),
             'display_protected':
-                _("Have 'display designator' and 'specification class' show "
-                  "protected fields: creator, id etc."),
+                _("Have 'display designator' and 'specification class' show\n"
+                  " protected fields: creator, id etc.\n"),
             'indexer_backend':
-                _("Set indexer to use when running 'reindex' NYI"),
+                _("Set indexer to use when running 'reindex' NYI\n"),
             '_reopen_tracker':
-                _("Force reopening of tracker when running each command."),
-
-            'show_retired': _("Show retired items in table, list etc. One of 'no', 'only', 'both'"),
-            '_retired_val': _("internal mapping for show_retired."),
-            'verbose': _("Enable verbose output: tracing, descriptions..."),
-
-            '_inttest': "Integer valued setting. For testing only.",
-            '_floattest': "Float valued setting. For testing only.",
+                _("Force reopening of tracker when running each command.\n"),
+
+            'savepoint_limit':
+                _("Set the number of rows imported before a database commit is\n"
+                  " done. Used only for imports on PostgreSQL.\n"),
+            'show_retired': _("Show retired items in table, list etc. "
" + "One of 'no', 'only', 'both'\n"), + '_retired_val': _("internal mapping for show_retired.\n"), + 'verbose': _("Enable verbose output: tracing, descriptions...\n"), + + '_inttest': "Integer valued setting. For testing only.\n", + '_floattest': "Float valued setting. For testing only.\n", } def get_class(self, classname): @@ -1049,6 +1055,14 @@ def do_import(self, args, import_files=True): if hasattr(csv, 'field_size_limit'): csv.field_size_limit(self.db.config.CSV_FIELD_SIZE) + # default value is 10000, only go through this if default + # is different. + if self.settings['savepoint_limit'] != 10000: + self.db.config.add_option(Option(self.db.config, + "rdbms", "savepoint_limit")) + self.db.config.options["RDBMS_SAVEPOINT_LIMIT"].set( + self.settings['savepoint_limit']) + # directory to import from dir = args[0] diff --git a/roundup/backends/back_postgresql.py b/roundup/backends/back_postgresql.py index 4da8c12f..09848f0c 100644 --- a/roundup/backends/back_postgresql.py +++ b/roundup/backends/back_postgresql.py @@ -152,12 +152,25 @@ class Database(rdbms_common.Database): holds the value for the type of db. It is used by indexer to identify the database type so it can import the correct indexer module when using native text search mode. + + import_savepoint_count: + count the number of savepoints that have been created during + import. Once the limit of savepoints is reached, a commit is + done and this is reset to 0. + """ arg = '%s' dbtype = "postgres" + import_savepoint_count = 0 + + # Value is set from roundup-admin using db.config["RDBMS_SAVEPOINT_LIMIT"] + # or to the default of 10_000 at runtime. Use 0 here to trigger + # initialization. + savepoint_limit = 0 + # used by some code to switch styles of query implements_intersect = 1 @@ -218,20 +231,49 @@ def checkpoint_data(self, savepoint="importing"): of operation in exception handler because postgres requires a rollback in case of error generating exception. Used with - restore_connecion_on_error to handle uniqueness + restore_connection_on_error to handle uniqueness conflict in import_table(). + + Savepoints take memory resources. Postgres keeps all + savepoints (rather than overwriting) until a + commit(). Commit every ~10,000 savepoints to prevent + running out of memory on import. + + NOTE: a commit outside of this method will not reset the + import_savepoint_count. This can result in an unneeded + commit on a new cursor (that has no savepoints) as there is + no way to find out if there is a savepoint or how many + savepoints are opened on a db connection/cursor. + + Because an import is a one shot deal and not part of a long + running daemon (e.g. the roundup-server), I am not too + worried about it. It will just slow the import down a tad. """ - # Savepoints take resources. Postgres keeps all - # savepoints (rather than overwriting) until a - # commit(). If an import fails because of a resource - # issue with savepoints, uncomment this line. I - # expect it will slow down the import but it should - # eliminate any issue with stored savepoints and - # resource use. - # - # self.sql('RELEASE SAVEPOINT %s' % savepoint) + self.sql('SAVEPOINT %s' % savepoint) + self.import_savepoint_count += 1 + + if not self.savepoint_limit: + if "RDBMS_SAVEPOINT_LIMIT" in self.config.keys(): + # note this config option is created on the fly + # by admin.py::do_import. It is never listed in + # config.ini. 
+                self.savepoint_limit = self.config["RDBMS_SAVEPOINT_LIMIT"]
+            else:
+                self.savepoint_limit = 10000
+
+        if self.import_savepoint_count > self.savepoint_limit:
+            # track savepoints and commit every 10000 (or user value)
+            # so we don't run postgres out of memory. An import of a
+            # customer's tracker ran out of memory after importing
+            # ~23000 items with: psycopg2.errors.OutOfMemory: out of
+            # shared memory HINT: You might need to increase
+            # max_locks_per_transaction.
+
+            self.commit()
+            self.import_savepoint_count = 0
+
     def restore_connection_on_error(self, savepoint="importing"):
         """Postgres leaves a connection/cursor in an unusable
         state after an error. Rollback the transaction to a
diff --git a/test/db_test_base.py b/test/db_test_base.py
index df913abb..29534deb 100644
--- a/test/db_test_base.py
+++ b/test/db_test_base.py
@@ -3061,6 +3061,7 @@ def testImportExport(self):
             self.db.commit()
 
             self.assertEqual(self.db.user.lookup("duplicate"), active_dupe_id)
+            self.assertEqual(self.db.user.is_retired(retired_dupe_id), True)
 
         finally:
             shutil.rmtree('_test_export')
@@ -3151,12 +3152,25 @@ def stderrwrite(s):
                 self.assertRaises(csv.Error, tool.do_import, ['_test_export'])
 
                 self.nukeAndCreate()
+
+                # make sure we have an empty db
+                with self.assertRaises(IndexError) as e:
+                    # users 1 and 2 are always created on schema load,
+                    # so don't use them.
+                    self.db.user.getnode("5").values()
+
                 self.db.config.CSV_FIELD_SIZE = 3200
                 tool = roundup.admin.AdminTool()
                 tool.tracker_home = home
                 tool.db = self.db
+                # Force import code to commit when more than 5
+                # savepoints have been created.
+                tool.settings['savepoint_limit'] = 5
                 tool.verbose = False
                 tool.do_import(['_test_export'])
+
+                # verify the data is loaded.
+                self.db.user.getnode("5").values()
             finally:
                 roundup.admin.sys = sys
                 shutil.rmtree('_test_export')
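
Reviewer note (not part of the patch): the savepoint-then-periodic-commit
pattern that checkpoint_data() implements above can be exercised outside
Roundup with a small psycopg2 script. This is only an illustrative sketch
under assumed names: the connection DSN and the ``items`` table are
hypothetical, and the limit simply mirrors the patch's 10,000 default
instead of reusing Roundup's config machinery::

  import psycopg2

  SAVEPOINT_LIMIT = 10000   # mirrors the default used by the patch

  conn = psycopg2.connect("dbname=tracker")   # hypothetical DSN
  cur = conn.cursor()

  savepoint_count = 0
  for i in range(25000):
      # One savepoint per imported row; PostgreSQL keeps every
      # savepoint of the open transaction until commit/rollback,
      # which is what eventually exhausts shared memory.
      cur.execute("SAVEPOINT importing")
      cur.execute("INSERT INTO items (id) VALUES (%s)", (i,))
      savepoint_count += 1

      if savepoint_count > SAVEPOINT_LIMIT:
          # Committing ends the transaction and releases all
          # accumulated savepoints; a new transaction starts
          # implicitly on the next execute().
          conn.commit()
          savepoint_count = 0

  conn.commit()
  cur.close()
  conn.close()

Lowering SAVEPOINT_LIMIT (or the ``savepoint_limit`` pragma in
roundup-admin) trades import speed for a smaller per-transaction
savepoint footprint.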