
Commit

Merge pull request #1784 from UlrichB22/import19_perf
import19: Add procs and limitmb options to increase performance
RogerHaase authored Oct 24, 2024
2 parents e6c53fc + 66e8b3d commit c9d4c50
Showing 5 changed files with 52 additions and 11 deletions.
8 changes: 8 additions & 0 deletions docs/admin/upgrade.rst
@@ -120,6 +120,14 @@
 directory will be converted to the "userprofiles" directory. The "userprofiles" directory
 contains data used internally and should always be protected from any access by ACLs.

+If you are importing a large wiki with more than 1000 entries or revisions, the index-building
+part of the import will be time-consuming. You can use the following options to speed up the process::
+
+    --procs <number of processors> --limitmb <memory in MB for each process>
+
+Choose the values according to your available hardware resources. The defaults are 1 process and 256 MB of memory.
+See the `Whoosh Tips for speeding up batch indexing docs <https://whoosh.readthedocs.io/en/latest/batch.html>`_ for details.
+
 Testing
 -------
 Review the logs for error messages. Start the moin server and try the "Index" and "History"
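For illustration, a complete import run using these options might look like the following (a hedged sketch: the data directory path is a placeholder, the procs/limitmb values are examples to tune to your hardware, and the --data_dir flag is assumed from the ImportMoin19 signature below):

    moin import19 --data_dir /path/to/moin1.9/data --procs 4 --limitmb 512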
7 changes: 5 additions & 2 deletions src/moin/cli/_util.py
@@ -35,16 +35,19 @@ def get_backends(backends: Optional[str], all_backends: bool) -> set[Backend]:
     return set()


-def drop_and_recreate_index(indexer):
+def drop_and_recreate_index(indexer, procs=None, limitmb=None, multisegment=False):
     """Drop index and recreate, rebuild and optimize
     :param indexer: IndexingMiddleware object
+    :param procs: Number of processors the writer will use.
+    :param limitmb: Maximum memory (in megabytes) each index-writer will use for the indexing pool
     """
     indexer.close()
     indexer.destroy()
     logging.debug("Create index")
     indexer.create()
     logging.debug("Rebuild index")
-    indexer.rebuild()
+    # the use of multisegment leads to one index segment per process, the optimize step merges them later
+    indexer.rebuild(procs=procs, limitmb=limitmb, multisegment=multisegment)
     logging.debug("Optimize index")
     indexer.optimize_index()
     indexer.open()
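For context, a minimal sketch of calling the updated helper outside the CLI (hedged: it assumes moin's create_app application factory can be constructed this way, and procs=4 / limitmb=512 are illustrative values, not defaults):

    from moin.app import create_app
    from moin.cli._util import drop_and_recreate_index

    app = create_app()  # assumed app-factory entry point
    with app.app_context():
        # app.storage is the IndexingMiddleware instance, as in import19.py below
        drop_and_recreate_index(app.storage, procs=4, limitmb=512, multisegment=True)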
4 changes: 2 additions & 2 deletions src/moin/cli/maint/index.py
@@ -68,13 +68,13 @@ def IndexDestroy(tmp):


@cli.command("index-build", help="Build the indexes")
@click.option("--procs", "-p", required=False, type=int, default=1, help="Number of processors the writer will use.")
@click.option("--procs", "-p", required=False, type=int, default=None, help="Number of processors the writer will use.")
@click.option(
"--limitmb",
"-l",
required=False,
type=int,
default=10,
default=None,
help="Maximum memory (in megabytes) each index-writer will use for the indexing pool.",
)
@click.option("--tmp", is_flag=True, required=False, default=False, help="use the temporary location.")
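For illustration, the same knobs can now be passed to a standalone index rebuild (example values; when omitted, the options default to None and fall back to 1 process and 256 MB inside _modify_index, shown in indexing.py below):

    moin index-build --procs 4 --limitmb 512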
13 changes: 11 additions & 2 deletions src/moin/cli/migration/moin19/import19.py
@@ -142,7 +142,16 @@ def migr_statistics(unknown_macros):
     default=NAMESPACE_DEFAULT,
     help="target namespace, e.g. used for members of a wikifarm.",
 )
-def ImportMoin19(data_dir=None, markup_out=None, namespace=None):
+@click.option("--procs", "-p", required=False, type=int, default=1, help="Number of processors the writer will use.")
+@click.option(
+    "--limitmb",
+    "-l",
+    required=False,
+    type=int,
+    default=256,
+    help="Maximum memory (in megabytes) each index-writer will use for the indexing pool.",
+)
+def ImportMoin19(data_dir=None, markup_out=None, namespace=None, procs=None, limitmb=None):
     """Import content and user data from a moin wiki with version 1.9"""

     target_namespace = namespace
@@ -263,7 +272,7 @@ def ImportMoin19(data_dir=None, markup_out=None, namespace=None):
         backend.store(meta, out)

     logging.info("PHASE4: Rebuilding the index ...")
-    drop_and_recreate_index(app.storage)
+    drop_and_recreate_index(app.storage, procs=procs, limitmb=limitmb, multisegment=True)

     logging.info("Finished conversion!")
     if hasattr(conv_out, "unknown_macro_list"):
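For background, these options feed straight into Whoosh's batch-indexing writer. A minimal sketch of the underlying call, based on the Whoosh documentation linked in the upgrade docs above (the index path and field name are placeholders):

    import whoosh.index

    ix = whoosh.index.open_dir("/path/to/index")  # placeholder path
    # multisegment=True writes one segment per process; a later optimize step merges them
    with ix.writer(procs=4, limitmb=512, multisegment=True) as writer:
        writer.add_document(name="example")  # field names depend on the actual schema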
31 changes: 26 additions & 5 deletions src/moin/storage/middleware/indexing.py
@@ -638,14 +638,19 @@ def remove_revision(self, revid, async_=True):
                 # this is no revision left in this item that could be the new "latest rev", just kill the rev
                 writer.delete_document(docnum_remove)

-    def _modify_index(self, index, schema, wikiname, revids, mode="add", procs=1, limitmb=256):
+    def _modify_index(self, index, schema, wikiname, revids, mode="add", procs=None, limitmb=None, multisegment=False):
         """
         modify index contents - add, update, delete the indexed documents for all given revids

         Note: mode == 'add' is faster but you need to make sure to not create duplicate
               documents in the index.
         """
-        with index.writer(procs=procs, limitmb=limitmb) as writer:
+        if procs is None:
+            procs = 1
+        if limitmb is None:
+            limitmb = 256
+        logging.info(f"Using options procs={procs}, limitmb={limitmb}, multisegment={multisegment}")
+        with index.writer(procs=procs, limitmb=limitmb, multisegment=multisegment) as writer:
             for backend_name, revid in revids:
                 if mode in ["add", "update"]:
                     meta, data = self.backend.retrieve(backend_name, revid)
@@ -680,7 +685,7 @@ def _find_latest_backends_revids(self, index, query=None):
         ]
         return latest_backends_revids

-    def rebuild(self, tmp=False, procs=1, limitmb=256):
+    def rebuild(self, tmp=False, procs=None, limitmb=None, multisegment=False):
         """
         Add all items/revisions from the backends of this wiki to the index
         (which is expected to have no items/revisions from this wiki yet).
@@ -694,7 +699,16 @@ def rebuild(self, tmp=False, procs=1, limitmb=256):
         try:
             # build an index of all we have (so we know what we have)
             all_revids = self.backend  # the backend is an iterator over all revids
-            self._modify_index(index, self.schemas[ALL_REVS], self.wikiname, all_revids, "add", procs, limitmb)
+            self._modify_index(
+                index,
+                self.schemas[ALL_REVS],
+                self.wikiname,
+                all_revids,
+                "add",
+                procs=procs,
+                limitmb=limitmb,
+                multisegment=multisegment,
+            )
             latest_backends_revids = self._find_latest_backends_revids(index)
         finally:
             index.close()
@@ -703,7 +717,14 @@
         index = storage.open_index(LATEST_REVS)
         try:
             self._modify_index(
-                index, self.schemas[LATEST_REVS], self.wikiname, latest_backends_revids, "add", procs, limitmb
+                index,
+                self.schemas[LATEST_REVS],
+                self.wikiname,
+                latest_backends_revids,
+                "add",
+                procs=procs,
+                limitmb=limitmb,
+                multisegment=multisegment,
             )
         finally:
             index.close()
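Taken together, the middleware can also be driven directly with the same options (a hedged sketch continuing the create_app example above; app.storage is the IndexingMiddleware instance and the values are illustrative):

    # rebuild writes one segment per process; optimize_index() merges them afterwards
    app.storage.rebuild(procs=4, limitmb=512, multisegment=True)
    app.storage.optimize_index()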
