From 6505d5b5c5a41519b53bb784bf9e6a0f28343852 Mon Sep 17 00:00:00 2001 From: Charlotte Vermandel Date: Wed, 16 Aug 2023 19:03:10 +0200 Subject: [PATCH] Fix bug where same batches are send multiple times --- README.md | 2 +- package.json | 2 +- src/package_version.ts | 2 +- src/scrapers/docssearch.ts | 4 +++- src/sender.ts | 23 +++++++++++++++++------ 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 24b4693..b5c7373 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ data: "meilisearch_index_uid": "google", "stategy": "default", // docssearch, schema*, custom or default "headless": true, // Open browser or not - "batch_size": 100, //null with send documents one by one + "batch_size": 1000, //null with send documents one by one "primary_key": null, "meilisearch_settings": { "searchableAttributes": [ diff --git a/package.json b/package.json index 3c24bee..7336396 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@meilisearch/scrapix", - "version": "0.1.1", + "version": "0.1.3", "description": "Automatic scraper and indexer to Meilisearch of any website.", "main": "dist/src/index.js", "dependencies": { diff --git a/src/package_version.ts b/src/package_version.ts index 93e80e5..dc56681 100644 --- a/src/package_version.ts +++ b/src/package_version.ts @@ -1 +1 @@ -export const PACKAGE_VERSION = '0.1.1' +export const PACKAGE_VERSION = '0.1.3' diff --git a/src/scrapers/docssearch.ts b/src/scrapers/docssearch.ts index 235b239..eec617e 100644 --- a/src/scrapers/docssearch.ts +++ b/src/scrapers/docssearch.ts @@ -218,7 +218,9 @@ export default class DocsearchScaper { } } // Send remaining data - await this._send_data({ ...document }) + if (document.content && document.content?.length > 0) { + await this._send_data({ ...document }) + } } async _send_data(data: DocsSearchDocument) { diff --git a/src/sender.ts b/src/sender.ts index 17c9905..f92e8d1 100644 --- a/src/sender.ts +++ b/src/sender.ts @@ -18,7 +18,7 @@ export class Sender { this.config = config this.initial_index_uid = config.meilisearch_index_uid this.index_uid = this.initial_index_uid - this.batch_size = config.batch_size || 100 + this.batch_size = config.batch_size || 1000 //Create a Meilisearch client this.client = initMeilisearchClient({ @@ -54,9 +54,9 @@ export class Sender { //Add a json object to the queue async add(data: DocumentType) { + console.log('Sender::add') this.nb_documents_sent++ - console.log('Sender::add') if (this.config.primary_key && this.config.primary_key !== 'uid') { delete data['uid'] } @@ -64,7 +64,8 @@ export class Sender { if (this.batch_size) { this.queue.push(data) if (this.queue.length >= this.batch_size) { - await this.__batchSend() + this.__batchSend() + this.queue = [] } } else { await this.client.index(this.index_uid).addDocuments([data]) @@ -80,7 +81,7 @@ export class Sender { } async finish() { - await this.__batchSend() + await this.__batchSendSync() const index = await this.client.getIndex(this.index_uid) const stats = await index.getStats() if ( @@ -99,12 +100,22 @@ export class Sender { ) } - async __batchSend() { + __batchSend() { + console.log(`Sender::__batchSend - size: ${this.queue.length}`) + this.client + .index(this.index_uid) + .addDocuments(this.queue) + .catch((e) => { + console.log(e) + console.log('Error while sending data to MeiliSearch') + }) + } + + async __batchSendSync() { console.log(`Sender::__batchSend - size: ${this.queue.length}`) const task = await this.client .index(this.index_uid) .addDocuments(this.queue) - this.queue = [] await this.client.waitForTask(task.taskUid) }