Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow to reuse a browser by passing a browserContext #884

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
28 changes: 14 additions & 14 deletions src/scrapers/base-isracard-amex.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ const DATE_FORMAT = 'DD/MM/YYYY';

const debug = getDebug('base-isracard-amex');

interface ExtendedScraperOptions extends ScraperOptions {
type CompanyServiceOptions = {
servicesUrl: string;
companyCode: string;
}
};

type ScrapedAccountsWithIndex = Record<string, TransactionsAccount & { index: number }>;

Expand Down Expand Up @@ -204,9 +204,9 @@ function convertTransactions(txns: ScrapedTransaction[], processedDate: string):
});
}

async function fetchTransactions(page: Page, options: ExtendedScraperOptions, startMoment: Moment, monthMoment: Moment): Promise<ScrapedAccountsWithIndex> {
const accounts = await fetchAccounts(page, options.servicesUrl, monthMoment);
const dataUrl = getTransactionsUrl(options.servicesUrl, monthMoment);
async function fetchTransactions(page: Page, options: ScraperOptions, companyServiceOptions:CompanyServiceOptions, startMoment: Moment, monthMoment: Moment): Promise<ScrapedAccountsWithIndex> {
const accounts = await fetchAccounts(page, companyServiceOptions.servicesUrl, monthMoment);
const dataUrl = getTransactionsUrl(companyServiceOptions.servicesUrl, monthMoment);
daniel-hauser marked this conversation as resolved.
Show resolved Hide resolved
const dataResult = await fetchGetWithinPage<ScrapedTransactionData>(page, dataUrl);
if (dataResult && _.get(dataResult, 'Header.Status') === '1' && dataResult.CardsTransactionsListBean) {
const accountTxns: ScrapedAccountsWithIndex = {};
Expand Down Expand Up @@ -255,7 +255,7 @@ function getTransactionExtraDetails(servicesUrl: string, month: Moment, accountI
},
});
}
async function getExtraScrapTransaction(page: Page, options: ExtendedScraperOptions, month: Moment, accountIndex: number, transaction: Transaction): Promise<Transaction> {
async function getExtraScrapTransaction(page: Page, options:CompanyServiceOptions, month: Moment, accountIndex: number, transaction: Transaction): Promise<Transaction> {
const dataUrl = getTransactionExtraDetails(options.servicesUrl, month, accountIndex, transaction);
const data = await fetchGetWithinPage<ScrapedTransactionData>(page, dataUrl);

Expand All @@ -270,13 +270,13 @@ async function getExtraScrapTransaction(page: Page, options: ExtendedScraperOpti
};
}

function getExtraScrapTransactions(accountWithIndex: TransactionsAccount & { index: number }, page: Page, options: ExtendedScraperOptions, month: moment.Moment): Promise<Transaction[]> {
function getExtraScrapTransactions(accountWithIndex: TransactionsAccount & { index: number }, page: Page, options: CompanyServiceOptions, month: moment.Moment): Promise<Transaction[]> {
  // Fetch the extra details of every transaction belonging to this account,
  // firing all requests in parallel and resolving once all of them settle.
  return Promise.all(
    accountWithIndex.txns.map((txn) => getExtraScrapTransaction(page, options, month, accountWithIndex.index, txn)),
  );
}

async function getExtraScrapAccount(page: Page, options: ExtendedScraperOptions, accountMap: ScrapedAccountsWithIndex, month: moment.Moment): Promise<ScrapedAccountsWithIndex> {
async function getExtraScrapAccount(page: Page, options: CompanyServiceOptions, accountMap: ScrapedAccountsWithIndex, month: moment.Moment): Promise<ScrapedAccountsWithIndex> {
const promises = Object.keys(accountMap)
.map(async (a) => ({
...accountMap[a],
Expand All @@ -286,20 +286,20 @@ async function getExtraScrapAccount(page: Page, options: ExtendedScraperOptions,
return accounts.reduce((m, x) => ({ ...m, [x.accountNumber]: x }), {});
}

function getExtraScrap(accountsWithIndex: ScrapedAccountsWithIndex[], page: Page, options: ExtendedScraperOptions, allMonths: moment.Moment[]): Promise<ScrapedAccountsWithIndex[]> {
function getExtraScrap(accountsWithIndex: ScrapedAccountsWithIndex[], page: Page, options: CompanyServiceOptions, allMonths: moment.Moment[]): Promise<ScrapedAccountsWithIndex[]> {
  // Pair each month's account map with its month and wrap the work in a thunk;
  // runSerial executes the thunks one after another (not in parallel).
  const deferredActions = accountsWithIndex.map(
    (accountMap, monthIndex) => () => getExtraScrapAccount(page, options, accountMap, allMonths[monthIndex]),
  );
  return runSerial(deferredActions);
}

async function fetchAllTransactions(page: Page, options: ExtendedScraperOptions, startMoment: Moment) {
async function fetchAllTransactions(page: Page, options: ScraperOptions, companyServiceOptions:CompanyServiceOptions, startMoment: Moment) {
const futureMonthsToScrape = options.futureMonthsToScrape ?? 1;
const allMonths = getAllMonthMoments(startMoment, futureMonthsToScrape);
const results: ScrapedAccountsWithIndex[] = await Promise.all(allMonths.map(async (monthMoment) => {
return fetchTransactions(page, options, startMoment, monthMoment);
return fetchTransactions(page, options, companyServiceOptions, startMoment, monthMoment);
}));

const finalResult = options.additionalTransactionInformation ?
await getExtraScrap(results, page, options, allMonths) : results;
await getExtraScrap(results, page, companyServiceOptions, allMonths) : results;

const combinedTxns: Record<string, Transaction[]> = {};

Expand Down Expand Up @@ -431,8 +431,8 @@ class IsracardAmexBaseScraper extends BaseScraperWithBrowser<ScraperSpecificCred
const startDate = this.options.startDate || defaultStartMoment.toDate();
const startMoment = moment.max(defaultStartMoment, moment(startDate));

return fetchAllTransactions(this.page, {
...this.options,
return fetchAllTransactions(this.page, this.options, {

servicesUrl: this.servicesUrl,
companyCode: this.companyCode,
}, startMoment);
Expand Down
108 changes: 59 additions & 49 deletions src/scrapers/base-scraper-with-browser.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
import puppeteer, {
type Browser, type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent,
} from 'puppeteer';

import puppeteer, { type Frame, type GoToOptions, type Page, type PuppeteerLifeCycleEvent } from 'puppeteer';
import { ScraperProgressTypes } from '../definitions';
import { getDebug } from '../helpers/debug';
import { clickButton, fillInput, waitUntilElementFound } from '../helpers/elements-interactions';
Expand Down Expand Up @@ -85,9 +82,7 @@ function createGeneralError(): ScraperScrapingResult {
}

class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends BaseScraper<TCredentials> {
// NOTICE - it is discouraged to use bang (!) in general. It is used here because
// all the classes that inherit from this base assume is it mandatory.
protected browser!: Browser;
private cleanups: Array<() => Promise<void>> = [];

// NOTICE - it is discouraged to use bang (!) in general. It is used here because
// all the classes that inherit from this base assume is it mandatory.
Expand All @@ -100,53 +95,71 @@ class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends Ba
};
}

async initialize() {
await super.initialize();
debug('initialize scraper');
this.emitProgress(ScraperProgressTypes.Initializing);

let env: Record<string, any> | undefined;
if (this.options.verbose) {
env = { DEBUG: '*', ...process.env };
private async initializePage() {
debug('initialize browser page');
if ('browserContext' in this.options) {
debug('Using the browser context provided in options');
return this.options.browserContext.newPage();
}

if (typeof this.options.browser !== 'undefined' && this.options.browser !== null) {
debug('use custom browser instance provided in options');
this.browser = this.options.browser;
} else {
const executablePath = this.options.executablePath || undefined;
const args = this.options.args || [];
const { timeout } = this.options;

const headless = !this.options.showBrowser;
debug(`launch a browser with headless mode = ${headless}`);
this.browser = await puppeteer.launch({
env,
headless,
executablePath,
args,
timeout,
});
}
if ('browser' in this.options) {
debug('Using the browser instance provided in options');
const { browser } = this.options;

Comment on lines +141 to +143
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as in #884 (comment) - but with ExternalBrowserOptions

/**
* For backward compatibility, we will close the browser even if we didn't create it
*/
daniel-hauser marked this conversation as resolved.
Show resolved Hide resolved
if (!this.options.skipCloseBrowser) {
this.cleanups.push(async () => {
debug('closing the browser');
await browser.close();
});
}
Comment on lines +149 to +152
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the browser exists and you run multiple accounts, this cleanup task will be pushed for each account (except the first one):

  1. It will run the same cleanup multiple times
  2. If only one account is triggered, will it close the browser?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic here will run from scrape > initialize for each scrape, and we will have one cleanup call.

This is the exact same behavior as the current terminate implementation that is:

    if (!this.browser) {
      return;
    }

    await this.browser.close();


return browser.newPage();
}

const { timeout, args, executablePath, showBrowser } = this.options;

const headless = !showBrowser;
debug(`launch a browser with headless mode = ${headless}`);

const browser = await puppeteer.launch({
env: this.options.verbose ? { DEBUG: '*', ...process.env } : undefined,
headless,
executablePath,
args,
timeout,
});

this.cleanups.push(async () => {
debug('closing the browser');
await browser.close();
});

if (this.options.prepareBrowser) {
debug("execute 'prepareBrowser' interceptor provided in options");
await this.options.prepareBrowser(this.browser);
await this.options.prepareBrowser(browser);
}

if (!this.browser) {
debug('failed to initiate a browser, exit');
debug('create a new browser page');
return browser.newPage();
}

async initialize() {
await super.initialize();
debug('initialize scraper');
this.emitProgress(ScraperProgressTypes.Initializing);

const page = await this.initializePage();
if (!page) {
debug('failed to initiate a browser page, exit');
return;
}

const pages = await this.browser.pages();
if (pages.length) {
debug('browser has already pages open, use the first one');
[this.page] = pages;
} else {
debug('create a new browser page');
this.page = await this.browser.newPage();
}
this.page = page;

this.cleanups.push( () => page.close());

if (this.options.defaultTimeout) {
this.page.setDefaultTimeout(this.options.defaultTimeout);
Expand Down Expand Up @@ -274,11 +287,8 @@ class BaseScraperWithBrowser<TCredentials extends ScraperCredentials> extends Ba
});
}

if (!this.browser) {
return;
}

await this.browser.close();
await Promise.all(this.cleanups.reverse().map((cleanup) => cleanup()));
this.cleanups = [];
}

private handleLoginResult(loginResult: LoginResults) {
Expand Down
94 changes: 54 additions & 40 deletions src/scrapers/interface.ts
baruchiro marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { type Browser, type Page } from 'puppeteer';
import { type BrowserContext, type Browser, type Page } from 'puppeteer';
import { type CompanyTypes, type ScraperProgressTypes } from '../definitions';
import { type TransactionsAccount } from '../transactions';
import { type ErrorResult, type ScraperErrorTypes } from './errors';
Expand Down Expand Up @@ -26,7 +26,7 @@ export interface FutureDebit {
bankAccountNumber?: string;
}

export interface ScraperOptions {
export type ScraperOptions = ScraperBrowserOptions & {
/**
baruchiro marked this conversation as resolved.
Show resolved Hide resolved
* The company you want to scrape
*/
Expand All @@ -42,82 +42,96 @@ export interface ScraperOptions {
*/
startDate: Date;

/**
* shows the browser while scraping, good for debugging (default false)
*/
showBrowser?: boolean;

/**
* scrape transactions to be processed X months in the future
*/
futureMonthsToScrape?: number;

/**
* option from init puppeteer browser instance outside the library scope. you can get
* browser directly from puppeteer via `puppeteer.launch()`
*/
browser?: any;

/**
* provide a path to local chromium to be used by puppeteer. Relevant when using
* `israeli-bank-scrapers-core` library
* if set to true, all installment transactions will be combine into the first one
*/
executablePath?: string;
combineInstallments?: boolean;

/**
* if set to true, all installment transactions will be combine into the first one
* if set, store a screenshot if failed to scrape. Used for debug purposes
*/
combineInstallments?: boolean;
storeFailureScreenShotPath?: string;

/**
* additional arguments to pass to the browser instance. The list of flags can be found in
*
* https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options
* https://peter.sh/experiments/chromium-command-line-switches/
* if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`.
*/
args?: string[];
defaultTimeout?: number;

/**
* Maximum navigation time in milliseconds, pass 0 to disable timeout.
* @default 30000
* Options for manipulation of output data
*/
timeout?: number | undefined;
outputData?: OutputDataOptions;

/**
* adjust the browser instance before it is being used
*
* @param browser
* Perform additional operation for each transaction to get more information (Like category) about it.
* Please note: It will take more time to finish the process.
*/
prepareBrowser?: (browser: Browser) => Promise<void>;
additionalTransactionInformation?: boolean;

/**
* adjust the page instance before it is being used.
*
* @param page
*/
preparePage?: (page: Page) => Promise<void>;
};

type ScraperBrowserOptions = {
/**
* if set, store a screenshot if failed to scrape. Used for debug purposes
* An externally created browser instance.
* you can get a browser directly from puppeteer via `puppeteer.launch()`
*
* Note: The browser will be closed by the library after the scraper finishes unless `skipCloseBrowser` is set to true
*/
storeFailureScreenShotPath?: string;
browser: Browser;

/**
* if set, will set the timeout in milliseconds of puppeteer's `page.setDefaultTimeout`.
* If true, the browser will not be closed by the library after the scraper finishes
*/
defaultTimeout?: number;
skipCloseBrowser?: boolean;
} | {
/**
* An externally managed browser context. This is useful when you want to manage the browser lifecycle yourself.
*/
browserContext: BrowserContext;
} | {
/**
* shows the browser while scraping, good for debugging (default false)
*/
showBrowser?: boolean;

/**
* Options for manipulation of output data
* provide a path to local chromium to be used by puppeteer. Relevant when using
* `israeli-bank-scrapers-core` library
*/
outputData?: OutputDataOptions;
executablePath?: string;

/**
* Perform additional operation for each transaction to get more information (Like category) about it.
* Please note: It will take more time to finish the process.
* additional arguments to pass to the browser instance. The list of flags can be found in
*
* https://developer.mozilla.org/en-US/docs/Mozilla/Command_Line_Options
* https://peter.sh/experiments/chromium-command-line-switches/
*/
additionalTransactionInformation?: boolean;
}
args?: string[];

/**
* Maximum navigation time in milliseconds, pass 0 to disable timeout.
* @default 30000
*/
timeout?: number | undefined;

/**
* adjust the browser instance before it is being used
*
* @param browser
*/
prepareBrowser?: (browser: Browser) => Promise<void>;
};

export interface OutputDataOptions {
/**
Expand Down
Loading