feat(scraper): expand crawler metrics
mytlogos committed Sep 6, 2022
1 parent 0e6acfd commit b7f0974
Showing 26 changed files with 174 additions and 47 deletions.
9 changes: 2 additions & 7 deletions packages/core/src/asyncStorage.ts
@@ -8,7 +8,7 @@ import {
 } from "async_hooks";
 import { writeSync } from "fs";
 import { AsyncContextError } from "./error";
-import { Modification, Optional } from "./types";
+import { Modification, NetworkTrack, Optional } from "./types";
 
 const localStorage = new AsyncLocalStorage();
 
@@ -181,12 +181,7 @@ export interface StoreMapping {
   [StoreKey.MODIFICATIONS]: Record<string, Modification>;
   [StoreKey.RESULT]: "success" | "warning" | "failed" | "aborted";
   [StoreKey.MESSAGE]: string;
-  [StoreKey.NETWORK]: {
-    count: number;
-    sent: number;
-    received: number;
-    history: Array<{ url: string; method: string; statusCode: number; send: number; received: number }>;
-  };
+  [StoreKey.NETWORK]: NetworkTrack;
   [StoreKey.LAST_RUN]: Date;
   [StoreKey.ERROR]: unknown;
   [StoreKey.LAST_REQUEST_URL]: string;
13 changes: 4 additions & 9 deletions packages/core/src/database/contexts/jobContext.ts
@@ -23,7 +23,7 @@ import {
   QueryJobHistory,
   Paginated,
 } from "../../types";
-import { isString, promiseMultiSingle, multiSingle } from "../../tools";
+import { isString, promiseMultiSingle, multiSingle, defaultNetworkTrack } from "../../tools";
 import logger from "../../logger";
 import mysql from "promise-mysql";
 import { escapeLike } from "../storages/storageTools";
@@ -518,12 +518,7 @@ export class JobContext extends SubContext {
 
     const jobTrack: JobTrack = {
       modifications: store.get(StoreKey.MODIFICATIONS) || {},
-      network: store.get(StoreKey.NETWORK) || {
-        count: 0,
-        sent: 0,
-        received: 0,
-        history: [],
-      },
+      network: store.get(StoreKey.NETWORK) || defaultNetworkTrack(),
       queryCount: store.get(StoreKey.QUERY_COUNT) || 0,
     };
 
@@ -581,8 +576,8 @@
       item.max_deleted = Math.max(item.max_deleted, modification.deleted);
     });
     item.sql_queries = jobTrack.queryCount;
-    item.min_sql_queries = Math.min(jobTrack.queryCount);
-    item.max_sql_queries = Math.max(jobTrack.queryCount);
+    item.min_sql_queries = Math.min(jobTrack.queryCount, item.min_sql_queries);
+    item.max_sql_queries = Math.max(jobTrack.queryCount, item.max_sql_queries);
 
     item.failed += result === "failed" ? 1 : 0;
     item.succeeded += result === "success" ? 1 : 0;
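The last hunk fixes a real aggregation bug: with a single argument, Math.min and Math.max simply return that argument, so min_sql_queries and max_sql_queries were overwritten with the current run's query count instead of being folded into the running minimum and maximum. A minimal illustration (the stored values are hypothetical):

    // Single-argument Math.min/Math.max ignore all history:
    Math.min(42); // returns 42; any previously stored minimum is ignored
    Math.max(42); // returns 42

    // The fix includes the stored aggregate in the comparison:
    const item = { min_sql_queries: 10, max_sql_queries: 120 }; // previously stored values
    const queryCount = 42; // current run
    item.min_sql_queries = Math.min(queryCount, item.min_sql_queries); // 10
    item.max_sql_queries = Math.max(queryCount, item.max_sql_queries); // 120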
14 changes: 14 additions & 0 deletions packages/core/src/tools.ts
@@ -9,6 +9,7 @@ import {
   Nullable,
   Indexable,
   ExtractedIndex,
+  NetworkTrack,
 } from "./types";
 import crypto from "crypto";
 import bcrypt from "bcryptjs";
@@ -1041,3 +1042,16 @@ export function deferableTimeout(timeoutMillis: number, maxRetries = 0): Deferab
   result.defer();
   return result;
 }
+
+export function defaultNetworkTrack(): NetworkTrack {
+  return {
+    count: 0,
+    sent: 0,
+    received: 0,
+    cloudflareCount: 0,
+    puppeteerCount: 0,
+    retryCount: 0,
+    hooksUsed: [],
+    history: [],
+  };
+}
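defaultNetworkTrack() centralizes the fallback object that jobContext.ts previously built inline, and because it is a factory, every caller receives a fresh object: a shared constant default could be mutated by one job and leak into the next job's metrics. A small sketch of the distinction (illustrative only):

    import { defaultNetworkTrack } from "enterprise-core/dist/tools";

    // A shared constant default is mutated in place:
    const SHARED = { count: 0 };
    const a = SHARED;
    a.count++; // every later consumer of SHARED now starts at 1

    // The factory hands out independent instances:
    const t1 = defaultNetworkTrack();
    const t2 = defaultNetworkTrack();
    console.log(t1 === t2); // false
    t1.count++; // t2.count is still 0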
4 changes: 4 additions & 0 deletions packages/core/src/types.ts
@@ -1516,6 +1516,10 @@ export interface NetworkTrack {
   count: number;
   sent: number;
   received: number;
+  cloudflareCount: number;
+  puppeteerCount: number;
+  retryCount: number;
+  hooksUsed: string[];
   history: Array<{
     url: string;
     method: string;
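With the four new fields, NetworkTrack records how often a job ran into Cloudflare protection, fell back to Puppeteer, or retried a request, and which scraper hooks were active. A sketch of a populated track after one run (all values illustrative; field meanings inferred from their names, and the history entry shape taken from the inline type removed from asyncStorage.ts above):

    import { NetworkTrack } from "enterprise-core/dist/types";

    const track: NetworkTrack = {
      count: 12, // requests issued
      sent: 4096, // bytes sent
      received: 1048576, // bytes received
      cloudflareCount: 1, // requests that hit a Cloudflare challenge
      puppeteerCount: 2, // requests served through a headless browser
      retryCount: 3, // requests that needed a retry
      hooksUsed: ["boxnovel", "mangahasu"], // hooks active during the job
      history: [
        { url: "https://boxnovel.com/", method: "GET", statusCode: 200, send: 512, received: 65536 },
      ],
    };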
3 changes: 3 additions & 0 deletions packages/scraper/src/externals/custom/download.ts
@@ -1,6 +1,7 @@
 import { Cheerio, Element } from "cheerio";
 import { JSONSchema7 } from "json-schema";
 import { validate } from "jsonschema";
+import { storeHookName } from "../scraperTools";
 import { ContentDownloader, EpisodeContent } from "../types";
 import { defaultContext, extract, makeRequest } from "./common";
 import { CustomHookError } from "./errors";
@@ -50,6 +51,8 @@ export function createDownloadScraper(config: HookConfig): ContentDownloader | u
   }
 
   const scraper: ContentDownloader = async (url) => {
+    storeHookName(config.name);
+
     const context = defaultContext();
 
     async function scrape(downloadConfig: DownloadConfig) {
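From this file on, every scraper entry point starts with a storeHookName call: the custom and customv2 hooks pass their config.name, while the site-specific scrapers further down pass fixed identifiers such as "boxnovel" or "mangadex". The implementation lives in packages/scraper/src/externals/scraperTools.ts, which is not part of the loaded diff; a plausible sketch, assuming it appends the name to the hooksUsed array of the NetworkTrack kept in async-local storage (getStore and store.set are assumed accessors, not confirmed by this commit view):

    // Hypothetical sketch; the real storeHookName is not shown in the loaded diff.
    import { getStore, StoreKey } from "enterprise-core/dist/asyncStorage";
    import { defaultNetworkTrack } from "enterprise-core/dist/tools";

    export function storeHookName(hookName: string): void {
      const store = getStore(); // assumed accessor for the job's AsyncLocalStorage store
      if (!store) {
        return; // not running inside a tracked job
      }
      const track = store.get(StoreKey.NETWORK) || defaultNetworkTrack();
      if (!track.hooksUsed.includes(hookName)) {
        track.hooksUsed.push(hookName);
      }
      store.set(StoreKey.NETWORK, track);
    }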
2 changes: 2 additions & 0 deletions packages/scraper/src/externals/custom/news.ts
@@ -5,6 +5,7 @@ import { HookConfig } from "./types";
 import { CustomHookError } from "./errors";
 import { ValidationError } from "enterprise-core/dist/error";
 import request from "../request";
+import { storeHookName } from "../scraperTools";
 
 function validateEpisodeNews(episodes: Array<Partial<EpisodeNews>>): EpisodeNews[] {
   for (const episode of episodes) {
@@ -51,6 +52,7 @@ export function createNewsScraper(config: HookConfig): NewsScraper | undefined {
   const newsConfig = config.news;
 
   const scraper: NewsScraper = async () => {
+    storeHookName(config.name);
     const $ = await request.getCheerio({ url: newsConfig.newsUrl });
     const baseUri = newsConfig.base || config.base;
     const context = defaultContext();
2 changes: 2 additions & 0 deletions packages/scraper/src/externals/custom/search.ts
@@ -6,6 +6,7 @@ import { SearchScraper } from "../types";
 import { defaultContext, extractJSON, makeRequest } from "./common";
 import { SearchResult } from "enterprise-core/dist/types";
 import { CustomHookError } from "./errors";
+import { storeHookName } from "../scraperTools";
 
 const tocSchema: JSONSchema7 = {
   $schema: "https://json-schema.org/draft/2020-12/schema",
@@ -47,6 +48,7 @@ export function createSearchScraper(config: HookConfig): SearchScraper | undefin
   }
 
   const scraper: SearchScraper = async (text) => {
+    storeHookName(config.name);
     const context = defaultContext();
     // @ts-expect-error
     context.variables.PARAM = [text];
2 changes: 2 additions & 0 deletions packages/scraper/src/externals/custom/toc.ts
@@ -6,6 +6,7 @@ import { validate } from "jsonschema";
 import { JSONSchema7 } from "json-schema";
 import { CustomHookError } from "./errors";
 import { Response } from "../request";
+import { storeHookName } from "../scraperTools";
 
 const tocSchema: JSONSchema7 = {
   $schema: "https://json-schema.org/draft/2020-12/schema",
@@ -95,6 +96,7 @@ export function createTocScraper(config: HookConfig): TocScraper | undefined {
   }
 
   const scraper: TocScraper = async (url) => {
+    storeHookName(config.name);
     const context = defaultContext();
     let lastUrl = url;
 
5 changes: 5 additions & 0 deletions packages/scraper/src/externals/customv2/index.ts
@@ -19,6 +19,7 @@ import { EpisodeNews, ReleaseState, SearchResult } from "enterprise-core/dist/ty
 import { datePattern } from "./analyzer";
 import { validateEpisodeNews, validateToc } from "./validation";
 import request, { Response } from "../request";
+import { storeHookName } from "../scraperTools";
 
 type Conditional<T, R> = T extends undefined ? undefined : R;
 type Context = Record<string, any>;
@@ -183,6 +184,7 @@ function createNewsScraper(config: HookConfig): NewsScraper | undefined {
   const context: Context = {};
 
   const scraper: NewsScraper = async (): Promise<NewsScrapeResult> => {
+    storeHookName(config.name);
     const results: Array<Array<NewsNestedResult | NewsSingleResult>> = [];
     for (const datum of newsConfig.data) {
       const selector: Selector = {
@@ -265,6 +267,7 @@ function createTocScraper(config: HookConfig): TocScraper | undefined {
   const x = createScraper(tocConfig.regexes);
 
   const scraper: TocScraper = async (link: string): Promise<Toc[]> => {
+    storeHookName(config.name);
     const results = [];
     let firstResponseUrl: string | undefined;
 
@@ -416,6 +419,7 @@ function createDownloadScraper(config: HookConfig): ContentDownloader | undefine
   const context: Context = {};
 
   const scraper: ContentDownloader = async (link) => {
+    storeHookName(config.name);
     const results = [];
     for (const datum of downloadConfig.data) {
       const selector = {
@@ -451,6 +455,7 @@ function createSearchScraper(config: HookConfig): SearchScraper | undefined {
     return;
   }
   const scraper: SearchScraper = async (text) => {
+    storeHookName(config.name);
     const results = [];
     for (const datum of searchConfig.data) {
       if (datum._request) {
7 changes: 6 additions & 1 deletion packages/scraper/src/externals/direct/boxNovelScraper.ts
@@ -26,7 +26,7 @@ import {
   LogType,
   getText,
 } from "./directTools";
-import { checkTocContent } from "../scraperTools";
+import { checkTocContent, storeHookName } from "../scraperTools";
 import { MissingResourceError, UrlError } from "../errors";
 import * as cheerio from "cheerio";
 import request, { ResponseError } from "../request";
@@ -45,10 +45,12 @@ interface NovelSearchData {
 const BASE_URI = "https://boxnovel.com/";
 
 async function tocSearch(medium: TocSearchMedium): VoidablePromise<Toc> {
+  storeHookName("boxnovel");
   return searchToc(medium, tocAdapter, BASE_URI, (searchString) => searchAjax(searchString, medium));
 }
 
 async function search(text: string): Promise<SearchResult[]> {
+  storeHookName("boxnovel");
   const urlString = BASE_URI + "wp-admin/admin-ajax.php";
   let response: NovelSearchResponse;
   const searchResults: SearchResult[] = [];
@@ -111,6 +113,7 @@ export async function searchAjax(searchWords: string, medium: TocSearchMedium):
 }
 
 async function contentDownloadAdapter(urlString: string): Promise<EpisodeContent[]> {
+  storeHookName("boxnovel");
   if (!urlString.match(/https:\/\/boxnovel\.com\/novel\/.+\/chapter-.+/)) {
     return [];
   }
@@ -159,6 +162,7 @@ async function contentDownloadAdapter(urlString: string): Promise<EpisodeContent
 }
 
 async function tocAdapter(tocLink: string): Promise<Toc[]> {
+  storeHookName("boxnovel");
   const uri = BASE_URI;
 
   if (!tocLink.startsWith(BASE_URI + "novel/")) {
@@ -305,6 +309,7 @@ async function tocAdapter(tocLink: string): Promise<Toc[]> {
 }
 
 async function newsAdapter(): VoidablePromise<{ news?: News[]; episodes?: EpisodeNews[] }> {
+  storeHookName("boxnovel");
   const uri = BASE_URI;
   const $ = await request.getCheerio({ url: uri });
   const items = $(".page-item-detail");
7 changes: 6 additions & 1 deletion packages/scraper/src/externals/direct/gogoAnimeScraper.ts
@@ -4,14 +4,15 @@ import { EpisodeNews, ReleaseState, SearchResult, TocSearchMedium, VoidablePromi
 import * as cheerio from "cheerio";
 import logger from "enterprise-core/dist/logger";
 import * as url from "url";
-import { checkTocContent } from "../scraperTools";
+import { checkTocContent, storeHookName } from "../scraperTools";
 import { getText, LogType, scraperLog, SearchResult as TocSearchResult, searchToc } from "./directTools";
 import { UrlError } from "../errors";
 import request from "../request";
 
 const BASE_URI = "https://www.gogoanime.vc/";
 
 async function scrapeNews(): Promise<NewsScrapeResult> {
+  storeHookName("gogoanime");
   const uri = BASE_URI;
   const $ = await request.getCheerio({ url: uri });
 
@@ -80,6 +81,7 @@ async function scrapeNews(): Promise<NewsScrapeResult> {
 }
 
 async function scrapeToc(urlString: string): Promise<Toc[]> {
+  storeHookName("gogoanime");
   const animeAliasReg = /^https?:\/\/(www\d*\.)?gogoanime\.(vc|wiki)\/category\/(.+)/;
   const aliasExec = animeAliasReg.exec(urlString);
 
@@ -185,10 +187,12 @@ async function scrapeSearch(searchString: string, searchMedium: TocSearchMedium)
 }
 
 async function searchForToc(searchMedium: TocSearchMedium): VoidablePromise<Toc> {
+  storeHookName("gogoanime");
   return searchToc(searchMedium, scrapeToc, BASE_URI, (searchString) => scrapeSearch(searchString, searchMedium));
 }
 
 async function search(searchWords: string): Promise<SearchResult[]> {
+  storeHookName("gogoanime");
   const urlString = `https://ajax.apimovie.xyz/site/loadAjaxSearch?keyword=${encodeURIComponent(
     searchWords,
   )}&id=-1&link_web=https%3A%2F%2Fwww.gogoanime.vc%2F`;
@@ -240,6 +244,7 @@ search.medium = MediaType.VIDEO;
  * @deprecated behind recaptcha
  */
 async function contentDownloader(link: string): Promise<EpisodeContent[]> {
+  storeHookName("gogoanime");
   const episodeRegex = /https:\/\/www\d*\.gogoanime\.(vc|wiki)\/.+-episode-(\d+)/;
   const exec = episodeRegex.exec(link);
   if (!exec) {
7 changes: 6 additions & 1 deletion packages/scraper/src/externals/direct/mangaHasuScraper.ts
@@ -10,7 +10,7 @@ import {
 import * as url from "url";
 import logger from "enterprise-core/dist/logger";
 import { equalsIgnore, extractIndices, MediaType, sanitizeString, delay, hasProp } from "enterprise-core/dist/tools";
-import { checkTocContent } from "../scraperTools";
+import { checkTocContent, storeHookName } from "../scraperTools";
 import {
   SearchResult as TocSearchResult,
   searchToc,
@@ -68,6 +68,7 @@ function enforceHttps(link: string): string {
 }
 
 async function scrapeNews(): Promise<NewsScrapeResult> {
+  storeHookName("mangahasu");
   // TODO scrape more than just the first page if there is an open end
   const baseUri = BASE_URI;
   const requestUrl = baseUri + "latest-releases.html";
@@ -179,6 +180,7 @@ async function scrapeNews(): Promise<NewsScrapeResult> {
 }
 
 async function contentDownloadAdapter(chapterLink: string): Promise<EpisodeContent[]> {
+  storeHookName("mangahasu");
   const $ = await tryRequest(chapterLink);
   if (getText($("head > title")) === "Page not found!") {
     throw new MissingResourceError("Missing Toc on NovelFull", chapterLink);
@@ -229,6 +231,7 @@ async function contentDownloadAdapter(chapterLink: string): Promise<EpisodeConte
 }
 
 async function scrapeToc(urlString: string): Promise<Toc[]> {
+  storeHookName("mangahasu");
   if (!/https?:\/\/mangahasu\.se\/[^/]+\.html/.test(urlString)) {
     throw new UrlError("not a toc link for MangaHasu: " + urlString, urlString);
   }
@@ -398,6 +401,7 @@ async function scrapeToc(urlString: string): Promise<Toc[]> {
 }
 
 async function tocSearchAdapter(searchMedium: TocSearchMedium): VoidablePromise<Toc> {
+  storeHookName("mangahasu");
   return searchToc(searchMedium, scrapeToc, BASE_URI, (searchString) => scrapeSearch(searchString, searchMedium));
 }
 
@@ -437,6 +441,7 @@ async function scrapeSearch(searchWords: string, medium: TocSearchMedium): Promi
 }
 
 async function search(searchWords: string): Promise<SearchResult[]> {
+  storeHookName("mangahasu");
   const urlString = BASE_URI + "search/autosearch";
 
   const body = "key=" + searchWords;
5 changes: 4 additions & 1 deletion packages/scraper/src/externals/direct/mangadexScraper.ts
@@ -3,7 +3,7 @@ import { EpisodeContentData, EpisodeNews, ReleaseState, Optional } from "enterpr
 import * as url from "url";
 import logger from "enterprise-core/dist/logger";
 import { extractIndices, ignore, hasProp, MediaType, sanitizeString } from "enterprise-core/dist/tools";
-import { checkTocContent } from "../scraperTools";
+import { checkTocContent, storeHookName } from "../scraperTools";
 import { episodeStorage } from "enterprise-core/dist/database/storages/storage";
 import { MissingResourceError, ScraperError, UrlError } from "../errors";
 import { extractLinkable, getText, LogType, scraperLog } from "./directTools";
@@ -83,6 +83,7 @@ interface ChapterChapterItem {
 }
 
 async function contentDownloadAdapter(chapterLink: string): Promise<EpisodeContent[]> {
+  storeHookName("mangadex");
   const linkReg = /^https:\/\/mangadex\.org\/chapter\/(\d+)/;
   const exec = linkReg.exec(chapterLink);
   if (!exec) {
@@ -145,6 +146,7 @@ async function contentDownloadAdapter(chapterLink: string): Promise<EpisodeConte
 }
 
 async function scrapeNews(): Promise<NewsScrapeResult> {
+  storeHookName("mangadex");
   // TODO: 19.07.2019 set the cookie 'mangadex_filter_langs:"1"'
   // with expiration date somewhere in 100 years to lessen load
 
@@ -257,6 +259,7 @@ async function scrapeNews(): Promise<NewsScrapeResult> {
 }
 
 async function scrapeToc(urlString: string): Promise<Toc[]> {
+  storeHookName("mangadex");
   const urlRegex = /^https?:\/\/mangadex\.org\/title\/\d+\/[^/]+\/?$/;
 
   if (!urlRegex.test(urlString)) {
(13 of 26 changed files shown; the remaining files were not loaded.)
