diff --git a/clamscan/clamscan.c b/clamscan/clamscan.c index 2b10a1023b..3c35d6aa0c 100644 --- a/clamscan/clamscan.c +++ b/clamscan/clamscan.c @@ -295,6 +295,7 @@ void help(void) mprintf(LOGG_INFO, " --phishing-sigs[=yes(*)/no] Enable email signature-based phishing detection\n"); mprintf(LOGG_INFO, " --phishing-scan-urls[=yes(*)/no] Enable URL signature-based phishing detection\n"); mprintf(LOGG_INFO, " --heuristic-alerts[=yes(*)/no] Heuristic alerts\n"); + mprintf(LOGG_INFO, " --store-html-urls[=yes(*)/no] Store html URLs in metadata\n"); mprintf(LOGG_INFO, " --heuristic-scan-precedence[=yes/no(*)] Stop scanning as soon as a heuristic match is found\n"); mprintf(LOGG_INFO, " --normalize[=yes(*)/no] Normalize html, script, and text files. Use normalize=no for yara compatibility\n"); mprintf(LOGG_INFO, " --scan-pe[=yes(*)/no] Scan PE files\n"); diff --git a/clamscan/manager.c b/clamscan/manager.c index db3a8f46b6..8c75e75010 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1557,6 +1557,10 @@ int scanmanager(const struct optstruct *opts) options.general |= CL_SCAN_GENERAL_HEURISTICS; } + if (optget(opts, "store-html-urls")->enabled) { + options.general |= CL_SCAN_STORE_HTML_URLS; + } + /* TODO: Remove deprecated option in a future feature release */ if ((optget(opts, "block-max")->enabled) || (optget(opts, "alert-exceeds-max")->enabled)) { diff --git a/common/optparser.c b/common/optparser.c index dd99f43eb2..8caf869c99 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = { {"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"}, {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. 
complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"}, + {"StoreHTMLUrls", "store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML
pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) { uint8_t c = *begin++; if (mbchar && (c < 0x80 || mbchar >= 0x10000)) { @@ -687,6 +688,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha uint32_t mbchar = 0; uint32_t mbchar2 = 0; + /* * Initialize stack buffers. */ @@ -1929,6 +1931,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha cli_js_destroy(js_state); js_state = NULL; } + html_tag_arg_free(&tag_args); if (!m_area) { fclose(stream_in); diff --git a/libclamav/others.h b/libclamav/others.h index 8cebf78d35..4ffb7d0a50 100644 --- a/libclamav/others.h +++ b/libclamav/others.h @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar; #define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS) #define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE) #define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED) +#define STORE_HTML_URLS (ctx->options->general & CL_SCAN_STORE_HTML_URLS) #define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE) #define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 8cc19297af..d9a577f02b 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -2082,6 +2082,69 @@ static cl_error_t cli_ole2_tempdir_scan_for_xlm_and_images(const char *dir, cli_ return ret; } +const char * const HTML_URLS_JSON_KEY = "HTMLUrls"; + + /* is_url: true when str is non-NULL, strictly longer than the prefix, and begins (case-insensitively) with the https, http or ftp scheme prefix; false otherwise. */ + +static bool is_url(const char * const str){ + +#define MATCH(str, prefix) \ + do { \ + if (str && (strlen(str) > strlen(prefix)) \ + && (0 == strncasecmp(str, prefix, strlen(prefix)))) { \ + bRet = true; \ + goto done; \ + } \ + } while (0); /* NOTE(review): the trailing semicolon after while (0) makes every MATCH(...); expand to two statements; harmless for the three uses below. */ + + bool bRet = false; + + MATCH(str, "https://"); + MATCH(str, "http://"); + MATCH(str, "ftp://"); +done: + return bRet; +#undef MATCH +} +static void save_urls(cli_ctx * ctx, tag_arguments_t * hrefs) { /* Stores the URL-looking values of hrefs in a JSON array named by HTML_URLS_JSON_KEY under ctx->wrkproperty. Does nothing when hrefs is NULL, when the option checks below fail, or when no value passes is_url(). */ + int i = 0; + bool haveOne = false; + 
if (NULL == hrefs) { + return; + } + /* Proceed only when the working property object is ctx->properties itself. */ + if (ctx->wrkproperty != ctx->properties) { + return; + } + + if (!(STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) { + return; + } + /* First pass: look for at least one URL so that the JSON array is created only when there is something to store. */ + for (i = 0; i < hrefs->count; i++){ + if (is_url((const char *) hrefs->value[i])) { + haveOne = true; + break; + } + } + + if (!haveOne){ + return; + } + /* Second pass: create the array and append every value accepted by is_url(). */ + json_object *ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY ); + if (ary) { + for (i = 0; i < hrefs->count; i++){ + if (is_url((const char *) hrefs->value[i])){ + cli_jsonstr(ary, NULL, (const char *) hrefs->value[i]); + } + } + } else { + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY ); + } + +} + static cl_error_t cli_scanhtml(cli_ctx *ctx) { cl_error_t status = CL_SUCCESS; @@ -2113,7 +2176,15 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx) cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname); - (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + /* Output JSON Summary Information */ + if (STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) { + tag_arguments_t hrefs = {0}; + hrefs.scanContents = 1; + (void)html_normalise_map(ctx, map, tempname, &hrefs, ctx->dconf); + save_urls(ctx, &hrefs); /* NOTE(review): no release of hrefs is visible before it goes out of scope here; confirm whether html_tag_arg_free(&hrefs) is required. */ + } else { + (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + } snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname); fd = open(fullname, O_RDONLY | O_BINARY);