Skip to content

Commit

Permalink
Store URLs from html <a and <form tags in json metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
ragusaa committed Jun 18, 2024
1 parent 8317780 commit 73a2786
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 1 deletion.
1 change: 1 addition & 0 deletions clamscan/clamscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ void help(void)
mprintf(LOGG_INFO, " --phishing-sigs[=yes(*)/no] Enable email signature-based phishing detection\n");
mprintf(LOGG_INFO, " --phishing-scan-urls[=yes(*)/no] Enable URL signature-based phishing detection\n");
mprintf(LOGG_INFO, " --heuristic-alerts[=yes(*)/no] Heuristic alerts\n");
mprintf(LOGG_INFO, " --store-html-urls[=yes(*)/no] Store html URLs in metadata\n");
mprintf(LOGG_INFO, " --heuristic-scan-precedence[=yes/no(*)] Stop scanning as soon as a heuristic match is found\n");
mprintf(LOGG_INFO, " --normalize[=yes(*)/no] Normalize html, script, and text files. Use normalize=no for yara compatibility\n");
mprintf(LOGG_INFO, " --scan-pe[=yes(*)/no] Scan PE files\n");
Expand Down
4 changes: 4 additions & 0 deletions clamscan/manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1557,6 +1557,10 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}

if (optget(opts, "store-html-urls")->enabled) {
options.general |= CL_SCAN_STORE_HTML_URLS;
}

/* TODO: Remove deprecated option in a future feature release */
if ((optget(opts, "block-max")->enabled) ||
(optget(opts, "alert-exceeds-max")->enabled)) {
Expand Down
1 change: 1 addition & 0 deletions common/optparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},

{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"StoreHTMLUrls", "store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},

{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

Expand Down
1 change: 1 addition & 0 deletions libclamav/clamav.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ struct cl_scan_options {
#define CL_SCAN_HEURISTIC_STRUCTURED_SSN_STRIPPED 0x800 /* alert when detecting stripped social security numbers */
#define CL_SCAN_HEURISTIC_STRUCTURED_CC 0x1000 /* alert when detecting credit card numbers */
#define CL_SCAN_HEURISTIC_BROKEN_MEDIA 0x2000 /* alert if a file does not match the identified file format, works with JPEG, TIFF, GIF, PNG */
/* NOTE(review): defined next to the CL_SCAN_HEURISTIC_* flags, but clamscan ORs it into
 * options.general and STORE_HTML_URLS tests options->general -- confirm the intended group. */
#define CL_SCAN_STORE_HTML_URLS 0x4000 /* store URLs found in HTML <a> and <form> tags in the JSON metadata */

/* mail scanning options */
#define CL_SCAN_MAIL_PARTIAL_MESSAGE 0x1
Expand Down
3 changes: 3 additions & 0 deletions libclamav/htmlnorm.c
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,7 @@ static inline void html_tag_contents_append(struct tag_contents *cont, const uns
uint32_t mbchar = 0;
if (!begin || !end)
return;

for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) {
uint8_t c = *begin++;
if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
Expand Down Expand Up @@ -687,6 +688,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
uint32_t mbchar = 0;
uint32_t mbchar2 = 0;


/*
* Initialize stack buffers.
*/
Expand Down Expand Up @@ -1929,6 +1931,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
cli_js_destroy(js_state);
js_state = NULL;
}

html_tag_arg_free(&tag_args);
if (!m_area) {
fclose(stream_in);
Expand Down
1 change: 1 addition & 0 deletions libclamav/others.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
#define STORE_HTML_URLS (ctx->options->general & CL_SCAN_STORE_HTML_URLS)

#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)
Expand Down
73 changes: 72 additions & 1 deletion libclamav/scanners.c
Original file line number Diff line number Diff line change
Expand Up @@ -2082,6 +2082,69 @@ static cl_error_t cli_ole2_tempdir_scan_for_xlm_and_images(const char *dir, cli_
return ret;
}

/* Key of the JSON array in which save_urls() stores the URLs found in an HTML document. */
const char * const HTML_URLS_JSON_KEY = "HTMLUrls";



/**
 * @brief Check whether a string starts with one of the recognised URL schemes.
 *
 * The scheme comparison is case-insensitive. The string must contain at least
 * one character after the scheme prefix, so a bare "http://" is rejected.
 *
 * @param str  NUL-terminated string to test; may be NULL.
 * @return true if str begins with "https://", "http://" or "ftp://" followed
 *         by at least one more character; false otherwise (including NULL).
 */
static bool is_url(const char *const str)
{
    static const char *const schemes[] = {"https://", "http://", "ftp://"};
    size_t str_len;
    size_t i;

    if (NULL == str) {
        return false;
    }

    /* Measure the candidate once instead of once per scheme. */
    str_len = strlen(str);

    for (i = 0; i < sizeof(schemes) / sizeof(schemes[0]); i++) {
        const size_t scheme_len = strlen(schemes[i]);

        /* Strictly longer than the prefix: a scheme with nothing after it is not a URL. */
        if ((str_len > scheme_len) && (0 == strncasecmp(str, schemes[i], scheme_len))) {
            return true;
        }
    }

    return false;
}
/**
 * @brief Append the URLs collected from an HTML document to the JSON metadata.
 *
 * Every entry of `hrefs` that is_url() accepts is added, in order, to the JSON
 * array obtained from cli_jsonarray() under HTML_URLS_JSON_KEY on
 * ctx->wrkproperty. The array is only requested once the first URL is seen, so
 * a document without any URL does not get an empty array.
 *
 * Nothing is recorded unless STORE_HTML_URLS and SCAN_COLLECT_METADATA are
 * both set, ctx->wrkproperty is non-NULL, and ctx->wrkproperty is the same
 * object as ctx->properties.
 *
 * This function only reads `hrefs`; it does not free it.
 *
 * @param ctx    Scan context; must not be NULL.
 * @param hrefs  Tag arguments to inspect; may be NULL, in which case nothing
 *               is done.
 */
static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs)
{
    json_object *url_array = NULL;
    int i;

    if (NULL == hrefs) {
        return;
    }

    /* Only record URLs when the working property object is ctx->properties itself. */
    if (ctx->wrkproperty != ctx->properties) {
        return;
    }

    if (!(STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) {
        return;
    }

    for (i = 0; i < hrefs->count; i++) {
        const char *candidate = (const char *)hrefs->value[i];

        if (!is_url(candidate)) {
            continue;
        }

        /* Request the array lazily, on the first URL, instead of pre-scanning the list. */
        if (NULL == url_array) {
            url_array = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY);
            if (NULL == url_array) {
                cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY);
                return;
            }
        }

        cli_jsonstr(url_array, NULL, candidate);
    }
}

static cl_error_t cli_scanhtml(cli_ctx *ctx)
{
cl_error_t status = CL_SUCCESS;
Expand Down Expand Up @@ -2113,7 +2176,15 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx)

cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname);

(void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
/* Output JSON Summary Information */
if (STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) {
tag_arguments_t hrefs = {0};
hrefs.scanContents = 1;
(void)html_normalise_map(ctx, map, tempname, &hrefs, ctx->dconf);
save_urls(ctx, &hrefs);
} else {
(void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf);
}

snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname);
fd = open(fullname, O_RDONLY | O_BINARY);
Expand Down

0 comments on commit 73a2786

Please sign in to comment.