Skip to content

Commit

Permalink
Store URLs from HTML <a> and <form> tags in JSON metadata
Browse files Browse the repository at this point in the history
Thank you Matt Jolly for the helpful comment on the pull request.
  • Loading branch information
ragusaa committed Jul 10, 2024
1 parent 8317780 commit ba05070
Show file tree
Hide file tree
Showing 10 changed files with 658 additions and 10 deletions.
1 change: 1 addition & 0 deletions clamscan/clamscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,7 @@ void help(void)
mprintf(LOGG_INFO, " --phishing-sigs[=yes(*)/no] Enable email signature-based phishing detection\n");
mprintf(LOGG_INFO, " --phishing-scan-urls[=yes(*)/no] Enable URL signature-based phishing detection\n");
mprintf(LOGG_INFO, " --heuristic-alerts[=yes(*)/no] Heuristic alerts\n");
mprintf(LOGG_INFO, " --store-html-urls[=yes(*)/no] Store html URLs in metadata\n");
mprintf(LOGG_INFO, " --heuristic-scan-precedence[=yes/no(*)] Stop scanning as soon as a heuristic match is found\n");
mprintf(LOGG_INFO, " --normalize[=yes(*)/no] Normalize html, script, and text files. Use normalize=no for yara compatibility\n");
mprintf(LOGG_INFO, " --scan-pe[=yes(*)/no] Scan PE files\n");
Expand Down
4 changes: 4 additions & 0 deletions clamscan/manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1557,6 +1557,10 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}

if (optget(opts, "store-html-urls")->enabled) {
options.general |= CL_SCAN_STORE_HTML_URLS;
}

/* TODO: Remove deprecated option in a future feature release */
if ((optget(opts, "block-max")->enabled) ||
(optget(opts, "alert-exceeds-max")->enabled)) {
Expand Down
1 change: 1 addition & 0 deletions common/optparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},

{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"StoreHTMLUrls", "store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},

{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

Expand Down
1 change: 1 addition & 0 deletions libclamav/clamav.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ struct cl_scan_options {
#define CL_SCAN_HEURISTIC_STRUCTURED_SSN_STRIPPED 0x800 /* alert when detecting stripped social security numbers */
#define CL_SCAN_HEURISTIC_STRUCTURED_CC 0x1000 /* alert when detecting credit card numbers */
#define CL_SCAN_HEURISTIC_BROKEN_MEDIA 0x2000 /* alert if a file does not match the identified file format, works with JPEG, TIFF, GIF, PNG */
/* Store URLs found in HTML <a> and <form> tags.
 * NOTE(review): this flag follows the CL_SCAN_HEURISTIC_* values here, but the
 * visible users test it against options.general rather than the heuristic
 * option word -- confirm the intended option group. */
#define CL_SCAN_STORE_HTML_URLS 0x4000

/* mail scanning options */
#define CL_SCAN_MAIL_PARTIAL_MESSAGE 0x1
Expand Down
67 changes: 60 additions & 7 deletions libclamav/htmlnorm.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,44 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
}
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
/**
 * @brief Append a private copy of a URL string to a form_data_t list.
 *
 * The list is grown by one slot and the new slot receives a duplicate of
 * `value`, so the caller keeps ownership of `value`.
 *
 * On any failure the list is left exactly as it was: nothing already stored
 * is dropped or leaked, and the caller can still release it with
 * html_form_data_tag_free().
 *
 * @param value  NUL-terminated string to store. NULL is rejected.
 * @param tags   List to append to. NULL is rejected.
 * @return true if the string was stored and tags->count was incremented,
 *         false if an argument was NULL or an allocation failed.
 */
bool html_insert_form_data(const char *const value, form_data_t *tags)
{
    char **grown = NULL;
    char *copy   = NULL;

    if (NULL == value || NULL == tags) {
        return false;
    }

    /* Duplicate first, so a failed copy never leaves a half-grown array. */
    copy = cli_safer_strdup(value);
    if (NULL == copy) {
        return false;
    }

    /*
     * Do NOT use cli_max_realloc_or_free here: on failure it would release the
     * array while the strings it points at are still owned by `tags`, leaking
     * every one of them. With cli_max_realloc the old array is expected to stay
     * valid on failure, so `tags` is simply left untouched.
     */
    grown = cli_max_realloc(tags->urls, (tags->count + 1) * sizeof(*tags->urls));
    if (NULL == grown) {
        free(copy);
        return false;
    }

    grown[tags->count] = copy;
    tags->urls         = grown;
    tags->count++;

    return true;
}

/**
 * @brief Release every string held by a form_data_t and the array itself.
 *
 * Afterwards the structure is empty (urls == NULL, count == 0), so it can be
 * freed again or reused without touching stale slots. The form_data_t object
 * itself is not freed.
 *
 * @param tags  List to release. NULL is accepted and ignored.
 */
void html_form_data_tag_free(form_data_t *tags)
{
    size_t i;

    if (NULL == tags) {
        return;
    }

    if (NULL != tags->urls) {
        for (i = 0; i < tags->count; i++) {
            CLI_FREE_AND_SET_NULL(tags->urls[i]);
        }
        CLI_FREE_AND_SET_NULL(tags->urls);
    }

    /* Keep count consistent with the now-NULL array. */
    tags->count = 0;
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data)
{
int fd_tmp, tag_length = 0, tag_arg_length = 0;
bool binary, retval = false, escape = false, hex = false;
Expand All @@ -659,7 +696,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
FILE *stream_in = NULL;
html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
tag_arguments_t tag_args;
quoted_state quoted = NOT_QUOTED;
Expand Down Expand Up @@ -1224,8 +1261,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
href_contents_begin = ptr;
}
if (strcmp(tag, "/form") == 0) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = NULL;
}
} else if (strcmp(tag, "script") == 0) {
Expand Down Expand Up @@ -1310,9 +1348,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
} else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
if (arg_action_value) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
if (form_data){
html_insert_form_data((const char * const) in_form_action, form_data);
}
}
} else if (strcmp(tag, "img") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
Expand Down Expand Up @@ -1917,8 +1959,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
done:
if (line) /* only needed for done case */
free(line);
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs, in_ahref, &contents);

Expand Down Expand Up @@ -1960,6 +2003,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
}

/*
 * Normalise an in-memory HTML buffer without collecting form data.
 * Forwards every argument to html_normalise_mem_form_data() and passes NULL
 * for the form-data list.
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    const bool normalised = html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);

    return normalised;
}

bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data)
{
m_area_t m_area;

Expand All @@ -1968,18 +2016,23 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
m_area.offset = 0;
m_area.map = NULL;

return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
}

/*
 * Normalise the HTML held in an fmap without collecting form data.
 * Forwards every argument to html_normalise_map_form_data() and passes NULL
 * for the form-data list.
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    const bool normalised = html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);

    return normalised;
}

/**
 * @brief Normalise the HTML held in an fmap, optionally collecting form data.
 *
 * Builds an m_area_t covering the whole map (offset 0, length map->len) and
 * hands it to cli_html_normalise() with no file descriptor (-1).
 *
 * @param ctx        Scan context, forwarded unchanged.
 * @param map        Source map; map->len gives the amount of data to process.
 * @param dirname    Forwarded unchanged to cli_html_normalise().
 * @param hrefs      Forwarded unchanged to cli_html_normalise().
 * @param dconf      Forwarded unchanged to cli_html_normalise().
 * @param form_data  Optional list that receives form data; may be NULL
 *                   (html_normalise_map() passes NULL).
 * @return The result of cli_html_normalise().
 */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
    bool retval = false;
    m_area_t m_area;

    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;

    /* Single call using the seven-parameter signature, including form_data. */
    retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
    return retval;
}

Expand Down
9 changes: 9 additions & 0 deletions libclamav/htmlnorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,19 @@ typedef struct m_area_tag {
fmap_t *map;
} m_area_t;

/*
 * List of URL strings collected from HTML while normalising.
 * Entries are released with html_form_data_tag_free().
 */
typedef struct form_data_tag {
    char **urls;  /* array of `count` individually allocated strings */
    size_t count; /* number of valid entries in `urls` */
} form_data_t;

/* Normalise an in-memory HTML buffer; same as html_normalise_mem_form_data()
 * with a NULL form_data argument. */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
/* As html_normalise_mem(), additionally taking an optional form_data list.
 * When form_data is non-NULL, <form> "action" values seen during
 * normalisation are copied into it; the caller releases them with
 * html_form_data_tag_free(). */
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data);
/* Normalise the HTML held in an fmap; same as html_normalise_map_form_data()
 * with a NULL form_data argument. */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
/* As html_normalise_map(), additionally taking an optional form_data list
 * (may be NULL). */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t * form_data);
void html_tag_arg_free(tag_arguments_t *tags);
bool html_screnc_decode(fmap_t *map, const char *dirname);
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);

/* Free every string in tags->urls and the urls array itself; the form_data_t
 * object is not freed. */
void html_form_data_tag_free(form_data_t *tags);

#endif
1 change: 1 addition & 0 deletions libclamav/others.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
/* Non-zero when CL_SCAN_STORE_HTML_URLS is set in the general scan options.
 * Expands to an expression that uses a variable named `ctx`, which must be in
 * scope at the point of use. */
#define STORE_HTML_URLS (ctx->options->general & CL_SCAN_STORE_HTML_URLS)

#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)
Expand Down
Loading

0 comments on commit ba05070

Please sign in to comment.