Skip to content

Commit

Permalink
Store URLs from HTML when recording scan metadata json
Browse files Browse the repository at this point in the history
Store URLs found in HTML `<a>` and `<form>` tags during scan of HTML files
when recording scan metadata.

HTML URL recording will be ON by default, but is a part of the
generate-metadata-json feature.
The generate-metadata-json feature is OFF by default.

This introduces a new general scan option:
- libclamav: `CL_SCAN_GENERAL_STORE_HTML_URLS`.
- ClamD: `JsonStoreHTMLUrls`.
- ClamScan: `--json-store-html-urls`

Thank you Matt Jolly for the helpful comment on the pull request.
  • Loading branch information
ragusaa authored and micahsnyder committed Sep 11, 2024
1 parent 8ae19ec commit 666e047
Show file tree
Hide file tree
Showing 13 changed files with 669 additions and 30 deletions.
2 changes: 2 additions & 0 deletions clamscan/clamscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,8 @@ void help(void)
mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n");
mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n");
mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n");
mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n");
Expand Down
4 changes: 4 additions & 0 deletions clamscan/manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}

if (optget(opts, "json-store-html-urls")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
}

/* TODO: Remove deprecated option in a future feature release */
if ((optget(opts, "block-max")->enabled) ||
(optget(opts, "alert-exceeds-max")->enabled)) {
Expand Down
1 change: 1 addition & 0 deletions common/optparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},

{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},

{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

Expand Down
6 changes: 6 additions & 0 deletions etc/clamd.conf.sample
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@ Example
# Default: no
#GenerateMetadataJson yes

# Store URLs found in HTML files in the JSON metadata.
# URLs will be stored in an array named 'HTMLUrls'.
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no

# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.
# Default: yes
Expand Down
1 change: 1 addition & 0 deletions libclamav/clamav.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ struct cl_scan_options {
#define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */
#define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */
#define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */
#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */

/* parsing capabilities options */
#define CL_SCAN_PARSE_ARCHIVE 0x1
Expand Down
6 changes: 3 additions & 3 deletions libclamav/hashtab.c
Original file line number Diff line number Diff line change
Expand Up @@ -719,9 +719,9 @@ void cli_hashset_destroy(struct cli_hashset *hs)
hs->capacity = 0;
}

#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f)))

/*
* searches the hashset for the @key.
Expand Down
126 changes: 100 additions & 26 deletions libclamav/htmlnorm.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags,
const char *tag, char *value)
{
int len, i;
tags->count++;
tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
tags->count * sizeof(char *));
if (!tags->tag) {
int tagCnt = tags->count;
int valueCnt = tags->count;
int contentCnt = 0;
unsigned char **tmp = NULL;

tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
tags->count * sizeof(char *));
if (!tags->value) {
tags->tag = tmp;
tagCnt++;

tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = tmp;
valueCnt++;

if (tags->scanContents) {
tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
tags->count * sizeof(*tags->contents));
if (!tags->contents) {
contentCnt = tags->count;
tmp = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 1) * sizeof(*tags->contents));
if (!tmp) {
goto done;
}
tags->contents[tags->count - 1] = NULL;
tags->contents = tmp;
tags->contents[contentCnt] = NULL;
contentCnt++;
}
tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);

tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag);
if (value) {
if (*value == '"') {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);
len = strlen((const char *)value + 1);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1);
if (NULL == tags->value[tags->count]) {
goto done;
}
len = strlen((const char *)value + 1);
if (len > 0) {
tags->value[tags->count - 1][len - 1] = '\0';
tags->value[tags->count][len - 1] = '\0';
}
} else {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value);
}
} else {
tags->value[tags->count - 1] = NULL;
tags->value[tags->count] = NULL;
}

tags->count++;
return;

done:
/* Bad error - can't do 100% recovery */
tags->count--;
for (i = 0; i < tags->count; i++) {
for (i = 0; i < tagCnt; i++) {
if (tags->tag) {
free(tags->tag[i]);
}
}
for (i = 0; i < valueCnt; i++) {
if (tags->value) {
free(tags->value[i]);
}
}
for (i = 0; i < contentCnt; i++) {
if (tags->contents) {
if (tags->contents[i])
free(tags->contents[i]);
Expand Down Expand Up @@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
}
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
/**
 * @brief Append a copy of a URL string to a form_data_t list.
 *
 * The URL array is grown by one slot and a duplicate of `value` is stored in
 * the new slot. If anything fails, `tags->urls` entries and `tags->count`
 * recorded so far are kept as they were, so the caller can still release
 * them with html_form_data_tag_free(). (The previous implementation zeroed
 * the structure on failure, which dropped the only pointers to the already
 * stored strings and leaked them.)
 *
 * @param value  URL string to duplicate; may be NULL, in which case nothing is stored.
 * @param tags   List to append to; may be NULL, in which case nothing is stored.
 * @return true if the URL was stored, false otherwise.
 */
bool html_insert_form_data(const char *const value, form_data_t *tags)
{
    char **grown = NULL;
    char *copy   = NULL;

    if (NULL == tags || NULL == value) {
        return false;
    }

    /*
     * Use cli_max_realloc rather than cli_max_realloc_or_free: the latter
     * would release the array on failure and every URL string previously
     * stored in it would be leaked.
     */
    grown = cli_max_realloc(tags->urls, (tags->count + 1) * sizeof(*tags->urls));
    if (NULL == grown) {
        /* tags->urls / tags->count are untouched and remain usable. */
        return false;
    }
    tags->urls = grown;

    copy = cli_safer_strdup(value);
    if (NULL == copy) {
        /* The extra slot stays unused; count still describes valid entries only. */
        return false;
    }

    tags->urls[tags->count] = copy;
    tags->count++;

    return true;
}

/**
 * @brief Release every URL string held by a form_data_t and the array itself.
 *
 * After the call `tags->urls` is NULL and `tags->count` is 0, so the
 * structure can be freed again or reused without touching stale entries.
 * (Previously `count` kept its old value while `urls` was set to NULL, so a
 * second call would index through a NULL array.)
 *
 * @param tags  List to release; NULL is accepted and ignored.
 */
void html_form_data_tag_free(form_data_t *tags)
{
    size_t i;

    if (NULL == tags) {
        return;
    }

    if (NULL != tags->urls) {
        for (i = 0; i < tags->count; i++) {
            CLI_FREE_AND_SET_NULL(tags->urls[i]);
        }
        CLI_FREE_AND_SET_NULL(tags->urls);
    }

    /* Keep count consistent with the now-empty array. */
    tags->count = 0;
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
int fd_tmp, tag_length = 0, tag_arg_length = 0;
bool binary, retval = false, escape = false, hex = false;
Expand All @@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
FILE *stream_in = NULL;
html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
tag_arguments_t tag_args;
quoted_state quoted = NOT_QUOTED;
Expand Down Expand Up @@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
href_contents_begin = ptr;
}
if (strcmp(tag, "/form") == 0) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = NULL;
}
} else if (strcmp(tag, "script") == 0) {
Expand Down Expand Up @@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
} else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
if (arg_action_value) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
if (form_data) {
html_insert_form_data((const char *const)in_form_action, form_data);
}
}
} else if (strcmp(tag, "img") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
Expand Down Expand Up @@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
done:
if (line) /* only needed for done case */
free(line);
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs, in_ahref, &contents);

Expand Down Expand Up @@ -1960,6 +2024,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
}

/*
 * Normalise HTML held in a memory buffer without collecting form data.
 * Thin wrapper: forwards every argument to html_normalise_mem_form_data()
 * and passes NULL for the form_data collector.
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool ok;

    ok = html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);

    return ok;
}

bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
m_area_t m_area;

Expand All @@ -1968,18 +2037,23 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
m_area.offset = 0;
m_area.map = NULL;

return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
}

/*
 * Normalise HTML backed by an fmap without collecting form data.
 * Thin wrapper: forwards every argument to html_normalise_map_form_data()
 * and passes NULL for the form_data collector.
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool ok;

    ok = html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);

    return ok;
}

/*
 * Normalise HTML backed by an fmap, optionally collecting form data.
 *
 * Builds an m_area_t that covers the whole map (length = map->len,
 * offset = 0) and hands it to cli_html_normalise() with fd = -1.
 * form_data is passed through unchanged; html_normalise_map() passes NULL here.
 * Returns whatever cli_html_normalise() returns.
 */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
bool retval = false;
m_area_t m_area;

m_area.length = map->len;
m_area.offset = 0;
m_area.map = map;
/* NOTE(review): this six-argument call is immediately overwritten by the
 * seven-argument call below and omits form_data -- it looks like the
 * removed line from the diff view rather than intended code; confirm. */
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
return retval;
}

Expand Down
9 changes: 9 additions & 0 deletions libclamav/htmlnorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,19 @@ typedef struct m_area_tag {
fmap_t *map;
} m_area_t;

/* List of URL strings collected while normalising HTML
 * (entries are appended by html_insert_form_data() and released by
 * html_form_data_tag_free()). */
typedef struct form_data_tag {
/* Array of individually allocated URL string copies. */
char **urls;
/* Number of URL strings currently stored in urls. */
size_t count;
} form_data_t;

bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
void html_tag_arg_free(tag_arguments_t *tags);
bool html_screnc_decode(fmap_t *map, const char *dirname);
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);

void html_form_data_tag_free(form_data_t *tags);

#endif
1 change: 1 addition & 0 deletions libclamav/others.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)

#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)
Expand Down
Loading

0 comments on commit 666e047

Please sign in to comment.