Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clam 2586 save urls html #1281

Merged
merged 1 commit into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clamscan/clamscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,8 @@ void help(void)
mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). For testing & development use ONLY.\n");
mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n");
mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n");
mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n");
mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n");
mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n");
mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n");
mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n");
Expand Down
4 changes: 4 additions & 0 deletions clamscan/manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts)
options.general |= CL_SCAN_GENERAL_HEURISTICS;
}

if (optget(opts, "json-store-html-urls")->enabled) {
options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS;
}

/* TODO: Remove deprecated option in a future feature release */
if ((optget(opts, "block-max")->enabled) ||
(optget(opts, "alert-exceeds-max")->enabled)) {
Expand Down
1 change: 1 addition & 0 deletions common/optparser.c
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = {
{"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"},

{"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. This option\ncontrols the algorithmic detection.", "yes"},
{"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML <form and <a tags.", "yes"},
micahsnyder marked this conversation as resolved.
Show resolved Hide resolved

{"HeuristicScanPrecedence", "heuristic-scan-precedence", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 0, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Allow heuristic match to take precedence.\nWhen enabled, if a heuristic scan (such as phishingScan) detects\na possible virus/phish it will stop scan immediately. Recommended, saves CPU\nscan-time.\nWhen disabled, virus/phish detected by heuristic scans will be reported only\nat the end of a scan. If an archive contains both a heuristically detected\nvirus/phish, and a real malware, the real malware will be reported.\nKeep this disabled if you intend to handle \"Heuristics.*\" viruses\ndifferently from \"real\" malware.\nIf a non-heuristically-detected virus (signature-based) is found first,\nthe scan is interrupted immediately, regardless of this config option.", "yes"},

Expand Down
6 changes: 6 additions & 0 deletions etc/clamd.conf.sample
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@ Example
# Default: no
#GenerateMetadataJson yes

# Store URLs found in HTML files in the JSON metadata.
# URLs will be stored in an array with the tag 'HTMLUrls'.
# GenerateMetadataJson is required for this feature.
# Default: yes (if GenerateMetadataJson is used)
#JsonStoreHTMLUrls no

# Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject
# any ALLMATCHSCAN command as invalid.
# Default: yes
Expand Down
1 change: 1 addition & 0 deletions libclamav/clamav.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ struct cl_scan_options {
#define CL_SCAN_GENERAL_HEURISTICS 0x4 /* option to enable heuristic alerts */
#define CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE 0x8 /* allow heuristic match to take precedence. */
#define CL_SCAN_GENERAL_UNPRIVILEGED 0x10 /* scanner will not have read access to files. */
#define CL_SCAN_GENERAL_STORE_HTML_URLS 0x20 /* Store urls found in html <a and <form tags when recording JSON metadata */

/* parsing capabilities options */
#define CL_SCAN_PARSE_ARCHIVE 0x1
Expand Down
6 changes: 3 additions & 3 deletions libclamav/hashtab.c
Original file line number Diff line number Diff line change
Expand Up @@ -719,9 +719,9 @@ void cli_hashset_destroy(struct cli_hashset *hs)
hs->capacity = 0;
}

#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f)))
#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f)))
#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f)))

/*
* searches the hashset for the @key.
Expand Down
126 changes: 100 additions & 26 deletions libclamav/htmlnorm.c
Original file line number Diff line number Diff line change
Expand Up @@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags,
const char *tag, char *value)
{
int len, i;
tags->count++;
tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag,
tags->count * sizeof(char *));
if (!tags->tag) {
int tagCnt = tags->count;
int valueCnt = tags->count;
int contentCnt = 0;
unsigned char **tmp = NULL;

tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value,
tags->count * sizeof(char *));
if (!tags->value) {
tags->tag = tmp;
tagCnt++;

tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *));
if (!tmp) {
goto done;
}
tags->value = tmp;
valueCnt++;

if (tags->scanContents) {
tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents,
tags->count * sizeof(*tags->contents));
if (!tags->contents) {
contentCnt = tags->count;
tmp = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 1) * sizeof(*tags->contents));
if (!tmp) {
goto done;
}
tags->contents[tags->count - 1] = NULL;
tags->contents = tmp;
tags->contents[contentCnt] = NULL;
contentCnt++;
}
tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag);

tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag);
if (value) {
if (*value == '"') {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1);
len = strlen((const char *)value + 1);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1);
if (NULL == tags->value[tags->count]) {
goto done;
}
len = strlen((const char *)value + 1);
if (len > 0) {
tags->value[tags->count - 1][len - 1] = '\0';
tags->value[tags->count][len - 1] = '\0';
}
} else {
tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value);
tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value);
}
} else {
tags->value[tags->count - 1] = NULL;
tags->value[tags->count] = NULL;
}

tags->count++;
return;

done:
/* Bad error - can't do 100% recovery */
tags->count--;
for (i = 0; i < tags->count; i++) {
for (i = 0; i < tagCnt; i++) {
if (tags->tag) {
free(tags->tag[i]);
}
}
for (i = 0; i < valueCnt; i++) {
if (tags->value) {
free(tags->value[i]);
}
}
for (i = 0; i < contentCnt; i++) {
if (tags->contents) {
if (tags->contents[i])
free(tags->contents[i]);
Expand Down Expand Up @@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be
}
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
/**
 * @brief Append a copy of a URL to a form_data_t URL list.
 *
 * The string is duplicated with cli_safer_strdup() and the pointer array is
 * grown by one slot with cli_max_realloc().
 *
 * On any failure the list is left exactly as it was before the call. URLs
 * stored by earlier calls stay valid and are still released by
 * html_form_data_tag_free(); nothing is zeroed out and therefore nothing is
 * leaked.
 *
 * @param value  URL string to copy into the list.
 * @param tags   List to append to.
 * @return true if the URL was stored; false if `tags` is NULL, the string
 *         could not be duplicated, or the array could not be grown.
 */
bool html_insert_form_data(const char *const value, form_data_t *tags)
{
    char **grown = NULL;
    char *copy   = NULL;

    if (NULL == tags) {
        return false;
    }

    /* Duplicate first so a failed copy never leaves a grown-but-unused slot. */
    copy = cli_safer_strdup(value);
    if (NULL == copy) {
        return false;
    }

    /*
     * Do NOT use cli_max_realloc_or_free because all the previously malloc'd
     * URL strings would be leaked when the array is free'd in the case where
     * realloc fails. With plain cli_max_realloc the old array is kept and
     * remains owned by `tags`.
     */
    grown = cli_max_realloc(tags->urls, (tags->count + 1) * sizeof(*grown));
    if (NULL == grown) {
        free(copy);
        return false;
    }

    grown[tags->count] = copy;
    tags->urls         = grown;
    tags->count++;

    return true;
}

/**
 * @brief Release every URL held by a form_data_t and reset it to empty.
 *
 * Frees each string in `tags->urls`, then the array itself, and finally sets
 * `tags->count` back to 0. Resetting the count keeps the structure
 * consistent, so calling this function again on the same object (or iterating
 * over it afterwards) does not index into the now-NULL array.
 *
 * The form_data_t object itself is not freed; it is owned by the caller.
 *
 * @param tags  List to clear. NULL is accepted and ignored.
 */
void html_form_data_tag_free(form_data_t *tags)
{
    size_t i;

    if (NULL == tags) {
        return;
    }

    if (NULL != tags->urls) {
        for (i = 0; i < tags->count; i++) {
            CLI_FREE_AND_SET_NULL(tags->urls[i]);
        }
        CLI_FREE_AND_SET_NULL(tags->urls);
    }

    /* Keep count in step with the (now released) array. */
    tags->count = 0;
}

static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
int fd_tmp, tag_length = 0, tag_arg_length = 0;
bool binary, retval = false, escape = false, hex = false;
Expand All @@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
FILE *stream_in = NULL;
html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL;
unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
tag_arguments_t tag_args;
quoted_state quoted = NOT_QUOTED;
Expand Down Expand Up @@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
href_contents_begin = ptr;
}
if (strcmp(tag, "/form") == 0) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = NULL;
}
} else if (strcmp(tag, "script") == 0) {
Expand Down Expand Up @@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
} else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
if (arg_action_value) {
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
in_form_action = (unsigned char *)cli_safer_strdup(arg_action_value);
if (form_data) {
html_insert_form_data((const char *const)in_form_action, form_data);
}
}
} else if (strcmp(tag, "img") == 0) {
arg_value = html_tag_arg_value(&tag_args, "src");
Expand Down Expand Up @@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
done:
if (line) /* only needed for done case */
free(line);
if (in_form_action)
if (in_form_action) {
free(in_form_action);
}
if (in_ahref) /* tag not closed, force closing */
html_tag_contents_done(hrefs, in_ahref, &contents);

Expand Down Expand Up @@ -1960,6 +2024,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha
}

/*
 * Normalise HTML held in a memory buffer without collecting form URLs.
 *
 * Forwards every argument unchanged to html_normalise_mem_form_data() and
 * passes NULL for its form_data parameter. Returns whatever that call returns.
 */
bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool normalised;

    normalised = html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL);

    return normalised;
}

bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
m_area_t m_area;

Expand All @@ -1968,18 +2037,23 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con
m_area.offset = 0;
m_area.map = NULL;

return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf);
return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data);
}

/*
 * Normalise HTML read from an fmap without collecting form URLs.
 *
 * Forwards every argument unchanged to html_normalise_map_form_data() and
 * passes NULL for its form_data parameter. Returns whatever that call returns.
 */
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    bool normalised;

    normalised = html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL);

    return normalised;
}

/*
 * Normalise HTML read from an fmap, optionally collecting form URLs.
 *
 * Builds an m_area_t that covers the whole map (offset 0, length map->len)
 * and hands it to cli_html_normalise() with a file descriptor of -1.
 * `form_data` is passed through as-is; html_normalise_map() passes NULL here.
 *
 * Returns the result of cli_html_normalise().
 */
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data)
{
    m_area_t area;

    area.map    = map;
    area.offset = 0;
    area.length = map->len;

    return cli_html_normalise(ctx, -1, &area, dirname, hrefs, dconf, form_data);
}

Expand Down
9 changes: 9 additions & 0 deletions libclamav/htmlnorm.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,19 @@ typedef struct m_area_tag {
fmap_t *map;
} m_area_t;

/*
 * List of URL strings collected while normalising HTML.
 * Entries are appended by html_insert_form_data() and the whole list is
 * released with html_form_data_tag_free().
 */
typedef struct form_data_tag {
    /* Array of URL strings; each entry is a copy made with cli_safer_strdup(). */
    char **urls;
    /* Number of entries stored in `urls`. */
    size_t count;
} form_data_t;

bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf);
bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data);
void html_tag_arg_free(tag_arguments_t *tags);
bool html_screnc_decode(fmap_t *map, const char *dirname);
void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value);

void html_form_data_tag_free(form_data_t *tags);

#endif
1 change: 1 addition & 0 deletions libclamav/others.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar;
#define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS)
#define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE)
#define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED)
#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS)

#define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE)
#define SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF)
Expand Down
Loading
Loading