From 666e047f2bbf7137ae246211169b68e898ca0397 Mon Sep 17 00:00:00 2001 From: Andy Ragusa Date: Fri, 7 Jun 2024 10:20:57 -0700 Subject: [PATCH] Store URLs from HTML when recording scan metadata json Store URLs found in HTML `<a>` and `<form>` tags during scan of HTML files when recording scan metadata. HTML URL recording will be ON by default, but is a part of the generate-metadata-json feature. The generate-metadata-json feature is OFF by default. This introduces a new general scan option: - libclamav: `CL_SCAN_GENERAL_STORE_HTML_URLS`. - ClamD: `JsonStoreHTMLUrls`. - ClamScan: `--json-store-html-urls` Thank you Matt Jolly for the helpful comment on the pull request. --- clamscan/clamscan.c | 2 + clamscan/manager.c | 4 + common/optparser.c | 1 + etc/clamd.conf.sample | 6 + libclamav/clamav.h | 1 + libclamav/hashtab.c | 6 +- libclamav/htmlnorm.c | 126 ++++- libclamav/htmlnorm.h | 9 + libclamav/others.h | 1 + libclamav/scanners.c | 459 +++++++++++++++++- unit_tests/clamscan/save_html_urls_test.py | 62 +++ .../input/other_scanfiles/html/index.html | 16 + win32/conf_examples/clamd.conf.sample | 6 + 13 files changed, 669 insertions(+), 30 deletions(-) create mode 100644 unit_tests/clamscan/save_html_urls_test.py create mode 100644 unit_tests/input/other_scanfiles/html/index.html diff --git a/clamscan/clamscan.c b/clamscan/clamscan.c index 2b10a1023b..f3e32f26fa 100644 --- a/clamscan/clamscan.c +++ b/clamscan/clamscan.c @@ -254,6 +254,8 @@ void help(void) mprintf(LOGG_INFO, " --gen-json[=yes/no(*)] Generate JSON metadata for the scanned file(s). 
For testing & development use ONLY.\n"); mprintf(LOGG_INFO, " JSON will be printed if --debug is enabled.\n"); mprintf(LOGG_INFO, " A JSON file will dropped to the temp directory if --leave-temps is enabled.\n"); + mprintf(LOGG_INFO, " --json-store-html-urls[=yes(*)/no] Store html URLs in metadata.\n"); + mprintf(LOGG_INFO, " URLs will be written to the metadata.json file in an array called 'HTMLUrls'\n"); mprintf(LOGG_INFO, " --database=FILE/DIR -d FILE/DIR Load virus database from FILE or load all supported db files from DIR\n"); mprintf(LOGG_INFO, " --official-db-only[=yes/no(*)] Only load official signatures\n"); mprintf(LOGG_INFO, " --fail-if-cvd-older-than=days Return with a nonzero error code if virus database outdated.\n"); diff --git a/clamscan/manager.c b/clamscan/manager.c index 668bfcd39b..cd9564b2dd 100644 --- a/clamscan/manager.c +++ b/clamscan/manager.c @@ -1574,6 +1574,10 @@ int scanmanager(const struct optstruct *opts) options.general |= CL_SCAN_GENERAL_HEURISTICS; } + if (optget(opts, "json-store-html-urls")->enabled) { + options.general |= CL_SCAN_GENERAL_STORE_HTML_URLS; + } + /* TODO: Remove deprecated option in a future feature release */ if ((optget(opts, "block-max")->enabled) || (optget(opts, "alert-exceeds-max")->enabled)) { diff --git a/common/optparser.c b/common/optparser.c index dd99f43eb2..bbf3bfa2f6 100644 --- a/common/optparser.c +++ b/common/optparser.c @@ -389,6 +389,7 @@ const struct clam_option __clam_options[] = { {"PhishingScanURLs", "phishing-scan-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Scan URLs found in mails for phishing attempts using heuristics.", "yes"}, {"HeuristicAlerts", "heuristic-alerts", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "In some cases (eg. complex malware, exploits in graphic files, and others),\nClamAV uses special algorithms to provide accurate detection. 
This option\ncontrols the algorithmic detection.", "yes"}, + {"JsonStoreHTMLUrls", "json-store-html-urls", 0, CLOPT_TYPE_BOOL, MATCH_BOOL, 1, NULL, 0, OPT_CLAMD | OPT_CLAMSCAN, "Store URLs found in HTML capacity = 0; } -#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val) & 0x1f))) -#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val) & 0x1f))) -#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val) & 0x1f))) +#define BITMAP_CONTAINS(bmap, val) ((bmap)[(val) >> 5] & ((uint64_t)1 << ((val)&0x1f))) +#define BITMAP_INSERT(bmap, val) ((bmap)[(val) >> 5] |= ((uint64_t)1 << ((val)&0x1f))) +#define BITMAP_REMOVE(bmap, val) ((bmap)[(val) >> 5] &= ~((uint64_t)1 << ((val)&0x1f))) /* * searches the hashset for the @key. diff --git a/libclamav/htmlnorm.c b/libclamav/htmlnorm.c index edd1bc00d7..95cfb20b26 100644 --- a/libclamav/htmlnorm.c +++ b/libclamav/htmlnorm.c @@ -370,51 +370,70 @@ void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value) { int len, i; - tags->count++; - tags->tag = (unsigned char **)cli_max_realloc_or_free(tags->tag, - tags->count * sizeof(char *)); - if (!tags->tag) { + int tagCnt = tags->count; + int valueCnt = tags->count; + int contentCnt = 0; + unsigned char **tmp = NULL; + + tmp = (unsigned char **)cli_max_realloc(tags->tag, (tagCnt + 1) * sizeof(char *)); + if (!tmp) { goto done; } - tags->value = (unsigned char **)cli_max_realloc_or_free(tags->value, - tags->count * sizeof(char *)); - if (!tags->value) { + tags->tag = tmp; + tagCnt++; + + tmp = (unsigned char **)cli_max_realloc(tags->value, (valueCnt + 1) * sizeof(char *)); + if (!tmp) { goto done; } + tags->value = tmp; + valueCnt++; + if (tags->scanContents) { - tags->contents = (unsigned char **)cli_max_realloc_or_free(tags->contents, - tags->count * sizeof(*tags->contents)); - if (!tags->contents) { + contentCnt = tags->count; + tmp = (unsigned char **)cli_max_realloc(tags->contents, (contentCnt + 
1) * sizeof(*tags->contents)); + if (!tmp) { goto done; } - tags->contents[tags->count - 1] = NULL; + tags->contents = tmp; + tags->contents[contentCnt] = NULL; + contentCnt++; } - tags->tag[tags->count - 1] = (unsigned char *)cli_safer_strdup(tag); + + tags->tag[tags->count] = (unsigned char *)cli_safer_strdup(tag); if (value) { if (*value == '"') { - tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value + 1); - len = strlen((const char *)value + 1); + tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value + 1); + if (NULL == tags->value[tags->count]) { + goto done; + } + len = strlen((const char *)value + 1); if (len > 0) { - tags->value[tags->count - 1][len - 1] = '\0'; + tags->value[tags->count][len - 1] = '\0'; } } else { - tags->value[tags->count - 1] = (unsigned char *)cli_safer_strdup(value); + tags->value[tags->count] = (unsigned char *)cli_safer_strdup(value); } } else { - tags->value[tags->count - 1] = NULL; + tags->value[tags->count] = NULL; } + + tags->count++; return; done: /* Bad error - can't do 100% recovery */ - tags->count--; - for (i = 0; i < tags->count; i++) { + for (i = 0; i < tagCnt; i++) { if (tags->tag) { free(tags->tag[i]); } + } + for (i = 0; i < valueCnt; i++) { if (tags->value) { free(tags->value[i]); } + } + for (i = 0; i < contentCnt; i++) { if (tags->contents) { if (tags->contents[i]) free(tags->contents[i]); @@ -649,7 +668,46 @@ static void js_process(struct parser_state *js_state, const unsigned char *js_be } } -static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) +bool html_insert_form_data(const char *const value, form_data_t *tags) +{ + bool bRet = false; + size_t cnt = tags->count + 1; + char **tmp = NULL; + + /* + * Do NOT use cli_max_realloc_or_free because all the previously malloc'd tag + * values will be leaked when tag is free'd in the case where realloc fails. 
+ */ + tmp = cli_max_realloc(tags->urls, cnt * sizeof(unsigned char *)); + if (!tmp) { + goto done; + } + tags->urls = tmp; + + tags->urls[tags->count] = cli_safer_strdup(value); + if (tags->urls[tags->count]) { + tags->count = cnt; + } + + bRet = true; +done: + if (!bRet) { + memset(tags, 0, sizeof(*tags)); + } + + return bRet; +} + +void html_form_data_tag_free(form_data_t *tags) +{ + size_t i; + for (i = 0; i < tags->count; i++) { + CLI_FREE_AND_SET_NULL(tags->urls[i]); + } + CLI_FREE_AND_SET_NULL(tags->urls); +} + +static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data) { int fd_tmp, tag_length = 0, tag_arg_length = 0; bool binary, retval = false, escape = false, hex = false; @@ -659,7 +717,7 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha FILE *stream_in = NULL; html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE; char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1]; - char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value; + char tag_val[HTML_STR_LENGTH + 1], *tmp_file = NULL, *arg_value = NULL; unsigned char *line = NULL, *ptr, *ptr_screnc = NULL; tag_arguments_t tag_args; quoted_state quoted = NOT_QUOTED; @@ -1224,8 +1282,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha href_contents_begin = ptr; } if (strcmp(tag, "/form") == 0) { - if (in_form_action) + if (in_form_action) { free(in_form_action); + } in_form_action = NULL; } } else if (strcmp(tag, "script") == 0) { @@ -1310,9 +1369,13 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) { const char *arg_action_value = html_tag_arg_value(&tag_args, "action"); if (arg_action_value) { - if (in_form_action) + if (in_form_action) { free(in_form_action); + } in_form_action 
= (unsigned char *)cli_safer_strdup(arg_action_value); + if (form_data) { + html_insert_form_data((const char *const)in_form_action, form_data); + } } } else if (strcmp(tag, "img") == 0) { arg_value = html_tag_arg_value(&tag_args, "src"); @@ -1917,8 +1980,9 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha done: if (line) /* only needed for done case */ free(line); - if (in_form_action) + if (in_form_action) { free(in_form_action); + } if (in_ahref) /* tag not closed, force closing */ html_tag_contents_done(hrefs, in_ahref, &contents); @@ -1960,6 +2024,11 @@ static bool cli_html_normalise(cli_ctx *ctx, int fd, m_area_t *m_area, const cha } bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) +{ + return html_normalise_mem_form_data(ctx, in_buff, in_size, dirname, hrefs, dconf, NULL); +} + +bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data) { m_area_t m_area; @@ -1968,10 +2037,15 @@ bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, con m_area.offset = 0; m_area.map = NULL; - return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf); + return cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data); } bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf) +{ + return html_normalise_map_form_data(ctx, map, dirname, hrefs, dconf, NULL); +} + +bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data) { bool retval = false; m_area_t m_area; @@ -1979,7 +2053,7 @@ bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_argu m_area.length = map->len; m_area.offset = 0; 
m_area.map = map; - retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf); + retval = cli_html_normalise(ctx, -1, &m_area, dirname, hrefs, dconf, form_data); return retval; } diff --git a/libclamav/htmlnorm.h b/libclamav/htmlnorm.h index 72524165a6..3ab6f8029b 100644 --- a/libclamav/htmlnorm.h +++ b/libclamav/htmlnorm.h @@ -45,10 +45,19 @@ typedef struct m_area_tag { fmap_t *map; } m_area_t; +typedef struct form_data_tag { + char **urls; + size_t count; +} form_data_t; + bool html_normalise_mem(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf); +bool html_normalise_mem_form_data(cli_ctx *ctx, unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data); bool html_normalise_map(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf); +bool html_normalise_map_form_data(cli_ctx *ctx, fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf, form_data_t *form_data); void html_tag_arg_free(tag_arguments_t *tags); bool html_screnc_decode(fmap_t *map, const char *dirname); void html_tag_arg_add(tag_arguments_t *tags, const char *tag, char *value); +void html_form_data_tag_free(form_data_t *tags); + #endif diff --git a/libclamav/others.h b/libclamav/others.h index 7f0d267255..18f1e72e45 100644 --- a/libclamav/others.h +++ b/libclamav/others.h @@ -552,6 +552,7 @@ extern LIBCLAMAV_EXPORT int have_rar; #define SCAN_HEURISTICS (ctx->options->general & CL_SCAN_GENERAL_HEURISTICS) #define SCAN_HEURISTIC_PRECEDENCE (ctx->options->general & CL_SCAN_GENERAL_HEURISTIC_PRECEDENCE) #define SCAN_UNPRIVILEGED (ctx->options->general & CL_SCAN_GENERAL_UNPRIVILEGED) +#define SCAN_STORE_HTML_URLS (ctx->options->general & CL_SCAN_GENERAL_STORE_HTML_URLS) #define SCAN_PARSE_ARCHIVE (ctx->options->parse & CL_SCAN_PARSE_ARCHIVE) #define 
SCAN_PARSE_ELF (ctx->options->parse & CL_SCAN_PARSE_ELF) diff --git a/libclamav/scanners.c b/libclamav/scanners.c index 3ef17bad8c..91f5be2979 100644 --- a/libclamav/scanners.c +++ b/libclamav/scanners.c @@ -2082,6 +2082,452 @@ static cl_error_t cli_ole2_tempdir_scan_for_xlm_and_images(const char *dir, cli_ return ret; } +const char *const HTML_URLS_JSON_KEY = "HTMLUrls"; +/* https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml */ +const char *URI_LIST[] = { + "aaa://", + "aaas://", + "about://", + "acap://", + "acct://", + "acd://", + "acr://", + "adiumxtra://", + "adt://", + "afp://", + "afs://", + "aim://", + "amss://", + "android://", + "appdata://", + "apt://", + "ar://", + "ark://", + "at://", + "attachment://", + "aw://", + "barion://", + "bb://", + "beshare://", + "bitcoin://", + "bitcoincash://", + "blob://", + "bolo://", + "brid://", + "browserext://", + "cabal://", + "calculator://", + "callto://", + "cap://", + "cast://", + "casts://", + "chrome://", + "chrome-extension://", + "cid://", + "coap://", + "coap+tcp://", + "coap+ws://", + "coaps://", + "coaps+tcp://", + "coaps+ws://", + "com-eventbrite-attendee://", + "content://", + "content-type://", + "crid://", + "cstr://", + "cvs://", + "dab://", + "dat://", + "data://", + "dav://", + "dhttp://", + "diaspora://", + "dict://", + "did://", + "dis://", + "dlna-playcontainer://", + "dlna-playsingle://", + "dns://", + "dntp://", + "doi://", + "dpp://", + "drm://", + "drop://", + "dtmi://", + "dtn://", + "dvb://", + "dvx://", + "dweb://", + "ed2k://", + "eid://", + "elsi://", + "embedded://", + "ens://", + "ethereum://", + "example://", + "facetime://", + "fax://", + "feed://", + "feedready://", + "fido://", + "file://", + "filesystem://", + "finger://", + "first-run-pen-experience://", + "fish://", + "fm://", + "ftp://", + "fuchsia-pkg://", + "geo://", + "gg://", + "git://", + "gitoid://", + "gizmoproject://", + "go://", + "gopher://", + "graph://", + "grd://", + "gtalk://", + "h323://", + 
"ham://", + "hcap://", + "hcp://", + "hs20://", + "http://", + "https://", + "hxxp://", + "hxxps://", + "hydrazone://", + "hyper://", + "iax://", + "icap://", + "icon://", + "im://", + "imap://", + "info://", + "iotdisco://", + "ipfs://", + "ipn://", + "ipns://", + "ipp://", + "ipps://", + "irc://", + "irc6://", + "ircs://", + "iris://", + "iris.beep://", + "iris.lwz://", + "iris.xpc://", + "iris.xpcs://", + "isostore://", + "itms://", + "jabber://", + "jar://", + "jms://", + "keyparc://", + "lastfm://", + "lbry://", + "ldap://", + "ldaps://", + "leaptofrogans://", + "lid://", + "lorawan://", + "lpa://", + "lvlt://", + "machineProvisioningProgressReporter://", + "magnet://", + "mailserver://", + "mailto://", + "maps://", + "market://", + "matrix://", + "message://", + "microsoft.windows.camera://", + "microsoft.windows.camera.multipicker://", + "microsoft.windows.camera.picker://", + "mid://", + "mms://", + "modem://", + "mongodb://", + "moz://", + "ms-access://", + "ms-appinstaller://", + "ms-browser-extension://", + "ms-calculator://", + "ms-drive-to://", + "ms-enrollment://", + "ms-excel://", + "ms-eyecontrolspeech://", + "ms-gamebarservices://", + "ms-gamingoverlay://", + "ms-getoffice://", + "ms-help://", + "ms-infopath://", + "ms-inputapp://", + "ms-launchremotedesktop://", + "ms-lockscreencomponent-config://", + "ms-media-stream-id://", + "ms-meetnow://", + "ms-mixedrealitycapture://", + "ms-mobileplans://", + "ms-newsandinterests://", + "ms-officeapp://", + "ms-people://", + "ms-project://", + "ms-powerpoint://", + "ms-publisher://", + "ms-recall://", + "ms-remotedesktop://", + "ms-remotedesktop-launch://", + "ms-restoretabcompanion://", + "ms-screenclip://", + "ms-screensketch://", + "ms-search://", + "ms-search-repair://", + "ms-secondary-screen-controller://", + "ms-secondary-screen-setup://", + "ms-settings://", + "ms-settings-airplanemode://", + "ms-settings-bluetooth://", + "ms-settings-camera://", + "ms-settings-cellular://", + 
"ms-settings-cloudstorage://", + "ms-settings-connectabledevices://", + "ms-settings-displays-topology://", + "ms-settings-emailandaccounts://", + "ms-settings-language://", + "ms-settings-location://", + "ms-settings-lock://", + "ms-settings-nfctransactions://", + "ms-settings-notifications://", + "ms-settings-power://", + "ms-settings-privacy://", + "ms-settings-proximity://", + "ms-settings-screenrotation://", + "ms-settings-wifi://", + "ms-settings-workplace://", + "ms-spd://", + "ms-stickers://", + "ms-sttoverlay://", + "ms-transit-to://", + "ms-useractivityset://", + "ms-virtualtouchpad://", + "ms-visio://", + "ms-walk-to://", + "ms-whiteboard://", + "ms-whiteboard-cmd://", + "ms-word://", + "msnim://", + "msrp://", + "msrps://", + "mss://", + "mt://", + "mtqp://", + "mumble://", + "mupdate://", + "mvn://", + "mvrp://", + "mvrps://", + "news://", + "nfs://", + "ni://", + "nih://", + "nntp://", + "notes://", + "num://", + "ocf://", + "oid://", + "onenote://", + "onenote-cmd://", + "opaquelocktoken://", + "openid://", + "openpgp4fpr://", + "otpauth://", + "p1://", + "pack://", + "palm://", + "paparazzi://", + "payment://", + "payto://", + "pkcs11://", + "platform://", + "pop://", + "pres://", + "prospero://", + "proxy://", + "pwid://", + "psyc://", + "pttp://", + "qb://", + "query://", + "quic-transport://", + "redis://", + "rediss://", + "reload://", + "res://", + "resource://", + "rmi://", + "rsync://", + "rtmfp://", + "rtmp://", + "rtsp://", + "rtsps://", + "rtspu://", + "sarif://", + "secondlife://", + "secret-token://", + "service://", + "session://", + "sftp://", + "sgn://", + "shc://", + "shttp://", + "sieve://", + "simpleledger://", + "simplex://", + "sip://", + "sips://", + "skype://", + "smb://", + "smp://", + "sms://", + "smtp://", + "snews://", + "snmp://", + "soap.beep://", + "soap.beeps://", + "soldat://", + "spiffe://", + "spotify://", + "ssb://", + "ssh://", + "starknet://", + "steam://", + "stun://", + "stuns://", + "submit://", + "svn://", + 
"swh://", + "swid://", + "swidpath://", + "tag://", + "taler://", + "teamspeak://", + "tel://", + "teliaeid://", + "telnet://", + "tftp://", + "things://", + "thismessage://", + "tip://", + "tn3270://", + "tool://", + "turn://", + "turns://", + "tv://", + "udp://", + "unreal://", + "upt://", + "urn://", + "ut2004://", + "uuid-in-package://", + "v-event://", + "vemmi://", + "ventrilo://", + "ves://", + "videotex://", + "vnc://", + "view-source://", + "vscode://", + "vscode-insiders://", + "vsls://", + "w3://", + "wais://", + "web3://", + "wcr://", + "webcal://", + "web+ap://", + "wifi://", + "wpid://", + "ws://", + "wss://", + "wtai://", + "wyciwyg://", + "xcon://", + "xcon-userid://", + "xfire://", + "xmlrpc.beep://", + "xmlrpc.beeps://", + "xmpp://", + "xftp://", + "xrcp://", + "xri://", + "ymsgr://", + "z39.50://", + "z39.50r://", + "z39.50s://"}; + +static bool is_url(const char *const str, size_t str_len) +{ + bool bRet = false; + size_t i; + + for (i = 0; i < sizeof(URI_LIST) / sizeof(URI_LIST[0]); i++) { + if (str && (str_len > strlen(URI_LIST[i])) && (0 == strncasecmp(str, URI_LIST[i], strlen(URI_LIST[i])))) { + bRet = true; + goto done; + } + } +done: + return bRet; +} + +static void save_urls(cli_ctx *ctx, tag_arguments_t *hrefs, form_data_t *form_data) +{ + int i = 0; + json_object *ary = NULL; + + if (NULL == hrefs) { + return; + } + + if (ctx->wrkproperty != ctx->properties) { + return; + } + + if (!(SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL))) { + return; + } + + /*Add hrefs*/ + for (i = 0; i < hrefs->count; i++) { + if (is_url((const char *)hrefs->value[i], strlen((const char *)hrefs->value[i]))) { + if (NULL == ary) { + ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + if (!ary) { + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + return; + } + } + cli_jsonstr(ary, NULL, (const char *)hrefs->value[i]); + } + } + + /*Add form_data*/ + for (i = 0; i < 
(int)form_data->count; i++) { + if (is_url((const char *)form_data->urls[i], strlen((const char *)form_data->urls[i]))) { + if (NULL == ary) { + ary = cli_jsonarray(ctx->wrkproperty, HTML_URLS_JSON_KEY); + if (!ary) { + cli_dbgmsg("[cli_scanhtml] Failed to add \"%s\" entry JSON array\n", HTML_URLS_JSON_KEY); + return; + } + } + cli_jsonstr(ary, NULL, (const char *)form_data->urls[i]); + } + } +} + static cl_error_t cli_scanhtml(cli_ctx *ctx) { cl_error_t status = CL_SUCCESS; @@ -2113,7 +2559,18 @@ static cl_error_t cli_scanhtml(cli_ctx *ctx) cli_dbgmsg("cli_scanhtml: using tempdir %s\n", tempname); - (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + /* Output JSON Summary Information */ + if (SCAN_STORE_HTML_URLS && SCAN_COLLECT_METADATA && (ctx->wrkproperty != NULL)) { + tag_arguments_t hrefs = {0}; + hrefs.scanContents = 1; + form_data_t form_data = {0}; + (void)html_normalise_map_form_data(ctx, map, tempname, &hrefs, ctx->dconf, &form_data); + save_urls(ctx, &hrefs, &form_data); + html_tag_arg_free(&hrefs); + html_form_data_tag_free(&form_data); + } else { + (void)html_normalise_map(ctx, map, tempname, NULL, ctx->dconf); + } snprintf(fullname, 1024, "%s" PATHSEP "nocomment.html", tempname); fd = open(fullname, O_RDONLY | O_BINARY); diff --git a/unit_tests/clamscan/save_html_urls_test.py b/unit_tests/clamscan/save_html_urls_test.py new file mode 100644 index 0000000000..610a7f698a --- /dev/null +++ b/unit_tests/clamscan/save_html_urls_test.py @@ -0,0 +1,62 @@ +# Copyright (C) 2020-2024 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + +""" +Run clamscan tests. 
+""" + +import sys +import os +import re +import shutil + +sys.path.append('../unit_tests') +import testcase + + +class TC(testcase.TestCase): + @classmethod + def setUpClass(cls): + super(TC, cls).setUpClass() + + @classmethod + def tearDownClass(cls): + super(TC, cls).tearDownClass() + + def setUp(self): + super(TC, self).setUp() + + def tearDown(self): + super(TC, self).tearDown() + + # Remove scan temps directory between tests + if (self.path_tmp / "TD").exists(): + shutil.rmtree(self.path_tmp / "TD") + + self.verify_valgrind_log() + + def test_save_links(self): + self.step_name('Extract Links') + + tempdir=self.path_tmp / "TD" + if not os.path.isdir(tempdir): + os.makedirs(tempdir); + + testfile = TC.path_source / 'unit_tests' / 'input' / 'other_scanfiles' / 'html' / 'index.html' + command = '{valgrind} {valgrind_args} {clamscan} -d {path_db} --gen-json --leave-temps --tempdir={tempdir} {testfile}'.format( + valgrind=TC.valgrind, valgrind_args=TC.valgrind_args, clamscan=TC.clamscan, + path_db=TC.path_source / 'unit_tests' / 'input' / 'other_sigs' / 'Clamav-Unit-Test-Signature.ndb', + tempdir=tempdir, + testfile=testfile, + ) + output = self.execute_command(command) + + assert output.ec == 0 # clean + + expected_strings = [ 'HTMLUrls' + , '"https://www.clamav.net/reports/malware"' + , '"http://www.google.com"' + ] + self.verify_metadata_json(tempdir, expected_strings) + + + diff --git a/unit_tests/input/other_scanfiles/html/index.html b/unit_tests/input/other_scanfiles/html/index.html new file mode 100644 index 0000000000..1ca1956380 --- /dev/null +++ b/unit_tests/input/other_scanfiles/html/index.html @@ -0,0 +1,16 @@ + + + + +

Save Links Unittest

+

Paragraph

+
Report Malware + + + +
+ + + + + diff --git a/win32/conf_examples/clamd.conf.sample b/win32/conf_examples/clamd.conf.sample index 66a07a8ccd..580afe0ea9 100644 --- a/win32/conf_examples/clamd.conf.sample +++ b/win32/conf_examples/clamd.conf.sample @@ -226,6 +226,12 @@ TCPAddr localhost # Default: no #GenerateMetadataJson yes +# Store URLs found in html files to the json metadata. +# URLs will be stored in an array with the tag 'HTMLUrls' +# GenerateMetadataJson is required for this feature. +# Default: yes (if GenerateMetadataJson is used) +#JsonStoreHTMLUrls no + # Permit use of the ALLMATCHSCAN command. If set to no, clamd will reject # any ALLMATCHSCAN command as invalid. # Default: yes