diff --git a/CHANGELOG.md b/CHANGELOG.md index 1cccb2222..a3657012e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,11 @@ ### New Features - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123 + - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz +- extracts and prints web domains/IP addresses and potential WinAPI networking functions #2031 @aaronatp + ### Breaking Changes diff --git a/capa/capabilities/domain_ip_helpers.py b/capa/capabilities/domain_ip_helpers.py new file mode 100644 index 000000000..05dadae8b --- /dev/null +++ b/capa/capabilities/domain_ip_helpers.py @@ -0,0 +1,122 @@ +# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
def get_file_path(doc: ResultDocument) -> Path:
    """returns the path of the analyzed sample as recorded in the result document's metadata"""
    return Path(doc.meta.sample.path)


def _get_option_value(args, flags):
    """
    returns the value given for a command line option, or None

    both the "--flag value" and the "--flag=value" spellings are recognized;
    a flag that is the very last argument has no value and yields None

    args:
      args: sequence of command line argument strings
      flags: the spellings of the option, e.g. ("-f", "--format")
    """
    for i, arg in enumerate(args):
        if arg in flags:
            return args[i + 1] if i + 1 < len(args) else None

        for flag in flags:
            if flag.startswith("--") and arg.startswith(flag + "="):
                return arg[len(flag) + 1 :]

    return None


def get_sigpaths_from_doc(doc: ResultDocument):
    """
    collects the signature files that apply to the analysis described by 'doc'

    the signature directory is taken from the "-s"/"--signatures" option recorded in
    'doc.meta.argv'; when that option (or argv itself) is missing, the "sigs" directory
    under capa's default root is used instead

    args:
      doc (ResultDocument): result document whose metadata records the original command line

    returns:
      whatever 'capa.loader.get_signatures' returns for the chosen directory
    """
    # imported locally, like in 'get_extractor_from_doc', to avoid a circular import
    import capa.loader
    from capa.main import get_default_root

    argv = getattr(doc.meta, "argv", None)
    if not argv:
        logger.debug("'doc.meta' has not attribute 'argv'")

    user_sigpath = _get_option_value(tuple(argv or ()), ("-s", "--signatures"))
    if user_sigpath:
        # use the path exactly as given: Path() already drops a leading "./",
        # and cutting at "./" by hand would turn "../sigs" into "sigs"
        sigpath = Path(user_sigpath)
    else:
        sigpath = get_default_root() / "sigs"

    return capa.loader.get_signatures(sigpath)


def get_extractor_from_doc(doc: ResultDocument) -> FeatureExtractor:
    """
    re-creates the feature extractor for the sample described by a result document

    the input format and backend are taken from the "-f"/"--format" and "-b"/"--backend"
    options recorded in 'doc.meta.argv'; a missing format is auto-detected from the sample,
    and a missing backend is derived from the format (vivisect being the fallback)

    args:
      doc (ResultDocument): result document produced by an earlier capa run

    returns:
      FeatureExtractor: the extractor returned by 'capa.loader.get_extractor'

    raises:
      CommandLineArgumentsError: the document does not record any command line arguments
      UnsupportedFormatError: the sample's format could not be determined
    """
    argv = doc.meta.argv
    if not argv:
        raise CommandLineArgumentsError("Couldn't find command line arguments!")
    args = tuple(argv)

    # import here to avoid circular import
    from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_DOTNET, BACKEND_FREEZE, get_extractor

    path = get_file_path(doc)
    os_ = doc.meta.analysis.os

    # NOTE(review): "auto" is presumably the CLI's spelling for "detect it for me" - confirm
    input_format = _get_option_value(args, ("-f", "--format"))
    if not input_format or input_format == "auto":
        input_format = get_auto_format(path)
        if input_format == FORMAT_UNKNOWN:
            raise UnsupportedFormatError(f"Couldn't get format for {path.name}")

    backend = _get_option_value(args, ("-b", "--backend"))
    if not backend or backend == "auto":
        backend_by_format = {
            FORMAT_CAPE: BACKEND_CAPE,
            FORMAT_DOTNET: BACKEND_DOTNET,
            FORMAT_FREEZE: BACKEND_FREEZE,
        }
        backend = backend_by_format.get(input_format, BACKEND_VIV)

    sigpaths = get_sigpaths_from_doc(doc)

    import capa.helpers

    logger.debug(f"running standable == {capa.helpers.is_running_standalone()}")

    return get_extractor(
        input_path=path,
        input_format=input_format,
        os_=os_,
        backend=backend,
        sigpaths=sigpaths,
    )


class CommandLineArgumentsError(Exception):
    """raised when a result document does not record the command line that produced it"""


class QuickExitError(Exception):
    """kept so existing references keep working; 'get_extractor_from_doc' no longer raises it"""
def generate_insns_from_doc(doc: ResultDocument) -> Generator[Tuple[Feature, Address], None, None]:
    """
    yields every instruction-level (static) or call-level (dynamic) feature of the sample

    the extractor is re-created from the result document on every call; which loop runs
    depends on whether it is a StaticFeatureExtractor or a DynamicFeatureExtractor

    StaticFeatureExtractor example:
      mnemonic(xor),        absolute(0x401015)
      mnemonic(lea),        absolute(0x401017)
      number(0xF),          absolute(0x401023)
      ...
      string(70.62.232.98), absolute(0x4010b6)
      mnemonic(call),       absolute(0x4010bb)
      ...
      api(strncpy),         absolute(0x4010f3)

    for a DynamicFeatureExtractor the yielded items are "call features", which are analogous
    to assembly instructions but extracted from sandbox traces as opposed to files directly

    args:
      doc (ResultDocument): a ResultDocument object

    yields:
      feature, addr (Tuple[Feature, Address]):
        'feature' is either an instruction feature or a call feature; and,
        'addr' is the address the extractor reports for it.
    """
    # NOTE(review): this builds a fresh extractor per call, which presumably repeats the
    # whole analysis - callers that loop over this generator many times should be checked
    extractor = get_extractor_from_doc(doc)

    if isinstance(extractor, StaticFeatureExtractor):
        for func in extractor.get_functions():
            for block in extractor.get_basic_blocks(func):
                for insn in extractor.get_instructions(func, block):
                    yield from extractor.extract_insn_features(func, block, insn)

    elif isinstance(extractor, DynamicFeatureExtractor):
        for proc in extractor.get_processes():
            for thread in extractor.get_threads(proc):
                for call in extractor.get_calls(proc, thread):
                    yield from extractor.extract_call_features(proc, thread, call)

    else:
        # nothing can be yielded for other extractor kinds; say so instead of staying silent
        logger.warning("unsupported feature extractor type: %s", type(extractor).__name__)


def default_extract_domain_names(doc: ResultDocument) -> Generator[str, None, None]:
    """
    yields each distinct string feature that looks like a web domain or an IP address

    this 'default' function is meant to merely tell users what domains/IPs are in a file,
    not to show users how many times each occur, so we consciously do not yield duplicates

    only string-valued features are considered (numbers, bytes, etc. are skipped)

    yields:
      potential web domain names and IP addresses
    """
    seen = set()
    for feature, _ in generate_insns_from_doc(doc):
        value = feature.value
        if not isinstance(value, str) or value in seen:
            continue

        if is_valid_domain(value) or is_ip_addr(value):
            seen.add(value)
            yield value
def get_domain_ip_dict(doc: ResultDocument):
    """
    returns dict of domains/IPs in a file and number of times each occur

    every string feature is split on whitespace and each token is examined on its own;
    a leading "http://" or "https://" is removed before the token is checked and counted

    example:
      {'malicious-website.com/next/asxp.jpg': 3, 'other-website.net': 2}

    args:
      doc (ResultDocument): ResultDocument object which contains FeatureExtractor information, including file strings

    returns:
      domain_and_ip_counts (Dict[str, int]): dict of domain names and IP addresses and occurrences of each
        - Note: each full-path URL gets its own dict key
    """
    domain_and_ip_counts: Dict[str, int] = {}

    for feature, _ in generate_insns_from_doc(doc):
        value = feature.value
        if not isinstance(value, str):
            continue

        for token in value.split():
            # strip only a *leading* scheme, so "http://a.com/?next=http://b.com" keeps host "a.com"
            for scheme in ("http://", "https://"):
                if token.startswith(scheme):
                    token = token[len(scheme) :]
                    break

            # for example, if token == "malware.com/next/virus.jpg",
            # only "malware.com" decides whether the token is counted
            host = token.split("/")[0]
            if is_valid_domain(host) or is_ip_addr(host):
                domain_and_ip_counts[token] = domain_and_ip_counts.get(token, 0) + 1

    return domain_and_ip_counts
def ip_address_statement(domain: str) -> str:
    """
    tries to identify a web domain's IP address

    this function's output is used by 'formatted_domain_verbose'

    args:
      domain (str): a domain name, optionally followed by a URL path ("malware.com/next/x.jpg");
        only the part before the first "/" is resolved

    return:
      (str): either the formatted IP address, or an error message
    """
    # NOTE(review): this performs a live name lookup for a string taken from the analyzed
    # sample - confirm that this is acceptable in the environments where reports are rendered
    host = domain.split("/")[0]

    try:
        ip_address = socket.gethostbyname(host)
    except (OSError, UnicodeError):
        # OSError covers socket.gaierror/herror; UnicodeError is raised for names
        # that cannot be IDNA-encoded (e.g. a label longer than 63 characters)
        return f"Could not get IP address for {host}\n"

    # NOTE(review): the spacing inside this literal is reproduced as it appears in the
    # reviewed text - confirm that it lines up with the other rows of the tree
    return "IP address:\n" + f" | |----{ip_address}\n"
class LengthError(Exception):
    """
    raised when a collection has a length that none of the expected cases cover

    derives from Exception (not BaseException) so that ordinary 'except Exception'
    handlers see it, like any other application error
    """


def get_domain_or_ip_caller_functions(doc: ResultDocument, domain_or_ip: str) -> List[str]:
    """
    for every occurrence of 'domain_or_ip' in the ResultDocument, we see which functions operate on it

    args:
      doc (ResultDocument): result document describing the analyzed sample
      domain_or_ip (str): the string whose users are looked up

    returns:
      List[str]: everything 'yield_caller_funcs' yields for 'domain_or_ip', in order
    """
    return list(yield_caller_funcs(doc, domain_or_ip))
# substrings that, when found in the lowercased name, mark it as a networking function outright
_QUICK_TRUE_SUBSTRINGS = (
    "http",
    "ftp",
    "internet",
    "url",
    "connection",
    "connected",
    "online",
    "inet",
    "addr",
    "send",
    "recv",
    "sock",
    "select",
    "shutdown",
    "ntoh",
    "listen",
    "serv",
    "getpeer",
)

# names that are never reported, e.g., those that can't accept an IP address/web domain as an argument
_EXCLUDED_FUNCTIONS = ("Sleep",)


def potential_winapi_function(string: str) -> bool:
    """
    some simple heuristics for checking whether a string is NOT a WinAPI function

    the checks run in this order:
      1) explicitly excluded names are rejected
      2) names containing one of the 'quick_true' substrings are accepted
      3) names whose "_"-separated parts are all uppercase or all lowercase are rejected
      4) names containing anything but letters and "_" are rejected
      5) names with more than 7 consecutive uppercase letters are rejected

    returns:
      True if string could be a WinAPI function
      False if string is not a WinAPI function
    """
    if string in _EXCLUDED_FUNCTIONS:
        return False

    lowered = string.lower()
    if any(marker in lowered for marker in _QUICK_TRUE_SUBSTRINGS):
        return True

    parts = string.split("_")

    # WinAPI functions are usually mixed upper and lower case
    if all(part.isupper() for part in parts) or all(part.islower() for part in parts):
        return False

    # if contains non-letters
    if not all(part.isalpha() for part in parts):
        return False

    # maximum of 7 consecutive uppercase letters
    if too_many_consecutive_uppercase_letters(string, 7):
        return False

    return True


def quick_true():
    """returns the substrings that are matched against lowercase strings"""
    return list(_QUICK_TRUE_SUBSTRINGS)


def excluded_functions():
    """
    add excluded functions to '_EXCLUDED_FUNCTIONS', e.g., those that can't accept
    an IP address/web domain as an argument

    returns:
      list of excluded function names
    """
    return list(_EXCLUDED_FUNCTIONS)


def too_many_consecutive_uppercase_letters(string, limit):
    """
    'HOSTENT' (probably) has the most consecutive uppercase letters

    args:
      string: the name to inspect
      limit: longest run of uppercase letters that is still acceptable

    returns:
      True: too many consecutive uppercase letters, caller function disregards
      False: not too many consecutive uppercase, indicates this is a potential WinAPI function
    """
    run = 0
    for char in string:
        # any non-uppercase character ends the current run
        run = run + 1 if char.isupper() else 0
        if run > limit:
            return True

    return False
def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument):
    """
    writes a one-column table of the domains/IP addresses found in the sample,
    or a short notice when there are none

    example::
        +-----------------------------------------------------------+
        |               IP addresses and web domains                |
        |-----------------------------------------------------------+
        | google.com                                                |
        |    |----IP address:                                       |
        |    |    |----192.0.0.1                                    |
        |    |----Functions used to communicate with google.com:    |
        |    |    |----InternetConnectA                             |
        |    |    |----HttpOpenRequestA                             |
        |    |    |----FtpGetFileA                                  |
        |    |----3 occurrances                                     |
        |                                                           |
        | 192.123.232.08                                            |
        |    |----Functions used to communicate with 192.123.232.08:|
        |    |    |----...                                          |
        |                                                           |
        +-----------------------------------------------------------+

    args:
      ostream: stream the table is written to
      doc: result document whose domains/IP addresses are rendered
    """
    # every line of every multi-line statement becomes its own table row
    rows = []
    for statement in verbose_extract_domain_and_ip(doc):
        rows.extend(statement.split("\n"))

    if not rows:
        ostream.writeln(rutils.bold("No web domains or IP addresses found"))
        return

    # pad the header on both sides by a third of the widest row
    max_line = max(len(row) for row in rows)
    white_spaces = " " * ceil(max_line / 3)
    header = white_spaces + "IP addresses and web domains" + white_spaces

    ostream.write(
        tabulate.tabulate(
            {header: rows},
            headers=[header],
            tablefmt="mixed_outline",
        )
    )
    ostream.write("\n")


def ceil(num: float) -> int:
    """returns the smallest integer that is greater than or equal to 'num'"""
    import math

    return math.ceil(num)
def render_domain_and_ip(ostream: rutils.StringIO, doc: rd.ResultDocument):
    """
    writes a one-column table of the domains/IP addresses found in the sample,
    or a short notice when there are none

    example::
        +-----------------------------------------------------------+
        |               IP addresses and web domains                |
        |-----------------------------------------------------------+
        | google.com                                                |
        |    |----IP address:                                       |
        |    |    |----192.0.0.1                                    |
        |    |----Functions used to communicate with google.com:    |
        |    |    |----InternetConnectA                             |
        |    |    |----HttpOpenRequestA                             |
        |    |    |----FtpGetFileA                                  |
        |    |----3 occurrances                                     |
        |                                                           |
        | 192.123.232.08                                            |
        |    |----Functions used to communicate with 192.123.232.08:|
        |    |    |----...                                          |
        |                                                           |
        +-----------------------------------------------------------+

    args:
      ostream: stream the table is written to
      doc: result document whose domains/IP addresses are rendered
    """
    # flatten the multi-line statements: one table row per line
    rows = [line for statement in verbose_extract_domain_and_ip(doc) for line in statement.split("\n")]

    if not rows:
        ostream.writeln(rutils.bold("No web domains or IP addresses found"))
        return

    # the header is padded on both sides by a third of the widest row
    widest = max(map(len, rows))
    white_spaces = " " * ceil(widest / 3)
    header = white_spaces + "IP addresses and web domains" + white_spaces

    ostream.write(
        tabulate.tabulate(
            {header: rows},
            headers=[header],
            tablefmt="mixed_outline",
        )
    )
    ostream.write("\n")


def ceil(num: float) -> int:
    """returns the smallest integer that is greater than or equal to 'num'"""
    import math

    return math.ceil(num)
@pytest.mark.parametrize(
    "string",
    [
        # Valid IPv4 addresses
        ("8.8.8.8"),
        ("128.0.0.1"),
        ("123.4.56.78"),
        ("0.0.0.0"),
        ("255.255.255.255"),
        # Valid IPv6 addresses
        ("2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
        ("fe80:0000:0000:0000:0202:b3ff:fe1e:8329"),
        ("2002::1234:5678:9abc:def0"),
        ("::1"),
        ("2001:0db8:0001:0000:0000:0ab9:C0A8:0102"),
        ("2001:db8:1::ab9:C0A8:102"),
        ("::1234:5678"),
        ("::"),
        ("2001:db8::"),
        ("2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF"),
        ("2001:db8:3333:4444:5555:6666:7777:8888"),
        ("3ffe:ffff:ffff:ffff:ffff:ffff:ffff:ffff"),
        ("2001:db8:3333:4444:5555:6666:1.2.3.4"),
        ("::11.22.33.44"),
        ("2001:db8::123.123.123.123"),
        ("::1234:5678:91.123.4.56"),
        ("::1234:5678:1.2.3.4"),
        ("2001:db8::1234:5678:5.6.7.8"),
    ],
)
def test_is_ip_addr(string: str):
    """every parametrized string is a valid IPv4 or IPv6 address and must be accepted"""
    assert is_ip_addr(string)


@pytest.mark.parametrize(
    "string",
    [
        # Invalid IPv4 addresses
        ("255.255.255.256"),
        ("255.255.255.-1"),
        ("2555.255.255.255"),
        # Invalid IPv6 addresses
        ("2001:0db8:85a3:0000:0000:8a2e:0370:G334"),
        ("2001:db8:a0b:12f0:0000:0000:0000::0001"),
        ("2001:db8:a0b:12f0::1:2:3:4:5"),
        ("2001:db8::::1"),
        ("fe80:2030:31:24"),
        ("::1:2:3:4:5:6:7:8"),
        ("2001:db8:a0b:12f0:g:h:i:j"),
        ("1234567890:1234:5678:90ab:cdef:1234:5678:90ab"),
    ],
)
def test_is_not_ip_addr(string: str):
    """malformed IPv4/IPv6 strings must be rejected"""
    assert not is_ip_addr(string)


@pytest.mark.parametrize(
    "string",
    [
        (
            "google.com"
        ),  # the following talks about some domain matching considerations - (http://stackoverflow.com/a/7933253/433790)
        ("favorite.website"),
        ("dont.like.spiders"),
        ("lots.of.subnets.com.org.net"),
        ("walk-your-dog.net"),  # can have dashes in domain names
        (
            "whos--a---goood---boy.com"
        ),  # can have multiple dashes (https://stackoverflow.com/questions/16468309/can-domain-name-have-two-continuous-hyphens)
        ("fileshare.biz"),
        (
            "g00gle.c0m"
        ),  # can have numbers in top-level domain as long as the top-level domain doesn't start or end with a number
        (
            "coooooooooooool.we.b.s.t.e"
        ),  # single-character top-level-domains technically legal (https://stackoverflow.com/questions/7411255/is-it-possible-to-have-one-single-character-top-level-domain-name)
        ("really.long.jhgfjhgfjhgfkjh76547kjhgkjhgl234567gfdshgfkklkjh"),
        ("oiuyu78658765hgjj-i765jhgftuytruytr.jhgfhgfjhgf654365436576908-088098jhgjff.gdffdghdgfd"),
        ("xn--bcher-kva.tld"),
        (
            "xn--q1a.xn--b1aube0e.xn--c1acygb.xn--p1ai"
        ),  # https://superuser.com/questions/860121/what-does-it-mean-when-a-dns-name-starts-with-xn
        ("xn--diseolatinoamericano-66b.com"),  # https://stackoverflow.com/questions/9724379/xn-on-domain-what-it-means
        (
            "don't.like.sp1d3rs"
        ),  # apostrophes in URLs technically legal (https://stackoverflow.com/questions/13442421/apostrophes-in-the-url-good-idea-or-bad-idea-and-why)
    ],
)
def test_valid_domain(string: str):
    """strings that the domain heuristic is expected to accept"""
    assert is_valid_domain(string)


@pytest.mark.parametrize(
    "string",
    [
        ("yup"),
        ("no way this passes the test"),  # can't have spaces
        ("really.long-domainname"),  # can only have "-" in top-level domains if "xn--..."
        ("really.long-domain-name"),
        (
            "dog..cat"
        ),  # consecutive periods are invalid in a subdomain (https://stackoverflow.com/questions/41821416/are-urls-with-multiple-periods-in-the-url-path-valid)
        ("dog.34.cat"),  # subdomain has only numbers
        ("34.dog.cat"),
        (
            "dog.cat.34"
        ),  # top-level domains can not consist only of numbers (https://stackoverflow.com/questions/7411255/is-it-possible-to-have-one-single-character-top-level-domain-name)
        ("d0nt.lik3.sp1d3rs"),  # number at end of second subdomain
        ("definite.1nvalid"),  # number at start of the top-level domain
    ],
)
def test_invalid_domain(string: str):
    """strings that the domain heuristic is expected to reject"""
    assert not is_valid_domain(string)


@pytest.mark.parametrize(
    "string",
    [
        ("InternetConnectA"),
        ("HttpQueryInfo"),
        ("HttpSendRequestW"),
        ("InternetCanonicalizeUrlA"),
        ("InternetCrackUrlA"),
        ("InternetCloseHandle"),
        ("InternetCombineUrlW"),
        ("InternetCheckConnectionA"),
        ("INTERNET_STATUS_CALLBACK"),
        ("INTERNET_CACHE_ENTRY_INFOA"),
        ("INTERNET_ASYNC_RESULT"),
        ("GetUrlCacheEntryInfoExA"),
        ("FindNextUrlCacheEntryW"),
        ("DeleteUrlCacheEntry"),
        ("DetectAutoProxyUrl"),
        ("FindFirstUrlCacheEntryExA"),
        ("InternetConfirmZoneCrossing"),
        ("InternetGoOnlineW"),
        ("InternetHangUp"),
        ("InternetSetOptionExW"),
        ("UnlockUrlCacheEntryFile"),
        ("URL_COMPONENTSA"),
        ("Internet"),
        ("recv"),
        ("send"),
    ],
)
def test_potential_winapi_function(string: str):
    """networking-related names that the heuristic is expected to accept"""
    assert potential_winapi_function(string)


@pytest.mark.parametrize(
    "string",
    [
        ("asdfadsfasdfasf"),
        ("plkj"),
        ("DSFLKJKLJKLDJFKJ"),
        ("LKJD LKJ ALKSDJFH"),
        ("dog cat mouse snake"),
        ("Dog CAT mOuse Snake"),
        (""),
        (" "),
        ("2345"),
        ("SDFGHJ_SDFGHJKLKJHG"),
        ("Sleep"),  # listed in excluded_functions()
    ],
)
def test_not_potential_winapi_function(string: str):
    """strings that the heuristic is expected to reject"""
    assert not potential_winapi_function(string)