diff --git a/cartography/cli.py b/cartography/cli.py index 2c0f08afe..cd1bbc610 100644 --- a/cartography/cli.py +++ b/cartography/cli.py @@ -419,6 +419,38 @@ def _build_parser(self): 'The crowdstrike URL, if using self-hosted. Defaults to the public crowdstrike API URL otherwise.' ), ) + parser.add_argument( + '--azureresourcegraph-tenant-id', + type=str, + default=None, + help=( + 'The azureresourcegraph tenant id for authentication.' + ), + ) + parser.add_argument( + '--azureresourcegraph-client-id-env-var', + type=str, + default=None, + help=( + 'The name of environment variable containing the azureresourcegraph client id for authentication.' + ), + ) + parser.add_argument( + '--azureresourcegraph-client-secret-env-var', + type=str, + default=None, + help=( + 'The name of environment variable containing the azureresourcegraph secret key for authentication.' + ), + ) + parser.add_argument( + '--azureresourcegraph-use-managedidentity', + type=bool, + default=False, + help=( + 'Use managed identity for azureresourcegraph authentication.' 
+ ), + ) parser.add_argument( '--gsuite-auth-method', type=str, @@ -569,6 +601,23 @@ def main(self, argv: str) -> int: else: config.crowdstrike_client_secret = None + # Azure Resource Graph config + if config.azureresourcegraph_client_id_env_var: + logger.debug( + f"Reading API key for mde from environment variable {config.azureresourcegraph_client_id_env_var}", + ) + config.azureresourcegraph_client_id = os.environ.get(config.azureresourcegraph_client_id_env_var) + else: + config.azureresourcegraph_client_id = None + + if config.azureresourcegraph_client_secret_env_var: + logger.debug( + f"Reading API key for mde from environment variable {config.azureresourcegraph_client_secret_env_var}", + ) + config.azureresourcegraph_client_secret = os.environ.get(config.azureresourcegraph_client_secret_env_var) + else: + config.azureresourcegraph_client_secret = None + # GSuite config if config.gsuite_tokens_env_var: logger.debug(f"Reading config string for GSuite from environment variable {config.gsuite_tokens_env_var}") diff --git a/cartography/config.py b/cartography/config.py index a18d5f5ad..b77131ccd 100644 --- a/cartography/config.py +++ b/cartography/config.py @@ -131,6 +131,10 @@ def __init__( crowdstrike_client_id=None, crowdstrike_client_secret=None, crowdstrike_api_url=None, + azureresourcegraph_tenant_id=None, + azureresourcegraph_client_id=None, + azureresourcegraph_client_secret=None, + azureresourcegraph_use_managedidentity=None, gsuite_auth_method=None, gsuite_config=None, ): @@ -174,5 +178,9 @@ def __init__( self.crowdstrike_client_id = crowdstrike_client_id self.crowdstrike_client_secret = crowdstrike_client_secret self.crowdstrike_api_url = crowdstrike_api_url + self.azureresourcegraph_tenant_id = azureresourcegraph_tenant_id + self.azureresourcegraph_client_id = azureresourcegraph_client_id + self.azureresourcegraph_client_secret = azureresourcegraph_client_secret + self.azureresourcegraph_use_managedidentity = azureresourcegraph_use_managedidentity 
@timeit
def start_azureresourcegraph_ingestion(
    neo4j_session: neo4j.Session,
    config: Config,
) -> None:
    """
    Perform ingestion of Azure Resource Graph data.

    The sync runs only when a client id and a client secret are both
    configured, or when managed-identity authentication was requested.
    Otherwise an error is logged and the function returns without touching
    the graph.

    :param neo4j_session: Neo4J session for database interface
    :param config: A cartography.config object
    :return: None
    """
    # The flag can be a bool (the CLI declares it with ``type=bool``), None
    # (the Config default) or a "true"/"false" string. Normalising through
    # str() avoids the AttributeError that ``.lower()`` raises on bool/None.
    use_managed_identity = (
        str(config.azureresourcegraph_use_managedidentity).strip().lower() == "true"
    )
    has_client_credentials = bool(
        config.azureresourcegraph_client_id and
        config.azureresourcegraph_client_secret,
    )
    if not (has_client_credentials or use_managed_identity):
        logger.error(
            "azureresourcegraph config not found and not requested managed identity.",
        )
        return

    common_job_parameters = {
        "UPDATE_TAG": config.update_tag,
    }

    authorization = get_authorization(
        config.azureresourcegraph_client_id,
        config.azureresourcegraph_client_secret,
        config.azureresourcegraph_tenant_id,
        True,
        # get_authorization compares this value against the string "true".
        "True" if use_managed_identity else "False",
    )
    # get_authorization signals failure with an empty string; syncing with it
    # would only fail later, deep inside the Azure SDK.
    if not authorization:
        logger.error("azureresourcegraph authentication failed; skipping sync.")
        return

    sync_hosts(
        neo4j_session,
        config.update_tag,
        authorization,
    )
    run_cleanup_job(
        "azureresourcegraph_import_cleanup.json",
        neo4j_session,
        common_job_parameters,
    )

    # Sync metadata is grouped per tenant; "public" is used when no tenant id
    # was configured.
    group_id = config.azureresourcegraph_tenant_id or "public"
    merge_module_sync_metadata(
        neo4j_session,
        group_type="azureresourcegraph",
        group_id=group_id,
        synced_type="azureresourcegraph",
        update_tag=config.update_tag,
        stat_handler=stat_handler,
    )
@timeit
def sync_hosts(
    neo4j_session: neo4j.Session,
    update_tag: int,
    authorization: str,
) -> None:
    """
    Fetch the Azure Resource Graph VM inventory and load it into the graph.

    :param neo4j_session: Neo4J session for database interface
    :param update_tag: Update tag stamped on every node/relationship written
    :param authorization: Credential passed to get_azureresourcegraph_hosts
    :return: None
    """
    arg_hosts_list = get_azureresourcegraph_hosts(authorization)
    # The ingestion queries UNWIND $Hosts themselves, so the whole inventory
    # is loaded in one batch instead of one query round-trip per host.
    load_host_data(neo4j_session, arg_hosts_list, update_tag)


def load_host_data(
    neo4j_session: neo4j.Session,
    data: List[Dict],
    update_tag: int,
) -> None:
    """
    Load Azure Resource Graph hosts and link them to existing Azure nodes.

    Each host record is expected to carry the columns projected by the
    Resource Graph query, e.g. resourceid, instance_id, subscriptionId,
    subscriptionName, resourceGroup, name, type, vmStatus, tags_*,
    publicIpName, publicIPAllocationMethod, ipAddress, nsgId.

    FIXME (carried over from the original author): duplicate entries / nodes
    not merging were observed.
    NOTE(review): the MERGE key is the raw ``resourceid`` while
    ``resource_id`` is lower-cased -- possibly related; confirm before
    changing the key.

    :param neo4j_session: Neo4J session for database interface
    :param data: Host records; a single record (dict) is accepted as well
    :param update_tag: Update tag stamped on every node/relationship written
    :return: None
    """
    # Older callers passed one dict per call; normalise to a list so the
    # UNWIND below and the logged count behave the same either way.
    hosts = [data] if isinstance(data, dict) else list(data)
    if not hosts:
        logger.debug("No azureresourcegraph hosts to load.")
        return

    node_query = """
    UNWIND $Hosts AS host
        MERGE (h:AzureResourceGraphHost{id: host.resourceid})
            ON CREATE SET h.id = host.resourceid,
                h.firstseen = timestamp()
            SET h.instance_id = host.instance_id,
                h.resource_id = toLower(host.resourceid),
                h.subscription_id = host.subscriptionId,
                h.subscription_name = host.subscriptionName,
                h.resource_group = host.resourceGroup,
                h.hostname = host.name,
                h.short_hostname = host.short_hostname,
                h.type = host.type,
                h.osname = host.osname,
                h.ostype = host.ostype,
                h.tags_environment = host.tags_environment,
                h.tags_costcenter = host.tags_costcenter,
                h.tags_engproduct = host.tags_engproduct,
                h.tags_engcontact = host.tags_engcontact,
                h.tags_businesscontact = host.tags_businesscontact,
                h.vm_status = host.vmStatus,
                h.image_publisher = host.image_publisher,
                h.image_offer = host.image_offer,
                h.image_sku = host.image_sku,
                h.image_galleryid = host.image_galleryid,
                h.public_ip_name = host.publicIpName,
                h.public_ip_allocation_method = host.publicIPAllocationMethod,
                h.public_ip = host.ipAddress,
                h.nsg_id = host.nsgId,
                h.modified_timestamp = host.modified_timestamp,
                h.lastupdated = $update_tag
    """
    # The two relationships are written by separate queries. When they were
    # chained as ``WITH h MATCH ... WITH h MATCH ...``, a host without a
    # matching AzureSubscription produced no row after the first MATCH, so
    # its AzureVirtualMachine relationship was never attempted.
    subscription_rel_query = """
    UNWIND $Hosts AS host
        MATCH (h:AzureResourceGraphHost{id: host.resourceid})
        MATCH (s:AzureSubscription{id: h.subscription_id})
        MERGE (s)-[r:CONTAINS]->(h)
            ON CREATE SET r.firstseen = timestamp()
            SET r.lastupdated = $update_tag
    """
    vm_rel_query = """
    UNWIND $Hosts AS host
        MATCH (h:AzureResourceGraphHost{id: host.resourceid})
        MATCH (s:AzureVirtualMachine{id: h.resource_id})
        MERGE (s)-[r:PRESENT_IN]->(h)
            ON CREATE SET r.firstseen = timestamp()
            SET r.lastupdated = $update_tag
    """
    logger.debug("Loading %s azureresourcegraph hosts.", len(hosts))
    for query in (node_query, subscription_rel_query, vm_rel_query):
        neo4j_session.run(
            query,
            Hosts=hosts,
            update_tag=update_tag,
        )
def get_authorization(
    client_id: str,
    client_secret: str,
    tenant_id: str,
    logging_enable: bool = True,
    managedidentity_enable: str = "False",
) -> str:
    """
    Build the Azure credential used by the Resource Graph sync.

    https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python

    Despite the ``str`` annotation, an azure.identity credential object is
    returned on success; the empty string is returned when no credential
    could be built.

    :param client_id: App registration (service principal) client id
    :param client_secret: App registration client secret (never logged)
    :param tenant_id: Azure tenant id
    :param logging_enable: Forwarded to ManagedIdentityCredential
    :param managedidentity_enable: "true"/"false" string; a bool is accepted too
    :return: a credential object, or "" on failure
    """
    logger.debug(
        (
            "get_authorization inputs: client_id %s, tenant_id %s, "
            "logging_enabled %s, managedidentity_enable %s"
        ),
        client_id,
        tenant_id,
        logging_enable,
        managedidentity_enable,
    )
    # The CLI declares the flag with ``type=bool`` and Config defaults it to
    # None, so ``.lower()`` cannot be called on the raw value.
    use_managed_identity = str(managedidentity_enable).strip().lower() == "true"

    credentials = []
    if use_managed_identity:
        try:
            # Azure Resource Managed Identity
            credentials.append(
                ManagedIdentityCredential(logging_enable=logging_enable),
            )
            logger.info("Using Managed identity: %s", credentials[-1])
        except TypeError as exc:
            logger.exception("Exception: %s", exc)

    if client_id and client_secret:
        try:
            # App registration token
            logger.info(  # nosemgrep
                "ClientSecretCredential: tenant %s, client %s",
                tenant_id,
                client_id,
            )
            credentials.append(
                ClientSecretCredential(tenant_id, client_id, client_secret),
            )
            logger.info("Using App registration identity: %s", credentials[-1])
        except (TypeError, ValueError) as exc:
            # ValueError is how azure-identity rejects a missing/invalid
            # tenant or client value; treat it like any other build failure.
            logger.exception("Exception: %s", exc)

    if not credentials:
        logger.error("No usable Azure credential could be built.")
        return ""
    if len(credentials) == 1:
        return credentials[0]
    # Both are available: managed identity is tried first, the app
    # registration is the fallback. The previous fallback referenced an
    # unbound ``managed_identity`` and could never succeed.
    credential_chain = ChainedTokenCredential(*credentials)
    # nosemgrep
    logger.info("Using ChainedTokenCredential identity: %s", credential_chain)
    return credential_chain


def get_short_hostname(row: Dict) -> str:
    """
    Return the lower-cased host name truncated at the first dot.

    Length limits: 15 characters when ``ostype`` is Windows (compared
    case-insensitively), 64 characters otherwise. A missing or non-string
    name yields "" instead of raising.

    :param row: mapping (or DataFrame row) with ``name`` and ``ostype``
    :return: the short host name
    """
    name = row["name"]
    if not isinstance(name, str):
        return ""
    short = re.sub(r"\..*$", "", name.lower())
    if str(row["ostype"]).lower() == "windows":
        return short[:15]
    return short[:64]


def arg_response_to_frame(response) -> pandas.DataFrame:
    """
    Convert one "table"-format Resource Graph response into a DataFrame.

    :param response: object whose ``data`` holds ``columns`` and ``rows``
    :return: one row per record, columns named after ``data["columns"]``
    """
    data = response.data
    return pandas.DataFrame(
        data["rows"],
        columns=[column["name"] for column in data["columns"]],
    )


def collect_arg_pages(fetch_page) -> pandas.DataFrame:
    """
    Fetch every page of a Resource Graph query and concatenate the rows.

    Paging continues until the number of rows fetched reaches the
    ``total_records`` reported by the first response, or a page comes back
    empty. The empty-page stop guards against looping forever.

    :param fetch_page: callable ``(skip, skip_token) -> response``; ``skip``
        is the number of rows already fetched, ``skip_token`` comes from the
        most recent response
    :return: all rows in fetch order, with a fresh 0..n-1 index
    """
    response = fetch_page(0, None)
    frames = [arg_response_to_frame(response)]
    total_records = response.total_records or 0
    fetched = frames[0].shape[0]

    while fetched < total_records:
        response = fetch_page(fetched, response.skip_token)
        page = arg_response_to_frame(response)
        if page.empty:
            break
        frames.append(page)
        fetched += page.shape[0]

    if len(frames) == 1:
        return frames[0]
    return pandas.concat(frames, ignore_index=True)


# pylint: disable=too-many-locals
def get_azureresourcegraph_hosts(
    authorization: str,
) -> array:
    """
    Get Azure Resource Graph coverage inventory

    https://docs.microsoft.com/en-us/azure/governance/resource-graph/first-query-python

    Queries every subscription visible to the credential for virtual
    machines and returns one dict per VM, with a derived ``short_hostname``.

    NOTE: the ``array`` annotation is historical; the value returned is a
    list of dicts (empty when the query returns no rows).

    :param authorization: credential accepted by the Azure SDK clients
    :return: list of host records
    """

    vm_query = (
        r"""resources
| where type =~ 'Microsoft.Compute/virtualMachines'
| join kind=inner (resourcecontainers
    | where type == 'microsoft.resources/subscriptions'
    | project subscriptionId, subscriptionName = name, subproperties = properties
    ) on subscriptionId
| extend instance_id = properties.vmId
| extend resourceid = id
| extend vmStatus = properties.extended.instanceView.powerState.displayStatus
| extend osname = properties.osProfile.computerName
| extend ostype = properties.storageProfile.osDisk.osType
| extend image_publisher = properties.storageProfile.imageReference.publisher
| extend image_offer = properties.storageProfile.imageReference.offer
| extend image_sku = properties.storageProfile.imageReference.sku
| extend image_galleryid = properties.storageProfile.imageReference.id
| join kind=leftouter (
    Resources
    | where type =~ "Microsoft.Network/networkInterfaces"
    | mv-expand properties.ipConfigurations
    | where isnotempty(properties_ipConfigurations.properties.publicIPAddress.id)
    | extend publicIpId = tostring(properties_ipConfigurations.properties.publicIPAddress.id)
    | join (
        Resources
        | where type =~ "microsoft.network/publicipaddresses"
    ) on $left.publicIpId == $right.id
    | extend ipAddress = tostring(properties1.ipAddress)
    | extend publicIPAllocationMethod = tostring(properties1.publicIPAllocationMethod)
    | extend publicIpName = tostring(name1)
    | extend vmId = tostring(properties.virtualMachine.id)
    | extend nsgId = tostring(properties.networkSecurityGroup.id)
    | project publicIpName,publicIPAllocationMethod,ipAddress,vmId,nsgId
    ) on $left.id == $right.vmId
| project resourceid,instance_id,subscriptionId,subscriptionName, resourceGroup, name, type, """
        "osname, ostype, vmStatus, tags.environment, tags.costcenter, tags.contact, "
        "tags.businessproduct, tags.businesscontact, tags.engproduct, tags.engcontact, "
        "tags.lob, tags.compliance, tags.ticket, subproperties,publicIpName,"
        "publicIPAllocationMethod,ipAddress,nsgId,"
        "image_publisher,image_offer,image_sku,image_galleryid"
    )

    subsClient = SubscriptionClient(authorization)
    subsList = [
        sub.as_dict().get("subscription_id")
        for sub in subsClient.subscriptions.list()
    ]

    # Create Azure Resource Graph client
    argClient = arg.ResourceGraphClient(authorization)

    def fetch_page(skip, skip_token):
        """Run vm_query, skipping the ``skip`` rows that were already fetched."""
        if skip:
            logger.info("Paging from row %s", skip)
            # NOTE(review): paging is offset-based. Resource Graph documents
            # that a skip token is only issued when the query projects
            # ``id``, which this query does not -- confirm before switching
            # to token-only paging.
            options = arg.models.QueryRequestOptions(
                result_format="table",
                skip_token=skip_token,
                skip=skip,
            )
        else:
            options = arg.models.QueryRequestOptions(result_format="table")
        return argClient.resources(
            arg.models.QueryRequest(
                subscriptions=subsList,
                query=vm_query,
                options=options,
            ),
        )

    df_resourcegraph = collect_arg_pages(fetch_page)
    logger.info("ARG Final count %s", df_resourcegraph.shape[0])

    # With no rows, DataFrame.apply(axis=1) would not yield a column and
    # flatten_data[0] would raise IndexError.
    if df_resourcegraph.empty:
        return []

    df_resourcegraph["short_hostname"] = df_resourcegraph.apply(
        get_short_hostname,
        axis=1,
    )

    # The JSON round-trip turns pandas/numpy values into plain Python types.
    flatten_data = json.loads(df_resourcegraph.to_json(orient="records"))
    logger.debug("Example: %s", flatten_data[0])

    return flatten_data
+ logger.warning("Example: %s", flatten_data[0]) + + # save to local csv for debugging? + # df_resourcegraph.to_csv("/tmp/cartography-resourcegraph.csv") + + return flatten_data diff --git a/cartography/sync.py b/cartography/sync.py index b417bb286..83da4c3c1 100644 --- a/cartography/sync.py +++ b/cartography/sync.py @@ -14,6 +14,7 @@ import cartography.intel.analysis import cartography.intel.aws import cartography.intel.azure +import cartography.intel.azureresourcegraph import cartography.intel.create_indexes import cartography.intel.crowdstrike import cartography.intel.crxcavator.crxcavator @@ -37,6 +38,7 @@ 'create-indexes': cartography.intel.create_indexes.run, 'aws': cartography.intel.aws.start_aws_ingestion, 'azure': cartography.intel.azure.start_azure_ingestion, + 'azureresourcegraph': cartography.intel.azureresourcegraph.start_azureresourcegraph_ingestion, 'crowdstrike': cartography.intel.crowdstrike.start_crowdstrike_ingestion, 'gcp': cartography.intel.gcp.start_gcp_ingestion, 'gsuite': cartography.intel.gsuite.start_gsuite_ingestion, diff --git a/docs/root/modules/azureresourcegraph/config.md b/docs/root/modules/azureresourcegraph/config.md new file mode 100644 index 000000000..cc75b0e50 --- /dev/null +++ b/docs/root/modules/azureresourcegraph/config.md @@ -0,0 +1,27 @@ +## Azure Resource Graph Configuration + +.. _azureresourcegraph_config: + +Follow these steps to collect Azure Resource Graph data with Cartography: + +### Service Principal setup + +1. Set up an Azure identity for Cartography to use, and ensure that this identity has the built-in Azure [Microsoft Sentinel Reader role](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#microsoft-sentinel-reader) attached: + * Authenticate: `$ az login` + * Create a Service Principal: `$ az ad sp create-for-rbac --name cartography --role "Microsoft Sentinel Reader"` + * Note the values of the `tenant`, `appId`, and `password` fields +1. 
Populate environment variables with the values generated in the previous step (e.g., `AZURE_TENANT_ID`, `AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`) +1. Call the `cartography` CLI with: + ```bash + --azureresourcegraph-tenant-id xxx \ + --azureresourcegraph-client-id-env-var AZURE_CLIENT_ID \ + --azureresourcegraph-client-secret-env-var AZURE_CLIENT_SECRET + ``` + +### Managed Identity setup + +1. Call the `cartography` CLI with: + ```bash + --azureresourcegraph-tenant-id xxx \ + --azureresourcegraph-use-managedidentity true + ``` diff --git a/docs/root/modules/azureresourcegraph/schema.md b/docs/root/modules/azureresourcegraph/schema.md new file mode 100644 index 000000000..9d1b0ce1b --- /dev/null +++ b/docs/root/modules/azureresourcegraph/schema.md @@ -0,0 +1,36 @@ +## Azure Resource Graph Schema + +.. _azureresourcegraph_schema: + +### Azure Resource Graph platform + +Representation of a virtual machine resource (type `Microsoft.Compute/virtualMachines`) from [Azure Resource Graph](https://learn.microsoft.com/en-us/azure/governance/resource-graph/overview).
+ +| Field | Description | +|-------|-------------| +|tool_first_seen| Timestamp of the earliest available logs for the host since the first sync| +|tool_last_seen| Timestamp of the latest available logs for the host as of the last sync| +|lastupdated| Timestamp of the last time the node was updated| +|**hostname**| The host name (the Azure resource `name` of the virtual machine)| +|short_hostname| The short hostname, lowercase| +|instance_id| Azure VM id (`vmId`)| +|resource_id| Azure resource id, lower-cased| +|subscription_id| Azure subscription_id| +|subscription_name| Azure subscription_name| +|resource_group| Azure resource_group| +|type| Azure type| +|osname| Computer name from the VM's OS profile| +|ostype| Operating system type of the VM's OS disk| +|vm_status| Power state of the VM (display status)| +|image_publisher| Azure image_publisher| +|image_offer| Azure image_offer| +|image_sku| Azure image_sku| +|image_galleryid| Azure image_galleryid| +|public_ip_name| Azure public_ip_name| +|public_ip_allocation_method| Azure public_ip_allocation_method| +|public_ip| Azure public_ip| +|nsg_id| Azure nsg_id| +|tags_costcenter| Azure tag costcenter| +|tags_engcontact| Azure tag engcontact| +|tags_businesscontact| Azure tag businesscontact| +|tags_environment| Azure tag environment| diff --git a/setup.py b/setup.py index 59bab4d36..10fdb4402 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,7 @@ "azure-mgmt-storage>=16.0.0", "azure-mgmt-sql<=1.0.0", "azure-identity>=1.5.0", + "azure-mgmt-resourcegraph>=8.0.0", "kubernetes>=22.6.0", "pdpyras>=4.3.0", "crowdstrike-falconpy>=0.5.1",