diff --git a/api/CHANGELOG.md b/api/CHANGELOG.md index 729325ca38..2d7342a293 100644 --- a/api/CHANGELOG.md +++ b/api/CHANGELOG.md @@ -20,6 +20,7 @@ All notable changes to the **Prowler API** are documented in this file. - Finding groups aggregated `status` now treats muted findings as resolved: a group is `FAIL` only while at least one non-muted FAIL remains, otherwise it is `PASS` (including fully-muted groups). The `filter[status]` filter and the `sort=status` ordering share the same semantics, keeping `status` consistent with `fail_count` and the orthogonal `muted` flag [(#10825)](https://github.com/prowler-cloud/prowler/pull/10825) - `aggregate_findings` is now idempotent: it deletes the scan's existing `ScanSummary` rows before `bulk_create`, so re-runs (such as the post-mute reaggregation pipeline) no longer violate the `unique_scan_summary` constraint and no longer abort the downstream `DailySeveritySummary` / `FindingGroupDailySummary` recomputation for the affected scan [(#10827)](https://github.com/prowler-cloud/prowler/pull/10827) +- Attack Paths: Findings on AWS were silently dropped during the Neo4j merge for resources whose Cartography node is keyed by a short identifier (e.g. EC2 instances) rather than the full ARN [(#10839)](https://github.com/prowler-cloud/prowler/pull/10839) --- diff --git a/api/src/backend/tasks/jobs/attack_paths/aws.py b/api/src/backend/tasks/jobs/attack_paths/aws.py index 7248cc39e8..4acb37d641 100644 --- a/api/src/backend/tasks/jobs/attack_paths/aws.py +++ b/api/src/backend/tasks/jobs/attack_paths/aws.py @@ -313,3 +313,16 @@ def sync_aws_account( ) return failed_syncs + + +def extract_short_uid(uid: str) -> str: + """Return the short identifier from an AWS ARN or resource ID. + + Supported inputs end in one of: + - `/` (e.g. `instance/i-xxx`) + - `:` (e.g. `function:name`) + - `` (e.g. `bucket-name` or `i-xxx`) + + If `uid` is already a short resource ID, it is returned unchanged. + """ + return uid.rsplit("/", 1)[-1].rsplit(":", 1)[-1] diff --git a/api/src/backend/tasks/jobs/attack_paths/config.py b/api/src/backend/tasks/jobs/attack_paths/config.py index 5f5c523ceb..0816626b67 100644 --- a/api/src/backend/tasks/jobs/attack_paths/config.py +++ b/api/src/backend/tasks/jobs/attack_paths/config.py @@ -37,6 +37,8 @@ class ProviderConfig: # Label for resources connected to the account node, enabling indexed finding lookups. resource_label: str # e.g., "_AWSResource" ingestion_function: Callable + # Maps a Postgres resource UID (e.g. full ARN) to the short-id form Cartography stores on some node types (e.g. `i-xxx` for EC2Instance). + short_uid_extractor: Callable[[str], str] # Provider Configurations @@ -48,6 +50,7 @@ AWS_CONFIG = ProviderConfig( uid_field="arn", resource_label="_AWSResource", ingestion_function=aws.start_aws_ingestion, + short_uid_extractor=aws.extract_short_uid, ) PROVIDER_CONFIGS: dict[str, ProviderConfig] = { @@ -116,6 +119,21 @@ def get_provider_resource_label(provider_type: str) -> str: return config.resource_label if config else "_UnknownProviderResource" +def _identity_short_uid(uid: str) -> str: + """Fallback short-uid extractor for providers without a custom mapping.""" + return uid + + +def get_short_uid_extractor(provider_type: str) -> Callable[[str], str]: + """Get the short-uid extractor for a provider type. + + Returns an identity function when the provider is unknown, so callers can + rely on a callable always being returned. + """ + config = PROVIDER_CONFIGS.get(provider_type) + return config.short_uid_extractor if config else _identity_short_uid + + # Dynamic Isolation Label Helpers # -------------------------------- diff --git a/api/src/backend/tasks/jobs/attack_paths/findings.py b/api/src/backend/tasks/jobs/attack_paths/findings.py index 0b2ecb4c45..3581f0ca0f 100644 --- a/api/src/backend/tasks/jobs/attack_paths/findings.py +++ b/api/src/backend/tasks/jobs/attack_paths/findings.py @@ -8,7 +8,7 @@ This module handles: """ from collections import defaultdict -from typing import Any, Generator +from typing import Any, Callable, Generator from uuid import UUID import neo4j @@ -21,6 +21,7 @@ from tasks.jobs.attack_paths.config import ( get_node_uid_field, get_provider_resource_label, get_root_node_label, + get_short_uid_extractor, ) from tasks.jobs.attack_paths.queries import ( ADD_RESOURCE_LABEL_TEMPLATE, @@ -57,7 +58,9 @@ _DB_QUERY_FIELDS = [ ] -def _to_neo4j_dict(record: dict[str, Any], resource_uid: str) -> dict[str, Any]: +def _to_neo4j_dict( + record: dict[str, Any], resource_uid: str, resource_short_uid: str +) -> dict[str, Any]: """Transform a Django `.values()` record into a `dict` ready for Neo4j ingestion.""" return { "id": str(record["id"]), @@ -75,6 +78,7 @@ def _to_neo4j_dict(record: dict[str, Any], resource_uid: str) -> dict[str, Any]: "muted": record["muted"], "muted_reason": record["muted_reason"], "resource_uid": resource_uid, + "resource_short_uid": resource_short_uid, } @@ -170,6 +174,8 @@ def load_findings( batch_num = 0 total_records = 0 + edges_merged = 0 + edges_dropped = 0 for batch in findings_batches: batch_num += 1 batch_size = len(batch) @@ -178,9 +184,15 @@ def load_findings( parameters["findings_data"] = batch logger.info(f"Loading findings batch {batch_num} ({batch_size} records)") - neo4j_session.run(query, parameters) + summary = neo4j_session.run(query, parameters).single() + if summary is not None: + edges_merged += summary.get("merged_count", 0) + edges_dropped += summary.get("dropped_count", 0) - logger.info(f"Finished loading {total_records} records in {batch_num} batches") + logger.info( + f"Finished loading {total_records} records in {batch_num} batches " + f"(edges_merged={edges_merged}, edges_dropped={edges_dropped})" + ) return total_records @@ -205,8 +217,9 @@ def stream_findings_with_resources( ) tenant_id = prowler_api_provider.tenant_id + short_uid_extractor = get_short_uid_extractor(prowler_api_provider.provider) for batch in _paginate_findings(tenant_id, scan_id): - enriched = _enrich_batch_with_resources(batch, tenant_id) + enriched = _enrich_batch_with_resources(batch, tenant_id, short_uid_extractor) if enriched: yield enriched @@ -269,6 +282,7 @@ def _fetch_findings_batch( def _enrich_batch_with_resources( findings_batch: list[dict[str, Any]], tenant_id: str, + short_uid_extractor: Callable[[str], str], ) -> list[dict[str, Any]]: """ Enrich findings with their resource UIDs. @@ -280,7 +294,7 @@ def _enrich_batch_with_resources( resource_map = _build_finding_resource_map(finding_ids, tenant_id) return [ - _to_neo4j_dict(finding, resource_uid) + _to_neo4j_dict(finding, resource_uid, short_uid_extractor(resource_uid)) for finding in findings_batch for resource_uid in resource_map.get(finding["id"], []) ] diff --git a/api/src/backend/tasks/jobs/attack_paths/queries.py b/api/src/backend/tasks/jobs/attack_paths/queries.py index 26ffa32f92..eb1d82a96e 100644 --- a/api/src/backend/tasks/jobs/attack_paths/queries.py +++ b/api/src/backend/tasks/jobs/attack_paths/queries.py @@ -35,46 +35,56 @@ INSERT_FINDING_TEMPLATE = f""" UNWIND $findings_data AS finding_data OPTIONAL MATCH (resource_by_uid:__RESOURCE_LABEL__ {{__NODE_UID_FIELD__: finding_data.resource_uid}}) - WITH finding_data, resource_by_uid - OPTIONAL MATCH (resource_by_id:__RESOURCE_LABEL__ {{id: finding_data.resource_uid}}) WHERE resource_by_uid IS NULL - WITH finding_data, COALESCE(resource_by_uid, resource_by_id) AS resource - WHERE resource IS NOT NULL + OPTIONAL MATCH (resource_by_short:__RESOURCE_LABEL__ {{id: finding_data.resource_short_uid}}) + WHERE resource_by_uid IS NULL AND resource_by_id IS NULL + WITH finding_data, + resource_by_uid, + resource_by_id, + head(collect(resource_by_short)) AS resource_by_short + WITH finding_data, + COALESCE(resource_by_uid, resource_by_id, resource_by_short) AS resource - MERGE (finding:{PROWLER_FINDING_LABEL} {{id: finding_data.id}}) - ON CREATE SET - finding.id = finding_data.id, - finding.uid = finding_data.uid, - finding.inserted_at = finding_data.inserted_at, - finding.updated_at = finding_data.updated_at, - finding.first_seen_at = finding_data.first_seen_at, - finding.scan_id = finding_data.scan_id, - finding.delta = finding_data.delta, - finding.status = finding_data.status, - finding.status_extended = finding_data.status_extended, - finding.severity = finding_data.severity, - finding.check_id = finding_data.check_id, - finding.check_title = finding_data.check_title, - finding.muted = finding_data.muted, - finding.muted_reason = finding_data.muted_reason, - finding.firstseen = timestamp(), - finding.lastupdated = $last_updated, - finding._module_name = 'cartography:prowler', - finding._module_version = $prowler_version - ON MATCH SET - finding.status = finding_data.status, - finding.status_extended = finding_data.status_extended, - finding.lastupdated = $last_updated + FOREACH (_ IN CASE WHEN resource IS NOT NULL THEN [1] ELSE [] END | + MERGE (finding:{PROWLER_FINDING_LABEL} {{id: finding_data.id}}) + ON CREATE SET + finding.id = finding_data.id, + finding.uid = finding_data.uid, + finding.inserted_at = finding_data.inserted_at, + finding.updated_at = finding_data.updated_at, + finding.first_seen_at = finding_data.first_seen_at, + finding.scan_id = finding_data.scan_id, + finding.delta = finding_data.delta, + finding.status = finding_data.status, + finding.status_extended = finding_data.status_extended, + finding.severity = finding_data.severity, + finding.check_id = finding_data.check_id, + finding.check_title = finding_data.check_title, + finding.muted = finding_data.muted, + finding.muted_reason = finding_data.muted_reason, + finding.firstseen = timestamp(), + finding.lastupdated = $last_updated, + finding._module_name = 'cartography:prowler', + finding._module_version = $prowler_version + ON MATCH SET + finding.status = finding_data.status, + finding.status_extended = finding_data.status_extended, + finding.lastupdated = $last_updated + MERGE (resource)-[rel:HAS_FINDING]->(finding) + ON CREATE SET + rel.firstseen = timestamp(), + rel.lastupdated = $last_updated, + rel._module_name = 'cartography:prowler', + rel._module_version = $prowler_version + ON MATCH SET + rel.lastupdated = $last_updated + ) - MERGE (resource)-[rel:HAS_FINDING]->(finding) - ON CREATE SET - rel.firstseen = timestamp(), - rel.lastupdated = $last_updated, - rel._module_name = 'cartography:prowler', - rel._module_version = $prowler_version - ON MATCH SET - rel.lastupdated = $last_updated + WITH sum(CASE WHEN resource IS NOT NULL THEN 1 ELSE 0 END) AS merged_count, + sum(CASE WHEN resource IS NULL THEN 1 ELSE 0 END) AS dropped_count + + RETURN merged_count, dropped_count """ # Internet queries (used by internet.py) diff --git a/api/src/backend/tasks/tests/test_attack_paths_scan.py b/api/src/backend/tasks/tests/test_attack_paths_scan.py index 283c0650e1..986a2f5b2c 100644 --- a/api/src/backend/tasks/tests/test_attack_paths_scan.py +++ b/api/src/backend/tasks/tests/test_attack_paths_scan.py @@ -1285,6 +1285,12 @@ class TestAttackPathsFindingsHelpers: config = SimpleNamespace(update_tag=12345) mock_session = MagicMock() + first_result = MagicMock() + first_result.single.return_value = {"merged_count": 1, "dropped_count": 0} + second_result = MagicMock() + second_result.single.return_value = {"merged_count": 0, "dropped_count": 1} + mock_session.run.side_effect = [first_result, second_result] + with ( patch( "tasks.jobs.attack_paths.findings.get_node_uid_field", @@ -1294,6 +1300,7 @@ class TestAttackPathsFindingsHelpers: "tasks.jobs.attack_paths.findings.get_provider_resource_label", return_value="_AWSResource", ), + patch("tasks.jobs.attack_paths.findings.logger") as mock_logger, ): findings_module.load_findings( mock_session, findings_generator(), provider, config @@ -1305,6 +1312,14 @@ class TestAttackPathsFindingsHelpers: assert params["last_updated"] == config.update_tag assert "findings_data" in params + summary_log = next( + call_args.args[0] + for call_args in mock_logger.info.call_args_list + if call_args.args and "Finished loading" in call_args.args[0] + ) + assert "edges_merged=1" in summary_log + assert "edges_dropped=1" in summary_log + def test_stream_findings_with_resources_returns_latest_scan_data( self, tenants_fixture, @@ -1484,11 +1499,12 @@ class TestAttackPathsFindingsHelpers: "default", ): result = findings_module._enrich_batch_with_resources( - [finding_dict], str(tenant.id) + [finding_dict], str(tenant.id), lambda uid: f"short:{uid}" ) assert len(result) == 1 assert result[0]["resource_uid"] == resource.uid + assert result[0]["resource_short_uid"] == f"short:{resource.uid}" assert result[0]["id"] == str(finding.id) assert result[0]["status"] == "FAIL" @@ -1572,7 +1588,7 @@ class TestAttackPathsFindingsHelpers: "default", ): result = findings_module._enrich_batch_with_resources( - [finding_dict], str(tenant.id) + [finding_dict], str(tenant.id), lambda uid: uid ) assert len(result) == 3 @@ -1646,7 +1662,7 @@ class TestAttackPathsFindingsHelpers: patch("tasks.jobs.attack_paths.findings.logger") as mock_logger, ): result = findings_module._enrich_batch_with_resources( - [finding_dict], str(tenant.id) + [finding_dict], str(tenant.id), lambda uid: uid ) assert len(result) == 0 @@ -1693,6 +1709,63 @@ class TestAttackPathsFindingsHelpers: mock_session.run.assert_not_called() + @pytest.mark.parametrize( + "uid, expected", + [ + ( + "arn:aws:ec2:us-east-1:552455647653:instance/i-05075b63eb51baacb", + "i-05075b63eb51baacb", + ), + ( + "arn:aws:ec2:us-east-1:123456789012:volume/vol-0abcd1234ef567890", + "vol-0abcd1234ef567890", + ), + ( + "arn:aws:ec2:us-east-1:123456789012:security-group/sg-0123abcd", + "sg-0123abcd", + ), + ("arn:aws:s3:::my-bucket-name", "my-bucket-name"), + ("arn:aws:iam::123456789012:role/MyRole", "MyRole"), + ( + "arn:aws:lambda:us-east-1:123456789012:function:my-function", + "my-function", + ), + ("i-05075b63eb51baacb", "i-05075b63eb51baacb"), + ], + ) + def test_extract_short_uid_aws_variants(self, uid, expected): + from tasks.jobs.attack_paths.aws import extract_short_uid + + assert extract_short_uid(uid) == expected + + def test_insert_finding_template_has_short_id_fallback(self): + from tasks.jobs.attack_paths.queries import ( + INSERT_FINDING_TEMPLATE, + render_cypher_template, + ) + + rendered = render_cypher_template( + INSERT_FINDING_TEMPLATE, + { + "__NODE_UID_FIELD__": "arn", + "__RESOURCE_LABEL__": "_AWSResource", + }, + ) + + assert ( + "resource_by_uid:_AWSResource {arn: finding_data.resource_uid}" in rendered + ) + assert "resource_by_id:_AWSResource {id: finding_data.resource_uid}" in rendered + assert ( + "resource_by_short:_AWSResource {id: finding_data.resource_short_uid}" + in rendered + ) + assert "head(collect(resource_by_short)) AS resource_by_short" in rendered + assert ( + "COALESCE(resource_by_uid, resource_by_id, resource_by_short)" in rendered + ) + assert "RETURN merged_count, dropped_count" in rendered + class TestAddResourceLabel: def test_add_resource_label_applies_private_label(self):