fix(api): merge Attack Paths findings on short UIDs for AWS resources (#10841)

Co-authored-by: Josema Camacho <josema@prowler.com>
2026-07-24 13:01:56 +00:00 · 2026-04-22 12:35:48 +02:00
parent 6d88a402c9
commit 0433c4ad64
6 changed files with 174 additions and 45 deletions
@@ -12,6 +12,7 @@ All notable changes to the **Prowler API** are documented in this file.

 - Finding groups aggregated `status` now treats muted findings as resolved: a group is `FAIL` only while at least one non-muted FAIL remains, otherwise it is `PASS` (including fully-muted groups). The `filter[status]` filter and the `sort=status` ordering share the same semantics, keeping `status` consistent with `fail_count` and the orthogonal `muted` flag [(#10825)](https://github.com/prowler-cloud/prowler/pull/10825)
 - `aggregate_findings` is now idempotent: it deletes the scan's existing `ScanSummary` rows before `bulk_create`, so re-runs (such as the post-mute reaggregation pipeline) no longer violate the `unique_scan_summary` constraint and no longer abort the downstream `DailySeveritySummary` / `FindingGroupDailySummary` recomputation for the affected scan [(#10827)](https://github.com/prowler-cloud/prowler/pull/10827)
+- Attack Paths: Findings on AWS were silently dropped during the Neo4j merge for resources whose Cartography node is keyed by a short identifier (e.g. EC2 instances) rather than the full ARN [(#10839)](https://github.com/prowler-cloud/prowler/pull/10839)

 ---

@@ -313,3 +313,16 @@ def sync_aws_account(
            )

    return failed_syncs
+
+
+def extract_short_uid(uid: str) -> str:
+    """Return the short identifier from an AWS ARN or resource ID.
+
+    Supported inputs end in one of:
+      - `<type>/<id>`        (e.g. `instance/i-xxx`)
+      - `<type>:<id>`        (e.g. `function:name`)
+      - `<id>`               (e.g. `bucket-name` or `i-xxx`)
+
+    If `uid` is already a short resource ID, it is returned unchanged.
+    """
+    return uid.rsplit("/", 1)[-1].rsplit(":", 1)[-1]
@@ -37,6 +37,8 @@ class ProviderConfig:
    # Label for resources connected to the account node, enabling indexed finding lookups.
    resource_label: str  # e.g., "_AWSResource"
    ingestion_function: Callable
+    # Maps a Postgres resource UID (e.g. full ARN) to the short-id form Cartography stores on some node types (e.g. `i-xxx` for EC2Instance).
+    short_uid_extractor: Callable[[str], str]


 # Provider Configurations
@@ -48,6 +50,7 @@ AWS_CONFIG = ProviderConfig(
    uid_field="arn",
    resource_label="_AWSResource",
    ingestion_function=aws.start_aws_ingestion,
+    short_uid_extractor=aws.extract_short_uid,
 )

 PROVIDER_CONFIGS: dict[str, ProviderConfig] = {
@@ -116,6 +119,21 @@ def get_provider_resource_label(provider_type: str) -> str:
    return config.resource_label if config else "_UnknownProviderResource"


+def _identity_short_uid(uid: str) -> str:
+    """Fallback short-uid extractor for providers without a custom mapping."""
+    return uid
+
+
+def get_short_uid_extractor(provider_type: str) -> Callable[[str], str]:
+    """Get the short-uid extractor for a provider type.
+
+    Returns an identity function when the provider is unknown, so callers can
+    rely on a callable always being returned.
+    """
+    config = PROVIDER_CONFIGS.get(provider_type)
+    return config.short_uid_extractor if config else _identity_short_uid
+
+
 # Dynamic Isolation Label Helpers
 # --------------------------------

@@ -8,7 +8,7 @@ This module handles:
 """

 from collections import defaultdict
-from typing import Any, Generator
+from typing import Any, Callable, Generator
 from uuid import UUID

 import neo4j
@@ -21,6 +21,7 @@ from tasks.jobs.attack_paths.config import (
    get_node_uid_field,
    get_provider_resource_label,
    get_root_node_label,
+    get_short_uid_extractor,
 )
 from tasks.jobs.attack_paths.queries import (
    ADD_RESOURCE_LABEL_TEMPLATE,
@@ -57,7 +58,9 @@ _DB_QUERY_FIELDS = [
 ]


-def _to_neo4j_dict(record: dict[str, Any], resource_uid: str) -> dict[str, Any]:
+def _to_neo4j_dict(
+    record: dict[str, Any], resource_uid: str, resource_short_uid: str
+) -> dict[str, Any]:
    """Transform a Django `.values()` record into a `dict` ready for Neo4j ingestion."""
    return {
        "id": str(record["id"]),
@@ -75,6 +78,7 @@ def _to_neo4j_dict(record: dict[str, Any], resource_uid: str) -> dict[str, Any]:
        "muted": record["muted"],
        "muted_reason": record["muted_reason"],
        "resource_uid": resource_uid,
+        "resource_short_uid": resource_short_uid,
    }


@@ -170,6 +174,8 @@ def load_findings(

    batch_num = 0
    total_records = 0
+    edges_merged = 0
+    edges_dropped = 0
    for batch in findings_batches:
        batch_num += 1
        batch_size = len(batch)
@@ -178,9 +184,15 @@ def load_findings(
        parameters["findings_data"] = batch

        logger.info(f"Loading findings batch {batch_num} ({batch_size} records)")
-        neo4j_session.run(query, parameters)
+        summary = neo4j_session.run(query, parameters).single()
+        if summary is not None:
+            edges_merged += summary.get("merged_count", 0)
+            edges_dropped += summary.get("dropped_count", 0)

-    logger.info(f"Finished loading {total_records} records in {batch_num} batches")
+    logger.info(
+        f"Finished loading {total_records} records in {batch_num} batches "
+        f"(edges_merged={edges_merged}, edges_dropped={edges_dropped})"
+    )
    return total_records


@@ -205,8 +217,9 @@ def stream_findings_with_resources(
    )

    tenant_id = prowler_api_provider.tenant_id
+    short_uid_extractor = get_short_uid_extractor(prowler_api_provider.provider)
    for batch in _paginate_findings(tenant_id, scan_id):
-        enriched = _enrich_batch_with_resources(batch, tenant_id)
+        enriched = _enrich_batch_with_resources(batch, tenant_id, short_uid_extractor)
        if enriched:
            yield enriched

@@ -269,6 +282,7 @@ def _fetch_findings_batch(
 def _enrich_batch_with_resources(
    findings_batch: list[dict[str, Any]],
    tenant_id: str,
+    short_uid_extractor: Callable[[str], str],
 ) -> list[dict[str, Any]]:
    """
    Enrich findings with their resource UIDs.
@@ -280,7 +294,7 @@ def _enrich_batch_with_resources(
    resource_map = _build_finding_resource_map(finding_ids, tenant_id)

    return [
-        _to_neo4j_dict(finding, resource_uid)
+        _to_neo4j_dict(finding, resource_uid, short_uid_extractor(resource_uid))
        for finding in findings_batch
        for resource_uid in resource_map.get(finding["id"], [])
    ]
@@ -35,46 +35,56 @@ INSERT_FINDING_TEMPLATE = f"""
    UNWIND $findings_data AS finding_data

    OPTIONAL MATCH (resource_by_uid:__RESOURCE_LABEL__ {{__NODE_UID_FIELD__: finding_data.resource_uid}})
-    WITH finding_data, resource_by_uid
-
    OPTIONAL MATCH (resource_by_id:__RESOURCE_LABEL__ {{id: finding_data.resource_uid}})
        WHERE resource_by_uid IS NULL
-    WITH finding_data, COALESCE(resource_by_uid, resource_by_id) AS resource
-        WHERE resource IS NOT NULL
+    OPTIONAL MATCH (resource_by_short:__RESOURCE_LABEL__ {{id: finding_data.resource_short_uid}})
+        WHERE resource_by_uid IS NULL AND resource_by_id IS NULL
+    WITH finding_data,
+         resource_by_uid,
+         resource_by_id,
+         head(collect(resource_by_short)) AS resource_by_short
+    WITH finding_data,
+         COALESCE(resource_by_uid, resource_by_id, resource_by_short) AS resource

-    MERGE (finding:{PROWLER_FINDING_LABEL} {{id: finding_data.id}})
-        ON CREATE SET
-            finding.id = finding_data.id,
-            finding.uid = finding_data.uid,
-            finding.inserted_at = finding_data.inserted_at,
-            finding.updated_at = finding_data.updated_at,
-            finding.first_seen_at = finding_data.first_seen_at,
-            finding.scan_id = finding_data.scan_id,
-            finding.delta = finding_data.delta,
-            finding.status = finding_data.status,
-            finding.status_extended = finding_data.status_extended,
-            finding.severity = finding_data.severity,
-            finding.check_id = finding_data.check_id,
-            finding.check_title = finding_data.check_title,
-            finding.muted = finding_data.muted,
-            finding.muted_reason = finding_data.muted_reason,
-            finding.firstseen = timestamp(),
-            finding.lastupdated = $last_updated,
-            finding._module_name = 'cartography:prowler',
-            finding._module_version = $prowler_version
-        ON MATCH SET
-            finding.status = finding_data.status,
-            finding.status_extended = finding_data.status_extended,
-            finding.lastupdated = $last_updated
+    FOREACH (_ IN CASE WHEN resource IS NOT NULL THEN [1] ELSE [] END |
+        MERGE (finding:{PROWLER_FINDING_LABEL} {{id: finding_data.id}})
+            ON CREATE SET
+                finding.id = finding_data.id,
+                finding.uid = finding_data.uid,
+                finding.inserted_at = finding_data.inserted_at,
+                finding.updated_at = finding_data.updated_at,
+                finding.first_seen_at = finding_data.first_seen_at,
+                finding.scan_id = finding_data.scan_id,
+                finding.delta = finding_data.delta,
+                finding.status = finding_data.status,
+                finding.status_extended = finding_data.status_extended,
+                finding.severity = finding_data.severity,
+                finding.check_id = finding_data.check_id,
+                finding.check_title = finding_data.check_title,
+                finding.muted = finding_data.muted,
+                finding.muted_reason = finding_data.muted_reason,
+                finding.firstseen = timestamp(),
+                finding.lastupdated = $last_updated,
+                finding._module_name = 'cartography:prowler',
+                finding._module_version = $prowler_version
+            ON MATCH SET
+                finding.status = finding_data.status,
+                finding.status_extended = finding_data.status_extended,
+                finding.lastupdated = $last_updated
+        MERGE (resource)-[rel:HAS_FINDING]->(finding)
+            ON CREATE SET
+                rel.firstseen = timestamp(),
+                rel.lastupdated = $last_updated,
+                rel._module_name = 'cartography:prowler',
+                rel._module_version = $prowler_version
+            ON MATCH SET
+                rel.lastupdated = $last_updated
+    )

-    MERGE (resource)-[rel:HAS_FINDING]->(finding)
-        ON CREATE SET
-            rel.firstseen = timestamp(),
-            rel.lastupdated = $last_updated,
-            rel._module_name = 'cartography:prowler',
-            rel._module_version = $prowler_version
-        ON MATCH SET
-            rel.lastupdated = $last_updated
+    WITH sum(CASE WHEN resource IS NOT NULL THEN 1 ELSE 0 END) AS merged_count,
+         sum(CASE WHEN resource IS NULL THEN 1 ELSE 0 END) AS dropped_count
+
+    RETURN merged_count, dropped_count
 """

 # Internet queries (used by internet.py)
@@ -1285,6 +1285,12 @@ class TestAttackPathsFindingsHelpers:
        config = SimpleNamespace(update_tag=12345)
        mock_session = MagicMock()

+        first_result = MagicMock()
+        first_result.single.return_value = {"merged_count": 1, "dropped_count": 0}
+        second_result = MagicMock()
+        second_result.single.return_value = {"merged_count": 0, "dropped_count": 1}
+        mock_session.run.side_effect = [first_result, second_result]
+
        with (
            patch(
                "tasks.jobs.attack_paths.findings.get_node_uid_field",
@@ -1294,6 +1300,7 @@ class TestAttackPathsFindingsHelpers:
                "tasks.jobs.attack_paths.findings.get_provider_resource_label",
                return_value="_AWSResource",
            ),
+            patch("tasks.jobs.attack_paths.findings.logger") as mock_logger,
        ):
            findings_module.load_findings(
                mock_session, findings_generator(), provider, config
@@ -1305,6 +1312,14 @@ class TestAttackPathsFindingsHelpers:
            assert params["last_updated"] == config.update_tag
            assert "findings_data" in params

+        summary_log = next(
+            call_args.args[0]
+            for call_args in mock_logger.info.call_args_list
+            if call_args.args and "Finished loading" in call_args.args[0]
+        )
+        assert "edges_merged=1" in summary_log
+        assert "edges_dropped=1" in summary_log
+
    def test_stream_findings_with_resources_returns_latest_scan_data(
        self,
        tenants_fixture,
@@ -1484,11 +1499,12 @@ class TestAttackPathsFindingsHelpers:
            "default",
        ):
            result = findings_module._enrich_batch_with_resources(
-                [finding_dict], str(tenant.id)
+                [finding_dict], str(tenant.id), lambda uid: f"short:{uid}"
            )

        assert len(result) == 1
        assert result[0]["resource_uid"] == resource.uid
+        assert result[0]["resource_short_uid"] == f"short:{resource.uid}"
        assert result[0]["id"] == str(finding.id)
        assert result[0]["status"] == "FAIL"

@@ -1572,7 +1588,7 @@ class TestAttackPathsFindingsHelpers:
            "default",
        ):
            result = findings_module._enrich_batch_with_resources(
-                [finding_dict], str(tenant.id)
+                [finding_dict], str(tenant.id), lambda uid: uid
            )

        assert len(result) == 3
@@ -1646,7 +1662,7 @@ class TestAttackPathsFindingsHelpers:
            patch("tasks.jobs.attack_paths.findings.logger") as mock_logger,
        ):
            result = findings_module._enrich_batch_with_resources(
-                [finding_dict], str(tenant.id)
+                [finding_dict], str(tenant.id), lambda uid: uid
            )

        assert len(result) == 0
@@ -1693,6 +1709,63 @@ class TestAttackPathsFindingsHelpers:

        mock_session.run.assert_not_called()

+    @pytest.mark.parametrize(
+        "uid, expected",
+        [
+            (
+                "arn:aws:ec2:us-east-1:552455647653:instance/i-05075b63eb51baacb",
+                "i-05075b63eb51baacb",
+            ),
+            (
+                "arn:aws:ec2:us-east-1:123456789012:volume/vol-0abcd1234ef567890",
+                "vol-0abcd1234ef567890",
+            ),
+            (
+                "arn:aws:ec2:us-east-1:123456789012:security-group/sg-0123abcd",
+                "sg-0123abcd",
+            ),
+            ("arn:aws:s3:::my-bucket-name", "my-bucket-name"),
+            ("arn:aws:iam::123456789012:role/MyRole", "MyRole"),
+            (
+                "arn:aws:lambda:us-east-1:123456789012:function:my-function",
+                "my-function",
+            ),
+            ("i-05075b63eb51baacb", "i-05075b63eb51baacb"),
+        ],
+    )
+    def test_extract_short_uid_aws_variants(self, uid, expected):
+        from tasks.jobs.attack_paths.aws import extract_short_uid
+
+        assert extract_short_uid(uid) == expected
+
+    def test_insert_finding_template_has_short_id_fallback(self):
+        from tasks.jobs.attack_paths.queries import (
+            INSERT_FINDING_TEMPLATE,
+            render_cypher_template,
+        )
+
+        rendered = render_cypher_template(
+            INSERT_FINDING_TEMPLATE,
+            {
+                "__NODE_UID_FIELD__": "arn",
+                "__RESOURCE_LABEL__": "_AWSResource",
+            },
+        )
+
+        assert (
+            "resource_by_uid:_AWSResource {arn: finding_data.resource_uid}" in rendered
+        )
+        assert "resource_by_id:_AWSResource {id: finding_data.resource_uid}" in rendered
+        assert (
+            "resource_by_short:_AWSResource {id: finding_data.resource_short_uid}"
+            in rendered
+        )
+        assert "head(collect(resource_by_short)) AS resource_by_short" in rendered
+        assert (
+            "COALESCE(resource_by_uid, resource_by_id, resource_by_short)" in rendered
+        )
+        assert "RETURN merged_count, dropped_count" in rendered
+

 class TestAddResourceLabel:
    def test_add_resource_label_applies_private_label(self):