diff --git a/api/CHANGELOG.md b/api/CHANGELOG.md index fc367813f7..5b5ea6b97e 100644 --- a/api/CHANGELOG.md +++ b/api/CHANGELOG.md @@ -4,9 +4,11 @@ All notable changes to the **Prowler API** are documented in this file. ## [1.31.1] (Prowler UNRELEASED) + ### 🐞 Fixed - `compliance-overviews/attributes` now resolves the provider from the scan, so multi-provider universal frameworks (e.g. CSA CCM) return the check IDs of the scan's provider and Azure/GCP requirement details show their findings instead of appearing empty [(#11546)](https://github.com/prowler-cloud/prowler/pull/11546) +- Attack Paths: `drop_subgraph` now deletes relationships first and then nodes in batches, using less memory on Neo4j when clearing a dense provider graph [(#11557)](https://github.com/prowler-cloud/prowler/pull/11557) - OCI scans now use API key credentials with the configured region instead of falling back to `/home/prowler/.oci/config` [(#11558)](https://github.com/prowler-cloud/prowler/pull/11558) --- diff --git a/api/src/backend/api/attack_paths/database.py b/api/src/backend/api/attack_paths/database.py index d5cc1698a7..0e6cc083dc 100644 --- a/api/src/backend/api/attack_paths/database.py +++ b/api/src/backend/api/attack_paths/database.py @@ -175,7 +175,8 @@ def drop_subgraph(database: str, provider_id: str) -> int: """ Delete all nodes for a provider from the tenant database. - Uses batched deletion to avoid memory issues with large graphs. + Deletes relationships then nodes in batches (not `DETACH DELETE`) so a dense + provider's graph cannot exceed Neo4j's transaction memory limit. Silently returns 0 if the database doesn't exist. """ provider_label = get_provider_label(provider_id) @@ -183,13 +184,28 @@ def drop_subgraph(database: str, provider_id: str) -> int: try: with get_session(database) as session: + # Phase 1: delete relationships incident to provider nodes in batches. + deleted_count = 1 + while deleted_count > 0: + result = session.run( + f""" + MATCH (:`{provider_label}`)-[r]-() + WITH DISTINCT r LIMIT $batch_size + DELETE r + RETURN COUNT(r) AS deleted_rels_count + """, + {"batch_size": BATCH_SIZE}, + ) + deleted_count = result.single().get("deleted_rels_count", 0) + + # Phase 2: delete the now relationship-free nodes in batches. deleted_count = 1 while deleted_count > 0: result = session.run( f""" MATCH (n:{PROVIDER_RESOURCE_LABEL}:`{provider_label}`) WITH n LIMIT $batch_size - DETACH DELETE n + DELETE n RETURN COUNT(n) AS deleted_nodes_count """, {"batch_size": BATCH_SIZE}, diff --git a/api/src/backend/api/tests/test_attack_paths_database.py b/api/src/backend/api/tests/test_attack_paths_database.py index 3a29a1007d..7ca8a4accb 100644 --- a/api/src/backend/api/tests/test_attack_paths_database.py +++ b/api/src/backend/api/tests/test_attack_paths_database.py @@ -542,3 +542,84 @@ class TestHasProviderData: ): with pytest.raises(db_module.GraphDatabaseQueryException): db_module.has_provider_data("db-tenant-abc", "provider-123") + + +class TestDropSubgraph: + """Test drop_subgraph two-phase batched deletion of a provider's graph.""" + + @staticmethod + def _result(count): + result = MagicMock() + result.single.return_value.get.return_value = count + return result + + @staticmethod + def _session_ctx(session): + ctx = MagicMock() + ctx.__enter__.return_value = session + ctx.__exit__.return_value = False + return ctx + + def test_deletes_relationships_then_nodes_in_batches(self): + session = MagicMock() + # Phase 1 (relationships): one full batch then empty. + # Phase 2 (nodes): one full batch then empty. + session.run.side_effect = [ + self._result(1000), + self._result(0), + self._result(1000), + self._result(0), + ] + + with patch( + "api.attack_paths.database.get_session", + return_value=self._session_ctx(session), + ): + deleted = db_module.drop_subgraph("db-tenant-abc", "provider-123") + + # Only phase-2 node counts contribute to the return value. + assert deleted == 1000 + assert session.run.call_count == 4 + + queries = [call.args[0] for call in session.run.call_args_list] + + # Regression guard: the memory blow-up was caused by DETACH DELETE. + assert all("DETACH DELETE" not in query for query in queries) + + rel_queries = [query for query in queries if "DELETE r" in query] + node_queries = [query for query in queries if "DELETE n" in query] + assert rel_queries and node_queries + # DISTINCT avoids double-counting relationships matched from both ends. + assert all("DISTINCT r" in query for query in rel_queries) + + # Relationships must be fully drained before nodes are deleted. + first_node = next(i for i, q in enumerate(queries) if "DELETE n" in q) + last_rel = max(i for i, q in enumerate(queries) if "DELETE r" in q) + assert last_rel < first_node + + def test_returns_zero_when_database_not_found(self): + session_ctx = MagicMock() + session_ctx.__enter__.side_effect = db_module.GraphDatabaseQueryException( + message="Database does not exist", + code="Neo.ClientError.Database.DatabaseNotFound", + ) + + with patch( + "api.attack_paths.database.get_session", + return_value=session_ctx, + ): + assert db_module.drop_subgraph("db-tenant-gone", "provider-123") == 0 + + def test_raises_on_other_errors(self): + session_ctx = MagicMock() + session_ctx.__enter__.side_effect = db_module.GraphDatabaseQueryException( + message="Connection refused", + code="Neo.TransientError.General.UnknownError", + ) + + with patch( + "api.attack_paths.database.get_session", + return_value=session_ctx, + ): + with pytest.raises(db_module.GraphDatabaseQueryException): + db_module.drop_subgraph("db-tenant-abc", "provider-123")