perf(api): optimize attack paths graph cleanup (#11755)

This commit is contained in:
Josema Camacho
2026-07-01 16:28:59 +02:00
committed by GitHub
parent 48db27481d
commit 72cf2a65a6
5 changed files with 210 additions and 195 deletions
+4
View File
@@ -10,6 +10,10 @@ All notable changes to the **Prowler API** are documented in this file.
- Attack Paths: Scan task now checks the ingest Neo4j database and configured graph sink before starting graph ingestion [(#11743)](https://github.com/prowler-cloud/prowler/pull/11743) - Attack Paths: Scan task now checks the ingest Neo4j database and configured graph sink before starting graph ingestion [(#11743)](https://github.com/prowler-cloud/prowler/pull/11743)
- Disable PowerShell telemetry in the API container image [(#11746)](https://github.com/prowler-cloud/prowler/pull/11746) - Disable PowerShell telemetry in the API container image [(#11746)](https://github.com/prowler-cloud/prowler/pull/11746)
### 🐞 Fixed
- Attack Paths: Provider graph cleanup now deletes Neo4j and Neptune relationships in directed batches before deleting nodes [(#11755)](https://github.com/prowler-cloud/prowler/pull/11755)
--- ---
## [1.32.2] (Prowler UNRELEASED) ## [1.32.2] (Prowler UNRELEASED)
@@ -0,0 +1,78 @@
"""Shared batched deletion helpers for sink backends."""
import logging
import time
from typing import Any
RELATIONSHIP_DELETE_QUERY_TEMPLATES = {
"outgoing relationship": """
MATCH (n:`{provider_label}`)-[r]->()
WITH r LIMIT $batch_size
DELETE r
RETURN COUNT(r) AS deleted_rels_count
""",
"incoming relationship": """
MATCH (n:`{provider_label}`)<-[r]-()
WITH r LIMIT $batch_size
DELETE r
RETURN COUNT(r) AS deleted_rels_count
""",
}
NODE_DELETE_QUERY_TEMPLATE = """
MATCH (n:{provider_resource_label}:`{provider_label}`)
WITH n LIMIT $batch_size
DELETE n
RETURN COUNT(n) AS deleted_nodes_count
"""
def delete_batches(
*,
session: Any,
logger: logging.Logger,
log_target: str,
provider_id: str,
query: str,
phase: str,
count_key: str,
total_key: str,
deleted_key: str,
initial_total: int,
batch_size: int,
drop_t0: float,
) -> tuple[int, int]:
deleted_total = initial_total
batches = 0
while True:
logger.info(
"Deleting %s batch from %s "
"(provider=%s, batch=%s, total_%s=%s, elapsed=%.3fs)",
phase,
log_target,
provider_id,
batches + 1,
total_key,
deleted_total,
time.perf_counter() - drop_t0,
)
record = session.run(query, {"batch_size": batch_size}).single()
deleted = (record[count_key] if record else 0) or 0
if deleted == 0:
return deleted_total, batches
batches += 1
deleted_total += deleted
logger.info(
"Deleted %s batch from %s "
"(provider=%s, batch=%s, %s=%s, total_%s=%s, elapsed=%.3fs)",
phase,
log_target,
provider_id,
batches,
deleted_key,
deleted,
total_key,
deleted_total,
time.perf_counter() - drop_t0,
)
+42 -79
View File
@@ -17,6 +17,11 @@ import neo4j
import neo4j.exceptions import neo4j.exceptions
from api.attack_paths.retryable_session import RetryableSession from api.attack_paths.retryable_session import RetryableSession
from api.attack_paths.sink.base import SinkDatabase from api.attack_paths.sink.base import SinkDatabase
from api.attack_paths.sink.drop import (
NODE_DELETE_QUERY_TEMPLATE,
RELATIONSHIP_DELETE_QUERY_TEMPLATES,
delete_batches,
)
from config.env import env from config.env import env
from django.conf import settings from django.conf import settings
@@ -204,10 +209,8 @@ class Neo4jSink(SinkDatabase):
) )
provider_label = get_provider_label(provider_id) provider_label = get_provider_label(provider_id)
deleted_nodes = 0 deleted_nodes = deleted_relationships = 0
deleted_relationships = 0 relationship_batches = node_batches = 0
relationship_batches = 0
node_batches = 0
drop_t0 = time.perf_counter() drop_t0 = time.perf_counter()
logger.info( logger.info(
@@ -232,83 +235,43 @@ class Neo4jSink(SinkDatabase):
database, database,
provider_id, provider_id,
) )
# Phase 1: delete relationships incident to provider nodes in log_target = f"Neo4j sink database {database}"
# batches. The undirected pattern matches an edge between two for (
# provider nodes from both ends, so `DISTINCT r` dedupes it to phase,
# delete a full batch of unique relationships each round. query_template,
deleted_count = 1 ) in RELATIONSHIP_DELETE_QUERY_TEMPLATES.items():
while deleted_count > 0: deleted_relationships, phase_batches = delete_batches(
next_batch = relationship_batches + 1 session=session,
logger.info( logger=logger,
"Deleting relationship batch from Neo4j sink database %s " log_target=log_target,
"(provider=%s, batch=%s, total_rels=%s, elapsed=%.3fs)", provider_id=provider_id,
database, query=query_template.format(provider_label=provider_label),
provider_id, phase=phase,
next_batch, count_key="deleted_rels_count",
deleted_relationships, total_key="rels",
time.perf_counter() - drop_t0, deleted_key="deleted_rels",
) initial_total=deleted_relationships,
result = session.run( batch_size=BATCH_SIZE,
f""" drop_t0=drop_t0,
MATCH (:`{provider_label}`)-[r]-()
WITH DISTINCT r LIMIT $batch_size
DELETE r
RETURN COUNT(r) AS deleted_rels_count
""",
{"batch_size": BATCH_SIZE},
)
deleted_count = result.single().get("deleted_rels_count", 0)
if deleted_count > 0:
relationship_batches += 1
deleted_relationships += deleted_count
logger.info(
"Deleted relationship batch from Neo4j sink database %s "
"(provider=%s, batch=%s, deleted_rels=%s, "
"total_rels=%s, elapsed=%.3fs)",
database,
provider_id,
relationship_batches,
deleted_count,
deleted_relationships,
time.perf_counter() - drop_t0,
) )
relationship_batches += phase_batches
# Phase 2: delete the now relationship-free nodes in batches. deleted_nodes, node_batches = delete_batches(
deleted_count = 1 session=session,
while deleted_count > 0: logger=logger,
next_batch = node_batches + 1 log_target=log_target,
logger.info( provider_id=provider_id,
"Deleting node batch from Neo4j sink database %s " query=NODE_DELETE_QUERY_TEMPLATE.format(
"(provider=%s, batch=%s, total_nodes=%s, elapsed=%.3fs)", provider_label=provider_label,
database, provider_resource_label=PROVIDER_RESOURCE_LABEL,
provider_id, ),
next_batch, phase="node",
deleted_nodes, count_key="deleted_nodes_count",
time.perf_counter() - drop_t0, total_key="nodes",
) deleted_key="deleted_nodes",
result = session.run( initial_total=0,
f""" batch_size=BATCH_SIZE,
MATCH (n:{PROVIDER_RESOURCE_LABEL}:`{provider_label}`) drop_t0=drop_t0,
WITH n LIMIT $batch_size
DELETE n
RETURN COUNT(n) AS deleted_nodes_count
""",
{"batch_size": BATCH_SIZE},
)
deleted_count = result.single().get("deleted_nodes_count", 0)
if deleted_count > 0:
node_batches += 1
deleted_nodes += deleted_count
logger.info(
"Deleted node batch from Neo4j sink database %s "
"(provider=%s, batch=%s, deleted_nodes=%s, "
"total_nodes=%s, elapsed=%.3fs)",
database,
provider_id,
node_batches,
deleted_count,
deleted_nodes,
time.perf_counter() - drop_t0,
) )
except GraphDatabaseQueryException as exc: except GraphDatabaseQueryException as exc:
@@ -27,6 +27,11 @@ import neo4j
import neo4j.exceptions import neo4j.exceptions
from api.attack_paths.retryable_session import RetryableSession from api.attack_paths.retryable_session import RetryableSession
from api.attack_paths.sink.base import SinkDatabase from api.attack_paths.sink.base import SinkDatabase
from api.attack_paths.sink.drop import (
NODE_DELETE_QUERY_TEMPLATE,
RELATIONSHIP_DELETE_QUERY_TEMPLATES,
delete_batches,
)
from botocore.auth import SigV4Auth from botocore.auth import SigV4Auth
from botocore.awsrequest import AWSRequest from botocore.awsrequest import AWSRequest
from botocore.session import Session as BotoSession from botocore.session import Session as BotoSession
@@ -296,77 +301,39 @@ class NeptuneSink(SinkDatabase):
"Opened Neptune writer session for provider graph drop (provider=%s)", "Opened Neptune writer session for provider graph drop (provider=%s)",
provider_id, provider_id,
) )
while True: for phase, query_template in RELATIONSHIP_DELETE_QUERY_TEMPLATES.items():
next_batch = relationship_batches + 1 deleted_relationships, phase_batches = delete_batches(
logger.info( session=session,
"Deleting relationship batch from Neptune sink " logger=logger,
"(provider=%s, batch=%s, total_rels=%s, elapsed=%.3fs)", log_target="Neptune sink",
provider_id, provider_id=provider_id,
next_batch, query=query_template.format(provider_label=provider_label),
deleted_relationships, phase=phase,
time.perf_counter() - drop_t0, count_key="deleted_rels_count",
) total_key="rels",
result = session.run( deleted_key="deleted_rels",
f""" initial_total=deleted_relationships,
MATCH (:`{provider_label}`)-[r]-() batch_size=BATCH_SIZE,
WITH DISTINCT r LIMIT $batch_size drop_t0=drop_t0,
DELETE r
RETURN COUNT(r) AS deleted_rels_count
""",
{"batch_size": BATCH_SIZE},
)
record = result.single()
deleted_rels = (record["deleted_rels_count"] if record else 0) or 0
if deleted_rels == 0:
break
relationship_batches += 1
deleted_relationships += deleted_rels
logger.info(
"Deleted relationship batch from Neptune sink "
"(provider=%s, batch=%s, deleted_rels=%s, total_rels=%s, "
"elapsed=%.3fs)",
provider_id,
relationship_batches,
deleted_rels,
deleted_relationships,
time.perf_counter() - drop_t0,
) )
relationship_batches += phase_batches
deleted_nodes = 0 deleted_nodes, node_batches = delete_batches(
while True: session=session,
next_batch = node_batches + 1 logger=logger,
logger.info( log_target="Neptune sink",
"Deleting node batch from Neptune sink " provider_id=provider_id,
"(provider=%s, batch=%s, total_nodes=%s, elapsed=%.3fs)", query=NODE_DELETE_QUERY_TEMPLATE.format(
provider_id, provider_label=provider_label,
next_batch, provider_resource_label=PROVIDER_RESOURCE_LABEL,
deleted_nodes, ),
time.perf_counter() - drop_t0, phase="node",
) count_key="deleted_nodes_count",
result = session.run( total_key="nodes",
f""" deleted_key="deleted_nodes",
MATCH (n:`{PROVIDER_RESOURCE_LABEL}`:`{provider_label}`) initial_total=0,
WITH n LIMIT $batch_size batch_size=BATCH_SIZE,
DELETE n drop_t0=drop_t0,
RETURN COUNT(n) AS deleted_nodes_count
""",
{"batch_size": BATCH_SIZE},
)
record = result.single()
deleted = (record["deleted_nodes_count"] if record else 0) or 0
if deleted == 0:
break
node_batches += 1
deleted_nodes += deleted
logger.info(
"Deleted node batch from Neptune sink "
"(provider=%s, batch=%s, deleted_nodes=%s, total_nodes=%s, "
"elapsed=%.3fs)",
provider_id,
node_batches,
deleted,
deleted_nodes,
time.perf_counter() - drop_t0,
) )
logger.info( logger.info(
+48 -45
View File
@@ -186,6 +186,25 @@ def _session_ctx(session: MagicMock) -> MagicMock:
return ctx return ctx
def _count_result(key: str, count: int) -> MagicMock:
return MagicMock(single=MagicMock(return_value={key: count}))
def _directed_drop_results(
outgoing_rels: int,
incoming_rels: int,
nodes: int,
) -> list[MagicMock]:
return [
_count_result("deleted_rels_count", outgoing_rels),
_count_result("deleted_rels_count", 0),
_count_result("deleted_rels_count", incoming_rels),
_count_result("deleted_rels_count", 0),
_count_result("deleted_nodes_count", nodes),
_count_result("deleted_nodes_count", 0),
]
class TestNeo4jSinkSyncWrites: class TestNeo4jSinkSyncWrites:
def test_ensure_sync_indexes_runs_create_index_idempotent(self): def test_ensure_sync_indexes_runs_create_index_idempotent(self):
from api.attack_paths.sink.neo4j import Neo4jSink from api.attack_paths.sink.neo4j import Neo4jSink
@@ -310,65 +329,48 @@ class TestNeptuneSinkSyncWrites:
class TestNeptuneSinkDropSubgraph: class TestNeptuneSinkDropSubgraph:
def test_drop_subgraph_deletes_rels_before_nodes_in_bounded_batches(self): def test_drop_subgraph_deletes_directed_rels_before_nodes_in_bounded_batches(self):
from api.attack_paths.sink.neptune import NeptuneSink from api.attack_paths.sink.neptune import NeptuneSink
sink = NeptuneSink() sink = NeptuneSink()
session = MagicMock() session = MagicMock()
session.run.side_effect = _directed_drop_results(
rel_record_first = MagicMock() outgoing_rels=50,
rel_record_first.__getitem__ = lambda _self, key: 50 incoming_rels=30,
rel_record_drain = MagicMock() nodes=10,
rel_record_drain.__getitem__ = lambda _self, key: 0 )
node_record_first = MagicMock()
node_record_first.__getitem__ = lambda _self, key: 10
node_record_drain = MagicMock()
node_record_drain.__getitem__ = lambda _self, key: 0
run_results = [
MagicMock(single=MagicMock(return_value=rel_record_first)),
MagicMock(single=MagicMock(return_value=rel_record_drain)),
MagicMock(single=MagicMock(return_value=node_record_first)),
MagicMock(single=MagicMock(return_value=node_record_drain)),
]
session.run.side_effect = run_results
with patch.object(sink, "get_session", return_value=_session_ctx(session)): with patch.object(sink, "get_session", return_value=_session_ctx(session)):
deleted = sink.drop_subgraph("ignored", "provider-1") deleted = sink.drop_subgraph("ignored", "provider-1")
assert deleted == 10 assert deleted == 10
first_query = session.run.call_args_list[0].args[0] assert session.run.call_count == 6
assert "DELETE r" in first_query queries = [call.args[0] for call in session.run.call_args_list]
assert "DETACH DELETE" not in first_query
# DISTINCT avoids double-counting relationships matched from both ends. assert ")-[r]->()" in queries[0]
assert "DISTINCT r" in first_query assert ")<-[r]-()" in queries[2]
third_query = session.run.call_args_list[2].args[0] assert "DELETE n" in queries[4]
assert "DELETE n" in third_query assert all("DETACH DELETE" not in query for query in queries)
assert all("DISTINCT r" not in query for query in queries)
first_node = next(i for i, q in enumerate(queries) if "DELETE n" in q)
last_rel = max(i for i, q in enumerate(queries) if "DELETE r" in q)
assert last_rel < first_node
class TestNeo4jSinkDropSubgraph: class TestNeo4jSinkDropSubgraph:
"""Neo4j drop deletes relationships then nodes in batches (no ``DETACH DELETE``).""" """Neo4j drop deletes relationships then nodes in batches (no ``DETACH DELETE``)."""
def test_drop_subgraph_deletes_rels_before_nodes_in_bounded_batches(self): def test_drop_subgraph_deletes_directed_rels_before_nodes_in_bounded_batches(self):
from api.attack_paths.sink.neo4j import Neo4jSink from api.attack_paths.sink.neo4j import Neo4jSink
sink = Neo4jSink() sink = Neo4jSink()
session = MagicMock() session = MagicMock()
session.run.side_effect = _directed_drop_results(
rel_first = MagicMock() outgoing_rels=50,
rel_first.get = lambda key, default=0: 50 incoming_rels=30,
rel_drain = MagicMock() nodes=10,
rel_drain.get = lambda key, default=0: 0 )
node_first = MagicMock()
node_first.get = lambda key, default=0: 10
node_drain = MagicMock()
node_drain.get = lambda key, default=0: 0
session.run.side_effect = [
MagicMock(single=MagicMock(return_value=rel_first)),
MagicMock(single=MagicMock(return_value=rel_drain)),
MagicMock(single=MagicMock(return_value=node_first)),
MagicMock(single=MagicMock(return_value=node_drain)),
]
provider_id = "00000000-0000-0000-0000-000000000abc" provider_id = "00000000-0000-0000-0000-000000000abc"
with patch.object(sink, "get_session", return_value=_session_ctx(session)): with patch.object(sink, "get_session", return_value=_session_ctx(session)):
@@ -376,19 +378,20 @@ class TestNeo4jSinkDropSubgraph:
# Only phase-2 node counts contribute to the return value. # Only phase-2 node counts contribute to the return value.
assert deleted == 10 assert deleted == 10
assert session.run.call_count == 4 assert session.run.call_count == 6
queries = [call.args[0] for call in session.run.call_args_list] queries = [call.args[0] for call in session.run.call_args_list]
# Regression guard: the memory blow-up was caused by DETACH DELETE. # Regression guard: the memory blow-up was caused by DETACH DELETE.
assert all("DETACH DELETE" not in query for query in queries) assert all("DETACH DELETE" not in query for query in queries)
assert all("DISTINCT r" not in query for query in queries)
first_query = queries[0] first_query = queries[0]
assert "DELETE r" in first_query assert "DELETE r" in first_query
# DISTINCT avoids double-counting relationships matched from both ends. assert ")-[r]->()" in first_query
assert "DISTINCT r" in first_query
assert ":`_Provider_00000000000000000000000000000abc`" in first_query assert ":`_Provider_00000000000000000000000000000abc`" in first_query
assert "DELETE n" in queries[2] assert ")<-[r]-()" in queries[2]
assert "DELETE n" in queries[4]
# Relationships must be fully drained before nodes are deleted. # Relationships must be fully drained before nodes are deleted.
first_node = next(i for i, q in enumerate(queries) if "DELETE n" in q) first_node = next(i for i, q in enumerate(queries) if "DELETE n" in q)