feat(findings): Optimize findings endpoint (#7019)

This commit is contained in:
Víctor Fernández Poyatos
2025-02-25 12:41:47 +01:00
committed by GitHub
parent 7e3688fdd0
commit dbffed8f1f
11 changed files with 362 additions and 62 deletions

View File

@@ -10,6 +10,9 @@ All notable changes to the **Prowler API** are documented in this file.
- Social login integration with Google and GitHub [(#6906)](https://github.com/prowler-cloud/prowler/pull/6906)
- Configurable Sentry integration [(#6874)](https://github.com/prowler-cloud/prowler/pull/6874)
### Changed
- Optimized `GET /findings` endpoint to improve response time and size [(#7019)](https://github.com/prowler-cloud/prowler/pull/7019)
---
## [v1.4.0] (Prowler v5.3.0) - 2025-02-10

View File

@@ -447,9 +447,7 @@ class FindingFilter(FilterSet):
)
return (
queryset.filter(id__gte=start)
.filter(id__lt=end)
.filter(scan__id=value_uuid)
queryset.filter(id__gte=start).filter(id__lt=end).filter(scan_id=value_uuid)
)
def filter_scan_id_in(self, queryset, name, value):
@@ -474,31 +472,32 @@ class FindingFilter(FilterSet):
]
)
if start == end:
return queryset.filter(id__gte=start).filter(scan__id__in=uuid_list)
return queryset.filter(id__gte=start).filter(scan_id__in=uuid_list)
else:
return (
queryset.filter(id__gte=start)
.filter(id__lt=end)
.filter(scan__id__in=uuid_list)
.filter(scan_id__in=uuid_list)
)
def filter_inserted_at(self, queryset, name, value):
value = self.maybe_date_to_datetime(value)
start = uuid7_start(datetime_to_uuid7(value))
datetime_value = self.maybe_date_to_datetime(value)
start = uuid7_start(datetime_to_uuid7(datetime_value))
end = uuid7_start(datetime_to_uuid7(datetime_value + timedelta(days=1)))
return queryset.filter(id__gte=start).filter(inserted_at__date=value)
return queryset.filter(id__gte=start, id__lt=end)
def filter_inserted_at_gte(self, queryset, name, value):
value = self.maybe_date_to_datetime(value)
start = uuid7_start(datetime_to_uuid7(value))
datetime_value = self.maybe_date_to_datetime(value)
start = uuid7_start(datetime_to_uuid7(datetime_value))
return queryset.filter(id__gte=start).filter(inserted_at__gte=value)
return queryset.filter(id__gte=start)
def filter_inserted_at_lte(self, queryset, name, value):
value = self.maybe_date_to_datetime(value)
end = uuid7_start(datetime_to_uuid7(value))
datetime_value = self.maybe_date_to_datetime(value)
end = uuid7_start(datetime_to_uuid7(datetime_value + timedelta(days=1)))
return queryset.filter(id__lte=end).filter(inserted_at__lte=value)
return queryset.filter(id__lt=end)
def filter_resource_tag(self, queryset, name, value):
overall_query = Q()

View File

@@ -0,0 +1,109 @@
from functools import partial
from django.db import connection, migrations
def create_index_on_partitions(
    apps, schema_editor, parent_table: str, index_name: str, index_details: str
):
    """
    Create an index on every existing partition of ``parent_table``.

    The child tables are discovered through ``pg_inherits`` and one
    ``CREATE INDEX CONCURRENTLY IF NOT EXISTS`` statement is executed per
    partition. Each index is named ``<partition>_<index_name>``, with any ``.``
    in the partition name replaced by ``_``.

    Args:
        apps: Historical app registry handed over by ``RunPython`` (unused).
        schema_editor: Schema editor of the running migration; both the
            partition lookup and the DDL go through it.
        parent_table: Name of the partitioned parent table.
        index_name: Suffix appended to the partition name to build the index name.
        index_details: SQL fragment placed after ``ON <partition>``, e.g. the
            column list and an optional ``where`` clause.
    """
    # Use the schema editor's own connection instead of the global default
    # `django.db.connection`, so the partition lookup always targets the same
    # database the DDL below is executed against (and matches
    # `drop_index_on_partitions`).
    with schema_editor.connection.cursor() as cursor:
        cursor.execute(
            """
            SELECT inhrelid::regclass::text
            FROM pg_inherits
            WHERE inhparent = %s::regclass;
            """,
            [parent_table],
        )
        partitions = [row[0] for row in cursor.fetchall()]

    # Iterate over partitions and create index concurrently.
    # Note: PostgreSQL does not allow CONCURRENTLY inside a transaction,
    # so we need atomic = False for this migration.
    # NOTE(review): an interrupted concurrent build can leave an invalid index
    # behind, which IF NOT EXISTS would then skip on re-run -- confirm whether
    # that needs handling operationally.
    for partition in partitions:
        partition_index = f"{partition.replace('.', '_')}_{index_name}"
        sql = (
            f"CREATE INDEX CONCURRENTLY IF NOT EXISTS {partition_index} ON {partition} "
            f"{index_details};"
        )
        schema_editor.execute(sql)
def drop_index_on_partitions(apps, schema_editor, parent_table: str, index_name: str):
    """
    Drop the ``<partition>_<index_name>`` index from every partition of ``parent_table``.

    Partitions are read from ``pg_inherits`` through the schema editor's
    connection; one ``DROP INDEX CONCURRENTLY IF EXISTS`` statement is then
    executed per partition.

    Args:
        apps: Historical app registry handed over by ``RunPython`` (unused).
        schema_editor: Schema editor of the running migration.
        parent_table: Name of the partitioned parent table.
        index_name: Suffix that was appended to the partition name when the
            index was created.
    """
    partition_lookup = """
        SELECT inhrelid::regclass::text
        FROM pg_inherits
        WHERE inhparent = %s::regclass;
    """
    with schema_editor.connection.cursor() as cursor:
        cursor.execute(partition_lookup, [parent_table])
        # Index names mirror the creation side: dots in the partition name
        # become underscores, then the shared suffix is appended.
        index_names = [
            f"{row[0].replace('.', '_')}_{index_name}" for row in cursor.fetchall()
        ]

    # Drop each per-partition index concurrently.
    for partition_index in index_names:
        schema_editor.execute(f"DROP INDEX CONCURRENTLY IF EXISTS {partition_index};")
def _partition_index_operation(index_name: str, index_details: str):
    """Build the reversible ``RunPython`` step for one per-partition index on ``findings``."""
    return migrations.RunPython(
        partial(
            create_index_on_partitions,
            parent_table="findings",
            index_name=index_name,
            index_details=index_details,
        ),
        reverse_code=partial(
            drop_index_on_partitions,
            parent_table="findings",
            index_name=index_name,
        ),
    )


class Migration(migrations.Migration):
    # The index statements use CONCURRENTLY, which PostgreSQL does not allow
    # inside a transaction, so this migration must not run atomically.
    atomic = False

    dependencies = [
        ("api", "0009_increase_provider_uid_maximum_length"),
    ]

    # One forward/backward pair per index, applied to every partition of the
    # `findings` table.
    operations = [
        _partition_index_operation(
            index_name="findings_tenant_and_id_idx",
            index_details="(tenant_id, id)",
        ),
        _partition_index_operation(
            index_name="find_tenant_scan_idx",
            index_details="(tenant_id, scan_id)",
        ),
        _partition_index_operation(
            index_name="find_tenant_scan_id_idx",
            index_details="(tenant_id, scan_id, id)",
        ),
        # Partial index restricted to rows whose delta is 'new'.
        _partition_index_operation(
            index_name="find_delta_new_idx",
            index_details="(tenant_id, id) where delta = 'new'",
        ),
    ]

View File

@@ -0,0 +1,49 @@
from django.db import migrations, models
class Migration(migrations.Migration):
    # Adds the indexes below to the `finding`, `resourcetagmapping` and
    # `resource` models.
    # NOTE(review): the four `finding` index names match the suffixes used by the
    # per-partition index migration this one depends on -- presumably this step
    # brings the parent table / model state in line with them; confirm.
    dependencies = [
        ("api", "0010_findings_performance_indexes_partitions"),
    ]

    operations = [
        # --- finding ---
        migrations.AddIndex(
            model_name="finding",
            index=models.Index(
                name="findings_tenant_and_id_idx",
                fields=["tenant_id", "id"],
            ),
        ),
        migrations.AddIndex(
            model_name="finding",
            index=models.Index(
                name="find_tenant_scan_idx",
                fields=["tenant_id", "scan_id"],
            ),
        ),
        migrations.AddIndex(
            model_name="finding",
            index=models.Index(
                name="find_tenant_scan_id_idx",
                fields=["tenant_id", "scan_id", "id"],
            ),
        ),
        # Partial index: only rows with delta == "new".
        migrations.AddIndex(
            model_name="finding",
            index=models.Index(
                name="find_delta_new_idx",
                fields=["tenant_id", "id"],
                condition=models.Q(delta="new"),
            ),
        ),
        # --- resourcetagmapping ---
        migrations.AddIndex(
            model_name="resourcetagmapping",
            index=models.Index(
                name="resource_tag_tenant_idx",
                fields=["tenant_id", "resource_id"],
            ),
        ),
        # --- resource ---
        migrations.AddIndex(
            model_name="resource",
            index=models.Index(
                name="resource_tenant_metadata_idx",
                fields=["tenant_id", "service", "region", "type"],
            ),
        ),
    ]

View File

@@ -552,6 +552,10 @@ class Resource(RowLevelSecurityProtectedModel):
fields=["uid", "region", "service", "name"],
name="resource_uid_reg_serv_name_idx",
),
models.Index(
fields=["tenant_id", "service", "region", "type"],
name="resource_tenant_metadata_idx",
),
GinIndex(fields=["text_search"], name="gin_resources_search_idx"),
]
@@ -599,6 +603,12 @@ class ResourceTagMapping(RowLevelSecurityProtectedModel):
),
]
indexes = [
models.Index(
fields=["tenant_id", "resource_id"], name="resource_tag_tenant_idx"
),
]
class Finding(PostgresPartitionedModel, RowLevelSecurityProtectedModel):
"""
@@ -697,7 +707,17 @@ class Finding(PostgresPartitionedModel, RowLevelSecurityProtectedModel):
],
name="findings_filter_idx",
),
models.Index(fields=["tenant_id", "id"], name="findings_tenant_and_id_idx"),
GinIndex(fields=["text_search"], name="gin_findings_search_idx"),
models.Index(fields=["tenant_id", "scan_id"], name="find_tenant_scan_idx"),
models.Index(
fields=["tenant_id", "scan_id", "id"], name="find_tenant_scan_id_idx"
),
models.Index(
fields=["tenant_id", "id"],
condition=Q(delta="new"),
name="find_delta_new_idx",
),
]
class JSONAPIMeta:

View File

@@ -2435,7 +2435,7 @@ class TestFindingViewSet:
[
("resources", ["resources"]),
("scan", ["scans"]),
("resources.provider,scan", ["resources", "scans", "providers"]),
("resources,scan.provider", ["resources", "scans", "providers"]),
],
)
def test_findings_list_include(
@@ -2491,8 +2491,8 @@ class TestFindingViewSet:
("search", "orange juice", 1),
# full text search on resource
("search", "ec2", 2),
# full text search on finding tags
("search", "value2", 2),
# full text search on finding tags (disabled for now)
# ("search", "value2", 2),
# Temporarily disabled until we implement tag filtering in the UI
# ("resource_tag_key", "key", 2),
# ("resource_tag_key__in", "key,key2", 2),

View File

@@ -106,7 +106,7 @@ def uuid7_end(uuid_obj: UUID, offset_months: int = 1) -> UUID:
Args:
uuid_obj: A UUIDv7 object.
offset_days: Number of months to offset from the given UUID's date. Defaults to 1 to handle if
offset_months: Number of months to offset from the given UUID's date. Defaults to 1 to handle if
partitions are not being used, if so the value will be the one set at FINDINGS_TABLE_PARTITION_MONTHS.
Returns:

View File

@@ -745,6 +745,43 @@ class ProviderSerializer(RLSSerializer):
}
class ProviderIncludeSerializer(RLSSerializer):
    """
    Serializer for the Provider model.
    """

    # Referenced by ScanIncludeSerializer.included_serializers to render the
    # provider of an included scan.
    provider = ProviderEnumSerializerField()
    # Computed by `get_connection` below.
    connection = serializers.SerializerMethodField(read_only=True)

    class Meta:
        model = Provider
        fields = [
            "id",
            "inserted_at",
            "updated_at",
            "provider",
            "uid",
            "alias",
            "connection",
            # "scanner_args",
        ]

    @extend_schema_field(
        {
            "type": "object",
            "properties": {
                "connected": {"type": "boolean"},
                "last_checked_at": {"type": "string", "format": "date-time"},
            },
        }
    )
    def get_connection(self, obj):
        """Return the provider's connection status and when it was last checked."""
        connected = obj.connected
        last_checked_at = obj.connection_last_checked_at
        return {"connected": connected, "last_checked_at": last_checked_at}
class ProviderCreateSerializer(RLSSerializer, BaseWriteSerializer):
class Meta:
model = Provider
@@ -807,6 +844,35 @@ class ScanSerializer(RLSSerializer):
]
class ScanIncludeSerializer(RLSSerializer):
    # Scan representation used by FindingSerializer.included_serializers for
    # the "scan" include.

    # Both fields are read-only.
    trigger = serializers.ChoiceField(
        choices=Scan.TriggerChoices.choices,
        read_only=True,
    )
    state = StateEnumSerializerField(read_only=True)

    class Meta:
        model = Scan
        fields = [
            "id",
            "name",
            "trigger",
            "state",
            "unique_resource_count",
            "progress",
            # "scanner_args",
            "duration",
            "inserted_at",
            "started_at",
            "completed_at",
            "scheduled_at",
            "provider",
        ]

    # Dotted path to the serializer used for the nested "provider" include.
    included_serializers = {
        "provider": "api.v1.serializers.ProviderIncludeSerializer",
    }
class ScanCreateSerializer(RLSSerializer, BaseWriteSerializer):
class Meta:
model = Scan
@@ -938,6 +1004,51 @@ class ResourceSerializer(RLSSerializer):
return fields
class ResourceIncludeSerializer(RLSSerializer):
    """
    Serializer for the Resource model.
    """

    # Referenced by FindingSerializer.included_serializers for the "resources"
    # include.
    tags = serializers.SerializerMethodField()
    # Declared as `type_` and exposed as `type` by `get_fields`.
    type_ = serializers.CharField(read_only=True)

    class Meta:
        model = Resource
        fields = [
            "id",
            "inserted_at",
            "updated_at",
            "uid",
            "name",
            "region",
            "service",
            "type_",
            "tags",
        ]
        extra_kwargs = {
            "id": {"read_only": True},
            "inserted_at": {"read_only": True},
            "updated_at": {"read_only": True},
        }

    @extend_schema_field(
        {
            "type": "object",
            "description": "Tags associated with the resource",
            "example": {"env": "prod", "owner": "johndoe"},
        }
    )
    def get_tags(self, obj):
        """Return the resource's tags, looked up with the tenant id from the serializer context."""
        tenant_id = self.context.get("tenant_id")
        return obj.get_tags(tenant_id)

    def get_fields(self):
        """Expose the `type_` field under the name `type` (`type` is a Python built-in)."""
        fields = super().get_fields()
        fields["type"] = fields.pop("type_")
        return fields
class FindingSerializer(RLSSerializer):
"""
Serializer for the Finding model.
@@ -967,8 +1078,8 @@ class FindingSerializer(RLSSerializer):
]
included_serializers = {
"scan": ScanSerializer,
"resources": ResourceSerializer,
"scan": ScanIncludeSerializer,
"resources": ResourceIncludeSerializer,
}

View File

@@ -10,7 +10,7 @@ from django.conf import settings as django_settings
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.search import SearchQuery
from django.db import transaction
from django.db.models import Count, F, OuterRef, Prefetch, Q, Subquery, Sum
from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, Subquery, Sum
from django.db.models.functions import Coalesce
from django.urls import reverse
from django.utils.decorators import method_decorator
@@ -75,6 +75,7 @@ from api.models import (
ProviderGroupMembership,
ProviderSecret,
Resource,
ResourceFindingMapping,
Role,
RoleProviderGroupRelationship,
Scan,
@@ -89,7 +90,6 @@ from api.pagination import ComplianceOverviewPagination
from api.rbac.permissions import Permissions, get_providers, get_role
from api.rls import Tenant
from api.utils import CustomOAuth2Client, validate_invitation
from api.uuid_utils import datetime_to_uuid7
from api.v1.serializers import (
ComplianceOverviewFullSerializer,
ComplianceOverviewSerializer,
@@ -1401,17 +1401,10 @@ class ResourceViewSet(BaseRLSViewSet):
@method_decorator(CACHE_DECORATOR, name="list")
@method_decorator(CACHE_DECORATOR, name="retrieve")
class FindingViewSet(BaseRLSViewSet):
queryset = Finding.objects.all()
queryset = Finding.all_objects.all()
serializer_class = FindingSerializer
prefetch_for_includes = {
"__all__": [],
"resources": [
Prefetch("resources", queryset=Resource.objects.select_related("findings"))
],
"scan": [Prefetch("scan", queryset=Scan.objects.select_related("findings"))],
}
http_method_names = ["get"]
filterset_class = FindingFilter
http_method_names = ["get"]
ordering = ["-inserted_at"]
ordering_fields = [
"status",
@@ -1420,6 +1413,18 @@ class FindingViewSet(BaseRLSViewSet):
"inserted_at",
"updated_at",
]
prefetch_for_includes = {
"__all__": [],
"resources": [
Prefetch(
"resources",
queryset=Resource.all_objects.prefetch_related("tags", "findings"),
)
],
"scan": [
Prefetch("scan", queryset=Scan.all_objects.select_related("findings"))
],
}
# RBAC required permissions (implicit -> MANAGE_PROVIDERS enable unlimited visibility or check the visibility of
# the provider through the provider group)
required_permissions = []
@@ -1433,41 +1438,34 @@ class FindingViewSet(BaseRLSViewSet):
return super().get_serializer_class()
def get_queryset(self):
tenant_id = self.request.tenant_id
user_roles = get_role(self.request.user)
if user_roles.unlimited_visibility:
# User has unlimited visibility, return all scans
queryset = Finding.objects.filter(tenant_id=self.request.tenant_id)
# User has unlimited visibility, return all findings
queryset = Finding.all_objects.filter(tenant_id=tenant_id)
else:
# User lacks permission, filter providers based on provider groups associated with the role
queryset = Finding.objects.filter(
# User lacks permission, filter findings based on provider groups associated with the role
queryset = Finding.all_objects.filter(
scan__provider__in=get_providers(user_roles)
)
search_value = self.request.query_params.get("filter[search]", None)
if search_value:
# Django's ORM will build a LEFT JOIN and OUTER JOIN on any "through" tables, resulting in duplicates
# The duplicates then require a `distinct` query
search_query = SearchQuery(
search_value, config="simple", search_type="plain"
)
resource_match = Resource.all_objects.filter(
text_search=search_query,
id__in=ResourceFindingMapping.objects.filter(
resource_id=OuterRef("pk"),
tenant_id=tenant_id,
).values("resource_id"),
)
queryset = queryset.filter(
Q(impact_extended__contains=search_value)
| Q(status_extended__contains=search_value)
| Q(check_id=search_value)
| Q(check_id__icontains=search_value)
| Q(text_search=search_query)
| Q(resources__uid=search_value)
| Q(resources__name=search_value)
| Q(resources__region=search_value)
| Q(resources__service=search_value)
| Q(resources__type=search_value)
| Q(resources__uid__contains=search_value)
| Q(resources__name__contains=search_value)
| Q(resources__region__contains=search_value)
| Q(resources__service__contains=search_value)
| Q(resources__tags__text_search=search_query)
| Q(resources__text_search=search_query)
).distinct()
Q(text_search=search_query) | Q(Exists(resource_match))
)
return queryset
@@ -1477,10 +1475,22 @@ class FindingViewSet(BaseRLSViewSet):
return queryset
return super().filter_queryset(queryset)
def inserted_at_to_uuidv7(self, inserted_at):
if inserted_at is None:
return None
return datetime_to_uuid7(inserted_at)
def list(self, request, *args, **kwargs):
    # Two-step listing: paginate over the finding ids only, then load the full
    # rows (with their scan and resources) just for the ids on the current page.
    # Plain comments are used instead of a docstring so no view/schema
    # description text is introduced by this method.
    base_qs = self.filter_queryset(self.get_queryset())
    paginated_ids = self.paginate_queryset(base_qs.values_list("id", flat=True))

    if paginated_ids is None:
        # Pagination is not active: serialize the filtered queryset directly.
        serializer = self.get_serializer(base_qs, many=True)
        return Response(serializer.data)

    ids = list(paginated_ids)

    # Map each id to its position on the page once, instead of calling
    # `ids.index(...)` for every row (quadratic in the page size).
    # `setdefault` keeps the first occurrence, matching `list.index`, in case
    # the id page ever contains duplicates.
    position_by_id = {}
    for position, finding_id in enumerate(ids):
        position_by_id.setdefault(finding_id, position)

    findings = (
        Finding.all_objects.filter(tenant_id=self.request.tenant_id, id__in=ids)
        .select_related("scan")
        .prefetch_related("resources")
    )
    # The `id__in` lookup does not preserve order; re-sort in Python so the
    # page keeps the ordering produced by the filtered/ordered id query.
    findings = sorted(findings, key=lambda finding: position_by_id[finding.id])

    serializer = self.get_serializer(findings, many=True)
    return self.get_paginated_response(serializer.data)
@action(detail=False, methods=["get"], url_name="findings_services_regions")
def findings_services_regions(self, request):