feat(export): add API export system (#6878)

This commit is contained in:
Adrián Jesús Peña Rodríguez
2025-02-26 15:49:44 +01:00
committed by GitHub
parent c4528200b0
commit 669ec74e67
34 changed files with 1613 additions and 90 deletions

View File

@@ -8,6 +8,7 @@ All notable changes to the **Prowler API** are documented in this file.
### Added
- Social login integration with Google and GitHub [(#6906)](https://github.com/prowler-cloud/prowler/pull/6906)
- Add API scan report system, now all scans launched from the API will generate a compressed file with the report in OCSF, CSV and HTML formats [(#6878)](https://github.com/prowler-cloud/prowler/pull/6878).
- Configurable Sentry integration [(#6874)](https://github.com/prowler-cloud/prowler/pull/6874)
### Changed

View File

@@ -28,7 +28,7 @@ start_prod_server() {
start_worker() {
echo "Starting the worker..."
poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,deletion -E --max-tasks-per-child 1
poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,scan-reports,deletion -E --max-tasks-per-child 1
}
start_worker_beat() {

View File

@@ -7,7 +7,7 @@ from rest_framework_json_api.serializers import ValidationError
from api.db_utils import POSTGRES_TENANT_VAR, SET_CONFIG_QUERY
def set_tenant(func):
def set_tenant(func=None, *, keep_tenant=False):
"""
Decorator to set the tenant context for a Celery task based on the provided tenant_id.
@@ -40,20 +40,29 @@ def set_tenant(func):
# The tenant context will be set before the task logic executes.
"""
@wraps(func)
@transaction.atomic
def wrapper(*args, **kwargs):
try:
tenant_id = kwargs.pop("tenant_id")
except KeyError:
raise KeyError("This task requires the tenant_id")
try:
uuid.UUID(tenant_id)
except ValueError:
raise ValidationError("Tenant ID must be a valid UUID")
with connection.cursor() as cursor:
cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])
def decorator(func):
@wraps(func)
@transaction.atomic
def wrapper(*args, **kwargs):
try:
if not keep_tenant:
tenant_id = kwargs.pop("tenant_id")
else:
tenant_id = kwargs["tenant_id"]
except KeyError:
raise KeyError("This task requires the tenant_id")
try:
uuid.UUID(tenant_id)
except ValueError:
raise ValidationError("Tenant ID must be a valid UUID")
with connection.cursor() as cursor:
cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])
return func(*args, **kwargs)
return func(*args, **kwargs)
return wrapper
return wrapper
if func is None:
return decorator
else:
return decorator(func)

View File

@@ -0,0 +1,15 @@
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("api", "0011_findings_performance_indexes_parent"),
]
operations = [
migrations.AddField(
model_name="scan",
name="output_location",
field=models.CharField(blank=True, max_length=200, null=True),
),
]

View File

@@ -414,6 +414,7 @@ class Scan(RowLevelSecurityProtectedModel):
scheduler_task = models.ForeignKey(
PeriodicTask, on_delete=models.CASCADE, null=True, blank=True
)
output_location = models.CharField(blank=True, null=True, max_length=200)
# TODO: mutelist foreign key
class Meta(RowLevelSecurityProtectedModel.Meta):

View File

@@ -4105,6 +4105,43 @@ paths:
schema:
$ref: '#/components/schemas/ScanUpdateResponse'
description: ''
/api/v1/scans/{id}/report:
get:
operationId: scans_report_retrieve
description: Returns a ZIP file containing the requested report
summary: Download ZIP report
parameters:
- in: query
name: fields[scan-reports]
schema:
type: array
items:
type: string
enum:
- id
description: endpoint return only specific fields in the response on a per-type
basis by including a fields[TYPE] query parameter.
explode: false
- in: path
name: id
schema:
type: string
format: uuid
description: A UUID string identifying this scan.
required: true
tags:
- Scan
security:
- jwtAuth: []
responses:
'200':
description: Report obtained successfully
'202':
description: The task is in progress
'403':
description: There is a problem with credentials
'404':
description: The scan has no reports
/api/v1/schedules/daily:
post:
operationId: schedules_daily_create

View File

@@ -274,9 +274,10 @@ class TestValidateInvitation:
expired_time = datetime.now(timezone.utc) - timedelta(days=1)
invitation.expires_at = expired_time
with patch("api.utils.Invitation.objects.using") as mock_using, patch(
"api.utils.datetime"
) as mock_datetime:
with (
patch("api.utils.Invitation.objects.using") as mock_using,
patch("api.utils.datetime") as mock_datetime,
):
mock_db = mock_using.return_value
mock_db.get.return_value = invitation
mock_datetime.now.return_value = datetime.now(timezone.utc)

View File

@@ -1,9 +1,13 @@
import glob
import io
import json
import os
from datetime import datetime, timedelta, timezone
from unittest.mock import ANY, Mock, patch
import jwt
import pytest
from botocore.exceptions import NoCredentialsError
from conftest import API_JSON_CONTENT_TYPE, TEST_PASSWORD, TEST_USER
from django.conf import settings
from django.urls import reverse
@@ -20,6 +24,7 @@ from api.models import (
RoleProviderGroupRelationship,
Scan,
StateChoices,
Task,
User,
UserRoleRelationship,
)
@@ -2079,9 +2084,9 @@ class TestScanViewSet:
("started_at.gte", "2024-01-01", 3),
("started_at.lte", "2024-01-01", 0),
("trigger", Scan.TriggerChoices.MANUAL, 1),
("state", StateChoices.AVAILABLE, 2),
("state", StateChoices.AVAILABLE, 1),
("state", StateChoices.FAILED, 1),
("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 3),
("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 2),
("trigger", Scan.TriggerChoices.MANUAL, 1),
]
),
@@ -2156,6 +2161,159 @@ class TestScanViewSet:
response = authenticated_client.get(reverse("scan-list"), {"sort": "invalid"})
assert response.status_code == status.HTTP_400_BAD_REQUEST
def test_report_executing(self, authenticated_client, scans_fixture):
"""
When the scan is still executing (state == EXECUTING), the view should return
the task data with HTTP 202 and a Content-Location header.
"""
scan = scans_fixture[0]
scan.state = StateChoices.EXECUTING
scan.save()
task = Task.objects.create(tenant_id=scan.tenant_id)
dummy_task_data = {"id": str(task.id), "state": StateChoices.EXECUTING}
scan.task = task
scan.save()
with patch(
"api.v1.views.TaskSerializer",
return_value=type("DummySerializer", (), {"data": dummy_task_data}),
):
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_202_ACCEPTED
assert "Content-Location" in response
assert dummy_task_data["id"] in response["Content-Location"]
def test_report_celery_task_executing(self, authenticated_client, scans_fixture):
"""
When the scan is not executing but a related celery task exists and is running,
the view should return that task data with HTTP 202.
"""
scan = scans_fixture[0]
scan.state = StateChoices.COMPLETED
scan.output_location = "dummy"
scan.save()
dummy_task = Task.objects.create(tenant_id=scan.tenant_id)
dummy_task.id = "dummy-task-id"
dummy_task_data = {"id": dummy_task.id, "state": StateChoices.EXECUTING}
with patch("api.v1.views.Task.objects.get", return_value=dummy_task), patch(
"api.v1.views.TaskSerializer",
return_value=type("DummySerializer", (), {"data": dummy_task_data}),
):
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_202_ACCEPTED
assert "Content-Location" in response
assert dummy_task_data["id"] in response["Content-Location"]
def test_report_no_output_location(self, authenticated_client, scans_fixture):
"""
If the scan does not have an output_location, the view should return a 404.
"""
scan = scans_fixture[0]
scan.state = StateChoices.COMPLETED
scan.output_location = ""
scan.save()
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_404_NOT_FOUND
assert response.json()["errors"]["detail"] == "The scan has no reports."
def test_report_s3_no_credentials(
self, authenticated_client, scans_fixture, monkeypatch
):
"""
When output_location is an S3 URL and get_s3_client() raises a credentials exception,
the view should return HTTP 403 with the proper error message.
"""
scan = scans_fixture[0]
bucket = "test-bucket"
key = "report.zip"
scan.output_location = f"s3://{bucket}/{key}"
scan.state = StateChoices.COMPLETED
scan.save()
def fake_get_s3_client():
raise NoCredentialsError()
monkeypatch.setattr("api.v1.views.get_s3_client", fake_get_s3_client)
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_403_FORBIDDEN
assert (
response.json()["errors"]["detail"]
== "There is a problem with credentials."
)
def test_report_s3_success(self, authenticated_client, scans_fixture, monkeypatch):
"""
When output_location is an S3 URL and the S3 client returns the file successfully,
the view should return the ZIP file with HTTP 200 and proper headers.
"""
scan = scans_fixture[0]
bucket = "test-bucket"
key = "report.zip"
scan.output_location = f"s3://{bucket}/{key}"
scan.state = StateChoices.COMPLETED
scan.save()
monkeypatch.setattr(
"api.v1.views.env", type("env", (), {"str": lambda self, key: bucket})()
)
class FakeS3Client:
def get_object(self, Bucket, Key):
assert Bucket == bucket
assert Key == key
return {"Body": io.BytesIO(b"s3 zip content")}
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == 200
expected_filename = os.path.basename("report.zip")
content_disposition = response.get("Content-Disposition")
assert content_disposition.startswith('attachment; filename="')
assert f'filename="{expected_filename}"' in content_disposition
assert response.content == b"s3 zip content"
def test_report_local_file(
self, authenticated_client, scans_fixture, tmp_path, monkeypatch
):
"""
When output_location is a local file path, the view should read the file from disk
and return it with proper headers.
"""
scan = scans_fixture[0]
file_content = b"local zip file content"
file_path = tmp_path / "report.zip"
file_path.write_bytes(file_content)
scan.output_location = str(file_path)
scan.state = StateChoices.COMPLETED
scan.save()
monkeypatch.setattr(
glob,
"glob",
lambda pattern: [str(file_path)] if pattern == str(file_path) else [],
)
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == 200
assert response.content == file_content
content_disposition = response.get("Content-Disposition")
assert content_disposition.startswith('attachment; filename="')
assert f'filename="{file_path.name}"' in content_disposition
@pytest.mark.django_db
class TestTaskViewSet:

View File

@@ -939,6 +939,14 @@ class ScanTaskSerializer(RLSSerializer):
]
class ScanReportSerializer(serializers.Serializer):
id = serializers.CharField(source="scan")
class Meta:
resource_name = "scan-reports"
fields = ["id"]
class ResourceTagSerializer(RLSSerializer):
"""
Serializer for the ResourceTag model

View File

@@ -1,6 +1,11 @@
import glob
import os
from allauth.socialaccount.providers.github.views import GitHubOAuth2Adapter
from allauth.socialaccount.providers.google.views import GoogleOAuth2Adapter
from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
from celery.result import AsyncResult
from config.env import env
from config.settings.social_login import (
GITHUB_OAUTH_CALLBACK_URL,
GOOGLE_OAUTH_CALLBACK_URL,
@@ -12,6 +17,7 @@ from django.contrib.postgres.search import SearchQuery
from django.db import transaction
from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, Subquery, Sum
from django.db.models.functions import Coalesce
from django.http import HttpResponse
from django.urls import reverse
from django.utils.decorators import method_decorator
from django.views.decorators.cache import cache_control
@@ -38,11 +44,11 @@ from rest_framework.permissions import SAFE_METHODS
from rest_framework_json_api.views import RelationshipView, Response
from rest_framework_simplejwt.exceptions import InvalidToken, TokenError
from tasks.beat import schedule_provider_scan
from tasks.jobs.export import get_s3_client
from tasks.tasks import (
check_provider_connection_task,
delete_provider_task,
delete_tenant_task,
perform_scan_summary_task,
perform_scan_task,
)
@@ -121,6 +127,7 @@ from api.v1.serializers import (
RoleSerializer,
RoleUpdateSerializer,
ScanCreateSerializer,
ScanReportSerializer,
ScanSerializer,
ScanUpdateSerializer,
ScheduleDailyCreateSerializer,
@@ -1116,6 +1123,18 @@ class ProviderViewSet(BaseRLSViewSet):
request=ScanCreateSerializer,
responses={202: OpenApiResponse(response=TaskSerializer)},
),
report=extend_schema(
tags=["Scan"],
summary="Download ZIP report",
description="Returns a ZIP file containing the requested report",
request=ScanReportSerializer,
responses={
200: OpenApiResponse(description="Report obtained successfully"),
202: OpenApiResponse(description="The task is in progress"),
403: OpenApiResponse(description="There is a problem with credentials"),
404: OpenApiResponse(description="The scan has no reports"),
},
),
)
@method_decorator(CACHE_DECORATOR, name="list")
@method_decorator(CACHE_DECORATOR, name="retrieve")
@@ -1164,6 +1183,10 @@ class ScanViewSet(BaseRLSViewSet):
return ScanCreateSerializer
elif self.action == "partial_update":
return ScanUpdateSerializer
elif self.action == "report":
if hasattr(self, "response_serializer_class"):
return self.response_serializer_class
return ScanReportSerializer
return super().get_serializer_class()
def partial_update(self, request, *args, **kwargs):
@@ -1181,6 +1204,93 @@ class ScanViewSet(BaseRLSViewSet):
)
return Response(data=read_serializer.data, status=status.HTTP_200_OK)
@action(detail=True, methods=["get"], url_name="report")
def report(self, request, pk=None):
scan_instance = self.get_object()
if scan_instance.state == StateChoices.EXECUTING:
# If the scan is still running, return the task
prowler_task = Task.objects.get(id=scan_instance.task.id)
self.response_serializer_class = TaskSerializer
output_serializer = self.get_serializer(prowler_task)
return Response(
data=output_serializer.data,
status=status.HTTP_202_ACCEPTED,
headers={
"Content-Location": reverse(
"task-detail", kwargs={"pk": output_serializer.data["id"]}
)
},
)
try:
output_celery_task = Task.objects.get(
task_runner_task__task_name="scan-report",
task_runner_task__task_args__contains=pk,
)
self.response_serializer_class = TaskSerializer
output_serializer = self.get_serializer(output_celery_task)
if output_serializer.data["state"] == StateChoices.EXECUTING:
# If the task is still running, return the task
return Response(
data=output_serializer.data,
status=status.HTTP_202_ACCEPTED,
headers={
"Content-Location": reverse(
"task-detail", kwargs={"pk": output_serializer.data["id"]}
)
},
)
except Task.DoesNotExist:
# If the task does not exist, it means that the task is removed from the database
pass
output_location = scan_instance.output_location
if not output_location:
return Response(
{"detail": "The scan has no reports."},
status=status.HTTP_404_NOT_FOUND,
)
if scan_instance.output_location.startswith("s3://"):
try:
s3_client = get_s3_client()
except (ClientError, NoCredentialsError, ParamValidationError):
return Response(
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
bucket_name = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET")
key = output_location[len(f"s3://{bucket_name}/") :]
try:
s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
except ClientError as e:
error_code = e.response.get("Error", {}).get("Code")
if error_code == "NoSuchKey":
return Response(
{"detail": "The scan has no reports."},
status=status.HTTP_404_NOT_FOUND,
)
return Response(
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
file_content = s3_object["Body"].read()
filename = os.path.basename(output_location.split("/")[-1])
else:
zip_files = glob.glob(output_location)
file_path = zip_files[0]
with open(file_path, "rb") as f:
file_content = f.read()
filename = os.path.basename(file_path)
response = HttpResponse(
file_content, content_type="application/x-zip-compressed"
)
response["Content-Disposition"] = f'attachment; filename="{filename}"'
return response
def create(self, request, *args, **kwargs):
input_serializer = self.get_serializer(data=request.data)
input_serializer.is_valid(raise_exception=True)
@@ -1195,10 +1305,6 @@ class ScanViewSet(BaseRLSViewSet):
# Disabled for now
# checks_to_execute=scan.scanner_args.get("checks_to_execute"),
},
link=perform_scan_summary_task.si(
tenant_id=self.request.tenant_id,
scan_id=str(scan.id),
),
)
scan.task_id = task.id

View File

@@ -221,3 +221,18 @@ CACHE_STALE_WHILE_REVALIDATE = env.int("DJANGO_STALE_WHILE_REVALIDATE", 60)
TESTING = False
FINDINGS_MAX_DAYS_IN_RANGE = env.int("DJANGO_FINDINGS_MAX_DAYS_IN_RANGE", 7)
# API export settings
DJANGO_TMP_OUTPUT_DIRECTORY = env.str(
"DJANGO_TMP_OUTPUT_DIRECTORY", "/tmp/prowler_api_output"
)
DJANGO_FINDINGS_BATCH_SIZE = env.str("DJANGO_FINDINGS_BATCH_SIZE", 1000)
DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET", "")
DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID = env.str("DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID", "")
DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY = env.str(
"DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY", ""
)
DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN = env.str("DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN", "")
DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION = env.str("DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION", "")

View File

@@ -486,7 +486,7 @@ def scans_fixture(tenants_fixture, providers_fixture):
name="Scan 1",
provider=provider,
trigger=Scan.TriggerChoices.MANUAL,
state=StateChoices.AVAILABLE,
state=StateChoices.COMPLETED,
tenant_id=tenant.id,
started_at="2024-01-02T00:00:00Z",
)

View File

@@ -0,0 +1,156 @@
import os
import zipfile
import boto3
import config.django.base as base
from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
from celery.utils.log import get_task_logger
from django.conf import settings
from prowler.config.config import (
csv_file_suffix,
html_file_suffix,
json_ocsf_file_suffix,
output_file_timestamp,
)
from prowler.lib.outputs.csv.csv import CSV
from prowler.lib.outputs.html.html import HTML
from prowler.lib.outputs.ocsf.ocsf import OCSF
logger = get_task_logger(__name__)
# Predefined mapping for output formats and their configurations
OUTPUT_FORMATS_MAPPING = {
"csv": {
"class": CSV,
"suffix": csv_file_suffix,
"kwargs": {},
},
"json-ocsf": {"class": OCSF, "suffix": json_ocsf_file_suffix, "kwargs": {}},
"html": {"class": HTML, "suffix": html_file_suffix, "kwargs": {"stats": {}}},
}
def _compress_output_files(output_directory: str) -> str:
"""
Compress output files from all configured output formats into a ZIP archive.
Args:
output_directory (str): The directory where the output files are located.
The function looks up all known suffixes in OUTPUT_FORMATS_MAPPING
and compresses those files into a single ZIP.
Returns:
str: The full path to the newly created ZIP archive.
"""
zip_path = f"{output_directory}.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
for suffix in [config["suffix"] for config in OUTPUT_FORMATS_MAPPING.values()]:
zipf.write(
f"{output_directory}{suffix}",
f"output/{output_directory.split('/')[-1]}{suffix}",
)
return zip_path
def get_s3_client():
"""
Create and return a boto3 S3 client using AWS credentials from environment variables.
This function attempts to initialize an S3 client by reading the AWS access key, secret key,
session token, and region from environment variables. It then validates the client by listing
available S3 buckets. If an error occurs during this process (for example, due to missing or
invalid credentials), it falls back to creating an S3 client without explicitly provided credentials,
which may rely on other configuration sources (e.g., IAM roles).
Returns:
boto3.client: A configured S3 client instance.
Raises:
ClientError, NoCredentialsError, or ParamValidationError if both attempts to create a client fail.
"""
s3_client = None
try:
s3_client = boto3.client(
"s3",
aws_access_key_id=settings.DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY,
aws_session_token=settings.DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN,
region_name=settings.DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION,
)
s3_client.list_buckets()
except (ClientError, NoCredentialsError, ParamValidationError, ValueError):
s3_client = boto3.client("s3")
s3_client.list_buckets()
return s3_client
def _upload_to_s3(tenant_id: str, zip_path: str, scan_id: str) -> str:
"""
Upload the specified ZIP file to an S3 bucket.
If the S3 bucket environment variables are not configured,
the function returns None without performing an upload.
Args:
tenant_id (str): The tenant identifier, used as part of the S3 key prefix.
zip_path (str): The local file system path to the ZIP file to be uploaded.
scan_id (str): The scan identifier, used as part of the S3 key prefix.
Returns:
str: The S3 URI of the uploaded file (e.g., "s3://<bucket>/<key>") if successful.
None: If the required environment variables for the S3 bucket are not set.
Raises:
botocore.exceptions.ClientError: If the upload attempt to S3 fails for any reason.
"""
if not base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET:
return
try:
s3 = get_s3_client()
s3_key = f"{tenant_id}/{scan_id}/{os.path.basename(zip_path)}"
s3.upload_file(
Filename=zip_path,
Bucket=base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET,
Key=s3_key,
)
return f"s3://{base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET}/{s3_key}"
except (ClientError, NoCredentialsError, ParamValidationError, ValueError) as e:
logger.error(f"S3 upload failed: {str(e)}")
def _generate_output_directory(
output_directory, prowler_provider: object, tenant_id: str, scan_id: str
) -> str:
"""
Generate a file system path for the output directory of a prowler scan.
This function constructs the output directory path by combining a base
temporary output directory, the tenant ID, the scan ID, and details about
the prowler provider along with a timestamp. The resulting path is used to
store the output files of a prowler scan.
Note:
This function depends on one external variable:
- `output_file_timestamp`: A timestamp (as a string) used to uniquely identify the output.
Args:
output_directory (str): The base output directory.
prowler_provider (object): An identifier or descriptor for the prowler provider.
Typically, this is a string indicating the provider (e.g., "aws").
tenant_id (str): The unique identifier for the tenant.
scan_id (str): The unique identifier for the scan.
Returns:
str: The constructed file system path for the prowler scan output directory.
Example:
>>> _generate_output_directory("/tmp", "aws", "tenant-1234", "scan-5678")
'/tmp/tenant-1234/aws/scan-5678/prowler-output-2023-02-15T12:34:56'
"""
path = (
f"{output_directory}/{tenant_id}/{scan_id}/prowler-output-"
f"{prowler_provider}-{output_file_timestamp}"
)
os.makedirs("/".join(path.split("/")[:-1]), exist_ok=True)
return path

View File

@@ -1,14 +1,28 @@
from celery import shared_task
from shutil import rmtree
from celery import chain, shared_task
from celery.utils.log import get_task_logger
from config.celery import RLSTask
from config.django.base import DJANGO_FINDINGS_BATCH_SIZE, DJANGO_TMP_OUTPUT_DIRECTORY
from django_celery_beat.models import PeriodicTask
from tasks.jobs.connection import check_provider_connection
from tasks.jobs.deletion import delete_provider, delete_tenant
from tasks.jobs.export import (
OUTPUT_FORMATS_MAPPING,
_compress_output_files,
_generate_output_directory,
_upload_to_s3,
)
from tasks.jobs.scan import aggregate_findings, perform_prowler_scan
from tasks.utils import get_next_execution_datetime
from tasks.utils import batched, get_next_execution_datetime
from api.db_utils import rls_transaction
from api.decorators import set_tenant
from api.models import Scan, StateChoices
from api.models import Finding, Provider, Scan, ScanSummary, StateChoices
from api.utils import initialize_prowler_provider
from prowler.lib.outputs.finding import Finding as FindingOutput
logger = get_task_logger(__name__)
@shared_task(base=RLSTask, name="provider-connection-check")
@@ -68,13 +82,20 @@ def perform_scan_task(
Returns:
dict: The result of the scan execution, typically including the status and results of the performed checks.
"""
return perform_prowler_scan(
result = perform_prowler_scan(
tenant_id=tenant_id,
scan_id=scan_id,
provider_id=provider_id,
checks_to_execute=checks_to_execute,
)
chain(
perform_scan_summary_task.si(tenant_id, scan_id),
generate_outputs.si(scan_id, provider_id, tenant_id=tenant_id),
).apply_async()
return result
@shared_task(base=RLSTask, bind=True, name="scan-perform-scheduled", queue="scans")
def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
@@ -135,12 +156,11 @@ def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
scheduler_task_id=periodic_task_instance.id,
)
perform_scan_summary_task.apply_async(
kwargs={
"tenant_id": tenant_id,
"scan_id": str(scan_instance.id),
}
)
chain(
perform_scan_summary_task.si(tenant_id, scan_instance.id),
generate_outputs.si(str(scan_instance.id), provider_id, tenant_id=tenant_id),
).apply_async()
return result
@@ -152,3 +172,108 @@ def perform_scan_summary_task(tenant_id: str, scan_id: str):
@shared_task(name="tenant-deletion", queue="deletion")
def delete_tenant_task(tenant_id: str):
return delete_tenant(pk=tenant_id)
@shared_task(
base=RLSTask,
name="scan-report",
queue="scan-reports",
)
@set_tenant(keep_tenant=True)
def generate_outputs(scan_id: str, provider_id: str, tenant_id: str):
"""
Process findings in batches and generate output files in multiple formats.
This function retrieves findings associated with a scan, processes them
in batches of 50, and writes each batch to the corresponding output files.
It reuses output writer instances across batches, updates them with each
batch of transformed findings, and uses a flag to indicate when the final
batch is being processed. Finally, the output files are compressed and
uploaded to S3.
Args:
tenant_id (str): The tenant identifier.
scan_id (str): The scan identifier.
provider_id (str): The provider_id id to be used in generating outputs.
"""
# Initialize the prowler provider
prowler_provider = initialize_prowler_provider(Provider.objects.get(id=provider_id))
# Get the provider UID
provider_uid = Provider.objects.get(id=provider_id).uid
# Generate and ensure the output directory exists
output_directory = _generate_output_directory(
DJANGO_TMP_OUTPUT_DIRECTORY, provider_uid, tenant_id, scan_id
)
# Define auxiliary variables
output_writers = {}
scan_summary = FindingOutput._transform_findings_stats(
ScanSummary.objects.filter(scan_id=scan_id)
)
# Retrieve findings queryset
findings_qs = Finding.all_objects.filter(scan_id=scan_id).order_by("uid")
# Process findings in batches
for batch, is_last_batch in batched(
findings_qs.iterator(), DJANGO_FINDINGS_BATCH_SIZE
):
finding_outputs = [
FindingOutput.transform_api_finding(finding, prowler_provider)
for finding in batch
]
# Generate output files
for mode, config in OUTPUT_FORMATS_MAPPING.items():
kwargs = dict(config.get("kwargs", {}))
if mode == "html":
kwargs["provider"] = prowler_provider
kwargs["stats"] = scan_summary
writer_class = config["class"]
if writer_class in output_writers:
writer = output_writers[writer_class]
writer.transform(finding_outputs)
writer.close_file = is_last_batch
else:
writer = writer_class(
findings=finding_outputs,
file_path=output_directory,
file_extension=config["suffix"],
from_cli=False,
)
writer.close_file = is_last_batch
output_writers[writer_class] = writer
# Write the current batch using the writer
writer.batch_write_data_to_file(**kwargs)
# TODO: Refactor the output classes to avoid this manual reset
writer._data = []
# Compress output files
output_directory = _compress_output_files(output_directory)
# Save to configured storage
uploaded = _upload_to_s3(tenant_id, output_directory, scan_id)
if uploaded:
output_directory = uploaded
uploaded = True
# Remove the local files after upload
rmtree(DJANGO_TMP_OUTPUT_DIRECTORY, ignore_errors=True)
else:
uploaded = False
# Update the scan instance with the output path
Scan.all_objects.filter(id=scan_id).update(output_location=output_directory)
logger.info(f"Scan output files generated, output location: {output_directory}")
return {
"upload": uploaded,
"scan_id": scan_id,
"provider_id": provider_id,
}

View File

@@ -4,7 +4,7 @@ from unittest.mock import patch
import pytest
from django_celery_beat.models import IntervalSchedule, PeriodicTask
from django_celery_results.models import TaskResult
from tasks.utils import get_next_execution_datetime
from tasks.utils import batched, get_next_execution_datetime
@pytest.mark.django_db
@@ -74,3 +74,29 @@ class TestGetNextExecutionDatetime:
get_next_execution_datetime(
task_id=task_result.task_id, provider_id="nonexistent"
)
class TestBatchedFunction:
def test_empty_iterable(self):
result = list(batched([], 3))
assert result == [([], True)]
def test_exact_batches(self):
result = list(batched([1, 2, 3, 4], 2))
expected = [([1, 2], False), ([3, 4], False), ([], True)]
assert result == expected
def test_inexact_batches(self):
result = list(batched([1, 2, 3, 4, 5], 2))
expected = [([1, 2], False), ([3, 4], False), ([5], True)]
assert result == expected
def test_batch_size_one(self):
result = list(batched([1, 2, 3], 1))
expected = [([1], False), ([2], False), ([3], False), ([], True)]
assert result == expected
def test_batch_size_greater_than_length(self):
result = list(batched([1, 2, 3], 5))
expected = [([1, 2, 3], True)]
assert result == expected

View File

@@ -24,3 +24,27 @@ def get_next_execution_datetime(task_id: int, provider_id: str) -> datetime:
)
return current_scheduled_time + timedelta(**{interval.period: interval.every})
def batched(iterable, batch_size):
"""
Yield successive batches from an iterable.
Args:
iterable: An iterable source of items.
batch_size (int): The number of items per batch.
Yields:
tuple: A pair (batch, is_last_batch) where:
- batch (list): A list of items (with length equal to batch_size,
except possibly for the last batch).
- is_last_batch (bool): True if this is the final batch, False otherwise.
"""
batch = []
for item in iterable:
batch.append(item)
if len(batch) == batch_size:
yield batch, False
batch = []
yield batch, True