feat(export): add API export system (#6878)

2026-07-23 12:31:54 +00:00 · 2025-02-26 15:49:44 +01:00
parent c4528200b0
commit 669ec74e67
34 changed files with 1613 additions and 90 deletions
@@ -8,6 +8,7 @@ All notable changes to the **Prowler API** are documented in this file.

 ### Added
 - Social login integration with Google and GitHub [(#6906)](https://github.com/prowler-cloud/prowler/pull/6906)
+- Add API scan report system: all scans launched from the API now generate a compressed file with the report in OCSF, CSV and HTML formats [(#6878)](https://github.com/prowler-cloud/prowler/pull/6878)
 - Configurable Sentry integration [(#6874)](https://github.com/prowler-cloud/prowler/pull/6874)

 ### Changed
@@ -28,7 +28,7 @@ start_prod_server() {

 start_worker() {
  echo "Starting the worker..."
-  poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,deletion -E --max-tasks-per-child 1
+  poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,scan-reports,deletion -E --max-tasks-per-child 1
 }

 start_worker_beat() {
@@ -7,7 +7,7 @@ from rest_framework_json_api.serializers import ValidationError
 from api.db_utils import POSTGRES_TENANT_VAR, SET_CONFIG_QUERY


-def set_tenant(func):
+def set_tenant(func=None, *, keep_tenant=False):
    """
    Decorator to set the tenant context for a Celery task based on the provided tenant_id.

@@ -40,20 +40,29 @@ def set_tenant(func):
        # The tenant context will be set before the task logic executes.
    """

-    @wraps(func)
-    @transaction.atomic
-    def wrapper(*args, **kwargs):
-        try:
-            tenant_id = kwargs.pop("tenant_id")
-        except KeyError:
-            raise KeyError("This task requires the tenant_id")
-        try:
-            uuid.UUID(tenant_id)
-        except ValueError:
-            raise ValidationError("Tenant ID must be a valid UUID")
-        with connection.cursor() as cursor:
-            cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])
+    def decorator(func):
+        @wraps(func)
+        @transaction.atomic
+        def wrapper(*args, **kwargs):
+            try:
+                if not keep_tenant:
+                    tenant_id = kwargs.pop("tenant_id")
+                else:
+                    tenant_id = kwargs["tenant_id"]
+            except KeyError:
+                raise KeyError("This task requires the tenant_id")
+            try:
+                uuid.UUID(tenant_id)
+            except ValueError:
+                raise ValidationError("Tenant ID must be a valid UUID")
+            with connection.cursor() as cursor:
+                cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])

-        return func(*args, **kwargs)
+            return func(*args, **kwargs)

-    return wrapper
+        return wrapper
+
+    if func is None:
+        return decorator
+    else:
+        return decorator(func)
@@ -0,0 +1,15 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Add the optional `output_location` column (max 200 chars) to the `scan` model."""
+
+    dependencies = [
+        ("api", "0011_findings_performance_indexes_parent"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="scan",
+            name="output_location",
+            field=models.CharField(blank=True, max_length=200, null=True),
+        ),
+    ]
@@ -414,6 +414,7 @@ class Scan(RowLevelSecurityProtectedModel):
    scheduler_task = models.ForeignKey(
        PeriodicTask, on_delete=models.CASCADE, null=True, blank=True
    )
+    output_location = models.CharField(blank=True, null=True, max_length=200)
    # TODO: mutelist foreign key

    class Meta(RowLevelSecurityProtectedModel.Meta):
@@ -4105,6 +4105,43 @@ paths:
              schema:
                $ref: '#/components/schemas/ScanUpdateResponse'
          description: ''
+  /api/v1/scans/{id}/report:
+    get:
+      operationId: scans_report_retrieve
+      description: Returns a ZIP file containing the requested report
+      summary: Download ZIP report
+      parameters:
+      - in: query
+        name: fields[scan-reports]
+        schema:
+          type: array
+          items:
+            type: string
+            enum:
+            - id
+        description: endpoint return only specific fields in the response on a per-type
+          basis by including a fields[TYPE] query parameter.
+        explode: false
+      - in: path
+        name: id
+        schema:
+          type: string
+          format: uuid
+        description: A UUID string identifying this scan.
+        required: true
+      tags:
+      - Scan
+      security:
+      - jwtAuth: []
+      responses:
+        '200':
+          description: Report obtained successfully
+        '202':
+          description: The task is in progress
+        '403':
+          description: There is a problem with credentials
+        '404':
+          description: The scan has no reports
  /api/v1/schedules/daily:
    post:
      operationId: schedules_daily_create
@@ -274,9 +274,10 @@ class TestValidateInvitation:
        expired_time = datetime.now(timezone.utc) - timedelta(days=1)
        invitation.expires_at = expired_time

-        with patch("api.utils.Invitation.objects.using") as mock_using, patch(
-            "api.utils.datetime"
-        ) as mock_datetime:
+        with (
+            patch("api.utils.Invitation.objects.using") as mock_using,
+            patch("api.utils.datetime") as mock_datetime,
+        ):
            mock_db = mock_using.return_value
            mock_db.get.return_value = invitation
            mock_datetime.now.return_value = datetime.now(timezone.utc)
@@ -1,9 +1,13 @@
+import glob
+import io
 import json
+import os
 from datetime import datetime, timedelta, timezone
 from unittest.mock import ANY, Mock, patch

 import jwt
 import pytest
+from botocore.exceptions import NoCredentialsError
 from conftest import API_JSON_CONTENT_TYPE, TEST_PASSWORD, TEST_USER
 from django.conf import settings
 from django.urls import reverse
@@ -20,6 +24,7 @@ from api.models import (
    RoleProviderGroupRelationship,
    Scan,
    StateChoices,
+    Task,
    User,
    UserRoleRelationship,
 )
@@ -2079,9 +2084,9 @@ class TestScanViewSet:
                ("started_at.gte", "2024-01-01", 3),
                ("started_at.lte", "2024-01-01", 0),
                ("trigger", Scan.TriggerChoices.MANUAL, 1),
-                ("state", StateChoices.AVAILABLE, 2),
+                ("state", StateChoices.AVAILABLE, 1),
                ("state", StateChoices.FAILED, 1),
-                ("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 3),
+                ("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 2),
                ("trigger", Scan.TriggerChoices.MANUAL, 1),
            ]
        ),
@@ -2156,6 +2161,159 @@ class TestScanViewSet:
        response = authenticated_client.get(reverse("scan-list"), {"sort": "invalid"})
        assert response.status_code == status.HTTP_400_BAD_REQUEST

+    def test_report_executing(self, authenticated_client, scans_fixture):
+        """
+        When the scan is still executing (state == EXECUTING), the view should return
+        the task data with HTTP 202 and a Content-Location header.
+        """
+        scan = scans_fixture[0]
+        scan.state = StateChoices.EXECUTING
+        scan.save()
+
+        task = Task.objects.create(tenant_id=scan.tenant_id)
+        dummy_task_data = {"id": str(task.id), "state": StateChoices.EXECUTING}
+
+        scan.task = task
+        scan.save()
+
+        with patch(
+            "api.v1.views.TaskSerializer",
+            return_value=type("DummySerializer", (), {"data": dummy_task_data}),
+        ):
+            url = reverse("scan-report", kwargs={"pk": scan.id})
+            response = authenticated_client.get(url)
+            assert response.status_code == status.HTTP_202_ACCEPTED
+            assert "Content-Location" in response
+            assert dummy_task_data["id"] in response["Content-Location"]
+
+    def test_report_celery_task_executing(self, authenticated_client, scans_fixture):
+        """
+        When the scan is not executing but a related celery task exists and is running,
+        the view should return that task data with HTTP 202.
+        """
+        scan = scans_fixture[0]
+        scan.state = StateChoices.COMPLETED
+        scan.output_location = "dummy"
+        scan.save()
+
+        dummy_task = Task.objects.create(tenant_id=scan.tenant_id)
+        dummy_task.id = "dummy-task-id"
+        dummy_task_data = {"id": dummy_task.id, "state": StateChoices.EXECUTING}
+
+        with patch("api.v1.views.Task.objects.get", return_value=dummy_task), patch(
+            "api.v1.views.TaskSerializer",
+            return_value=type("DummySerializer", (), {"data": dummy_task_data}),
+        ):
+            url = reverse("scan-report", kwargs={"pk": scan.id})
+            response = authenticated_client.get(url)
+            assert response.status_code == status.HTTP_202_ACCEPTED
+            assert "Content-Location" in response
+            assert dummy_task_data["id"] in response["Content-Location"]
+
+    def test_report_no_output_location(self, authenticated_client, scans_fixture):
+        """
+        If the scan does not have an output_location, the view should return a 404.
+        """
+        scan = scans_fixture[0]
+        scan.state = StateChoices.COMPLETED
+        scan.output_location = ""
+        scan.save()
+
+        url = reverse("scan-report", kwargs={"pk": scan.id})
+        response = authenticated_client.get(url)
+        assert response.status_code == status.HTTP_404_NOT_FOUND
+        assert response.json()["errors"]["detail"] == "The scan has no reports."
+
+    def test_report_s3_no_credentials(
+        self, authenticated_client, scans_fixture, monkeypatch
+    ):
+        """
+        When output_location is an S3 URL and get_s3_client() raises a credentials exception,
+        the view should return HTTP 403 with the proper error message.
+        """
+        scan = scans_fixture[0]
+        bucket = "test-bucket"
+        key = "report.zip"
+        scan.output_location = f"s3://{bucket}/{key}"
+        scan.state = StateChoices.COMPLETED
+        scan.save()
+
+        def fake_get_s3_client():
+            raise NoCredentialsError()
+
+        monkeypatch.setattr("api.v1.views.get_s3_client", fake_get_s3_client)
+
+        url = reverse("scan-report", kwargs={"pk": scan.id})
+        response = authenticated_client.get(url)
+        assert response.status_code == status.HTTP_403_FORBIDDEN
+        assert (
+            response.json()["errors"]["detail"]
+            == "There is a problem with credentials."
+        )
+
+    def test_report_s3_success(self, authenticated_client, scans_fixture, monkeypatch):
+        """
+        When output_location is an S3 URL and the S3 client returns the file successfully,
+        the view should return the ZIP file with HTTP 200 and proper headers.
+        """
+        scan = scans_fixture[0]
+        bucket = "test-bucket"
+        key = "report.zip"
+        scan.output_location = f"s3://{bucket}/{key}"
+        scan.state = StateChoices.COMPLETED
+        scan.save()
+
+        monkeypatch.setattr(
+            "api.v1.views.env", type("env", (), {"str": lambda self, key: bucket})()
+        )
+
+        class FakeS3Client:
+            def get_object(self, Bucket, Key):
+                assert Bucket == bucket
+                assert Key == key
+                return {"Body": io.BytesIO(b"s3 zip content")}
+
+        monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
+
+        url = reverse("scan-report", kwargs={"pk": scan.id})
+        response = authenticated_client.get(url)
+        assert response.status_code == 200
+        expected_filename = os.path.basename("report.zip")
+        content_disposition = response.get("Content-Disposition")
+        assert content_disposition.startswith('attachment; filename="')
+        assert f'filename="{expected_filename}"' in content_disposition
+        assert response.content == b"s3 zip content"
+
+    def test_report_local_file(
+        self, authenticated_client, scans_fixture, tmp_path, monkeypatch
+    ):
+        """
+        When output_location is a local file path, the view should read the file from disk
+        and return it with proper headers.
+        """
+        scan = scans_fixture[0]
+        file_content = b"local zip file content"
+        file_path = tmp_path / "report.zip"
+        file_path.write_bytes(file_content)
+
+        scan.output_location = str(file_path)
+        scan.state = StateChoices.COMPLETED
+        scan.save()
+
+        monkeypatch.setattr(
+            glob,
+            "glob",
+            lambda pattern: [str(file_path)] if pattern == str(file_path) else [],
+        )
+
+        url = reverse("scan-report", kwargs={"pk": scan.id})
+        response = authenticated_client.get(url)
+        assert response.status_code == 200
+        assert response.content == file_content
+        content_disposition = response.get("Content-Disposition")
+        assert content_disposition.startswith('attachment; filename="')
+        assert f'filename="{file_path.name}"' in content_disposition
+

@pytest.mark.django_db
 class TestTaskViewSet:
@@ -939,6 +939,14 @@ class ScanTaskSerializer(RLSSerializer):
        ]


+class ScanReportSerializer(serializers.Serializer):
+    """
+    Serializer for the scan report endpoint
+
+    Exposes a single `id` field, read from the `scan` attribute of the serialized
+    object, under the `scan-reports` resource name.
+    """
+
+    id = serializers.CharField(source="scan")
+
+    class Meta:
+        resource_name = "scan-reports"
+        fields = ["id"]
+
+
 class ResourceTagSerializer(RLSSerializer):
    """
    Serializer for the ResourceTag model
@@ -1,6 +1,11 @@
+import glob
+import os
+
 from allauth.socialaccount.providers.github.views import GitHubOAuth2Adapter
 from allauth.socialaccount.providers.google.views import GoogleOAuth2Adapter
+from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
 from celery.result import AsyncResult
+from config.env import env
 from config.settings.social_login import (
    GITHUB_OAUTH_CALLBACK_URL,
    GOOGLE_OAUTH_CALLBACK_URL,
@@ -12,6 +17,7 @@ from django.contrib.postgres.search import SearchQuery
 from django.db import transaction
 from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, Subquery, Sum
 from django.db.models.functions import Coalesce
+from django.http import HttpResponse
 from django.urls import reverse
 from django.utils.decorators import method_decorator
 from django.views.decorators.cache import cache_control
@@ -38,11 +44,11 @@ from rest_framework.permissions import SAFE_METHODS
 from rest_framework_json_api.views import RelationshipView, Response
 from rest_framework_simplejwt.exceptions import InvalidToken, TokenError
 from tasks.beat import schedule_provider_scan
+from tasks.jobs.export import get_s3_client
 from tasks.tasks import (
    check_provider_connection_task,
    delete_provider_task,
    delete_tenant_task,
-    perform_scan_summary_task,
    perform_scan_task,
 )

@@ -121,6 +127,7 @@ from api.v1.serializers import (
    RoleSerializer,
    RoleUpdateSerializer,
    ScanCreateSerializer,
+    ScanReportSerializer,
    ScanSerializer,
    ScanUpdateSerializer,
    ScheduleDailyCreateSerializer,
@@ -1116,6 +1123,18 @@ class ProviderViewSet(BaseRLSViewSet):
        request=ScanCreateSerializer,
        responses={202: OpenApiResponse(response=TaskSerializer)},
    ),
+    report=extend_schema(
+        tags=["Scan"],
+        summary="Download ZIP report",
+        description="Returns a ZIP file containing the requested report",
+        request=ScanReportSerializer,
+        responses={
+            200: OpenApiResponse(description="Report obtained successfully"),
+            202: OpenApiResponse(description="The task is in progress"),
+            403: OpenApiResponse(description="There is a problem with credentials"),
+            404: OpenApiResponse(description="The scan has no reports"),
+        },
+    ),
 )
@method_decorator(CACHE_DECORATOR, name="list")
@method_decorator(CACHE_DECORATOR, name="retrieve")
@@ -1164,6 +1183,10 @@ class ScanViewSet(BaseRLSViewSet):
            return ScanCreateSerializer
        elif self.action == "partial_update":
            return ScanUpdateSerializer
+        elif self.action == "report":
+            if hasattr(self, "response_serializer_class"):
+                return self.response_serializer_class
+            return ScanReportSerializer
        return super().get_serializer_class()

    def partial_update(self, request, *args, **kwargs):
@@ -1181,6 +1204,93 @@ class ScanViewSet(BaseRLSViewSet):
        )
        return Response(data=read_serializer.data, status=status.HTTP_200_OK)

+    @action(detail=True, methods=["get"], url_name="report")
+    def report(self, request, pk=None):
+        """
+        Download the ZIP report generated for a scan.
+
+        Responses:
+            - 202 with the serialized task and a `Content-Location` header while the
+              scan, or its "scan-report" task, is still executing.
+            - 404 when the scan has no `output_location`, the S3 object does not
+              exist, or no local file matches `output_location`.
+            - 403 when the S3 client cannot be created or S3 returns any other
+              client error.
+            - 200 with the ZIP content as an attachment otherwise.
+        """
+        scan_instance = self.get_object()
+
+        if scan_instance.state == StateChoices.EXECUTING:
+            # If the scan is still running, return the task
+            prowler_task = Task.objects.get(id=scan_instance.task.id)
+            self.response_serializer_class = TaskSerializer
+            output_serializer = self.get_serializer(prowler_task)
+            return Response(
+                data=output_serializer.data,
+                status=status.HTTP_202_ACCEPTED,
+                headers={
+                    "Content-Location": reverse(
+                        "task-detail", kwargs={"pk": output_serializer.data["id"]}
+                    )
+                },
+            )
+
+        try:
+            output_celery_task = Task.objects.get(
+                task_runner_task__task_name="scan-report",
+                task_runner_task__task_args__contains=pk,
+            )
+            self.response_serializer_class = TaskSerializer
+            output_serializer = self.get_serializer(output_celery_task)
+            if output_serializer.data["state"] == StateChoices.EXECUTING:
+                # If the task is still running, return the task
+                return Response(
+                    data=output_serializer.data,
+                    status=status.HTTP_202_ACCEPTED,
+                    headers={
+                        "Content-Location": reverse(
+                            "task-detail", kwargs={"pk": output_serializer.data["id"]}
+                        )
+                    },
+                )
+        except Task.DoesNotExist:
+            # If the task does not exist, it means that the task is removed from the database
+            pass
+
+        output_location = scan_instance.output_location
+        if not output_location:
+            return Response(
+                {"detail": "The scan has no reports."},
+                status=status.HTTP_404_NOT_FOUND,
+            )
+
+        if output_location.startswith("s3://"):
+            try:
+                s3_client = get_s3_client()
+            except (ClientError, NoCredentialsError, ParamValidationError):
+                return Response(
+                    {"detail": "There is a problem with credentials."},
+                    status=status.HTTP_403_FORBIDDEN,
+                )
+
+            bucket_name = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET")
+            # The object key is whatever follows the "s3://<bucket>/" prefix
+            key = output_location[len(f"s3://{bucket_name}/") :]
+            try:
+                s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
+            except ClientError as e:
+                error_code = e.response.get("Error", {}).get("Code")
+                if error_code == "NoSuchKey":
+                    return Response(
+                        {"detail": "The scan has no reports."},
+                        status=status.HTTP_404_NOT_FOUND,
+                    )
+                return Response(
+                    {"detail": "There is a problem with credentials."},
+                    status=status.HTTP_403_FORBIDDEN,
+                )
+            file_content = s3_object["Body"].read()
+            filename = os.path.basename(output_location.split("/")[-1])
+        else:
+            # `output_location` is treated as a glob pattern. The archive may no
+            # longer be on disk, so an empty match is answered with a 404 instead
+            # of failing with an IndexError.
+            zip_files = glob.glob(output_location)
+            if not zip_files:
+                return Response(
+                    {"detail": "The scan has no reports."},
+                    status=status.HTTP_404_NOT_FOUND,
+                )
+            file_path = zip_files[0]
+            with open(file_path, "rb") as f:
+                file_content = f.read()
+            filename = os.path.basename(file_path)
+
+        response = HttpResponse(
+            file_content, content_type="application/x-zip-compressed"
+        )
+        response["Content-Disposition"] = f'attachment; filename="{filename}"'
+        return response
+
    def create(self, request, *args, **kwargs):
        input_serializer = self.get_serializer(data=request.data)
        input_serializer.is_valid(raise_exception=True)
@@ -1195,10 +1305,6 @@ class ScanViewSet(BaseRLSViewSet):
                    # Disabled for now
                    # checks_to_execute=scan.scanner_args.get("checks_to_execute"),
                },
-                link=perform_scan_summary_task.si(
-                    tenant_id=self.request.tenant_id,
-                    scan_id=str(scan.id),
-                ),
            )

        scan.task_id = task.id
@@ -221,3 +221,18 @@ CACHE_STALE_WHILE_REVALIDATE = env.int("DJANGO_STALE_WHILE_REVALIDATE", 60)
 TESTING = False

 FINDINGS_MAX_DAYS_IN_RANGE = env.int("DJANGO_FINDINGS_MAX_DAYS_IN_RANGE", 7)
+
+
+# API export settings
+DJANGO_TMP_OUTPUT_DIRECTORY = env.str(
+    "DJANGO_TMP_OUTPUT_DIRECTORY", "/tmp/prowler_api_output"
+)
+# Parsed as an integer: it is used as a batch size, and `env.str` would yield a
+# string whenever the variable is set in the environment.
+DJANGO_FINDINGS_BATCH_SIZE = env.int("DJANGO_FINDINGS_BATCH_SIZE", 1000)
+
+DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET", "")
+DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID = env.str("DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID", "")
+DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY = env.str(
+    "DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY", ""
+)
+DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN = env.str("DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN", "")
+DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION = env.str("DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION", "")
@@ -486,7 +486,7 @@ def scans_fixture(tenants_fixture, providers_fixture):
        name="Scan 1",
        provider=provider,
        trigger=Scan.TriggerChoices.MANUAL,
-        state=StateChoices.AVAILABLE,
+        state=StateChoices.COMPLETED,
        tenant_id=tenant.id,
        started_at="2024-01-02T00:00:00Z",
    )
@@ -0,0 +1,156 @@
+import os
+import zipfile
+
+import boto3
+import config.django.base as base
+from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
+from celery.utils.log import get_task_logger
+from django.conf import settings
+
+from prowler.config.config import (
+    csv_file_suffix,
+    html_file_suffix,
+    json_ocsf_file_suffix,
+    output_file_timestamp,
+)
+from prowler.lib.outputs.csv.csv import CSV
+from prowler.lib.outputs.html.html import HTML
+from prowler.lib.outputs.ocsf.ocsf import OCSF
+
+logger = get_task_logger(__name__)
+
+
+# Predefined mapping for output formats and their configurations
+OUTPUT_FORMATS_MAPPING = {
+    "csv": {
+        "class": CSV,
+        "suffix": csv_file_suffix,
+        "kwargs": {},
+    },
+    "json-ocsf": {"class": OCSF, "suffix": json_ocsf_file_suffix, "kwargs": {}},
+    "html": {"class": HTML, "suffix": html_file_suffix, "kwargs": {"stats": {}}},
+}
+
+
+def _compress_output_files(output_directory: str) -> str:
+    """
+    Compress output files from all configured output formats into a ZIP archive.
+    Args:
+        output_directory (str): The path prefix shared by the output files. Each file
+            is expected at `<output_directory><suffix>`, for every suffix configured
+            in OUTPUT_FORMATS_MAPPING, and all of them are compressed into a single ZIP.
+    Returns:
+        str: The full path to the newly created ZIP archive (`<output_directory>.zip`).
+    Note:
+        Every configured format is added unconditionally, so a missing output file
+        makes `ZipFile.write` fail.
+    """
+    zip_path = f"{output_directory}.zip"
+
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+        for suffix in [config["suffix"] for config in OUTPUT_FORMATS_MAPPING.values()]:
+            # Store each file under "output/" using only the last path component,
+            # so the archive does not expose the local directory layout
+            zipf.write(
+                f"{output_directory}{suffix}",
+                f"output/{output_directory.split('/')[-1]}{suffix}",
+            )
+
+    return zip_path
+
+
+def get_s3_client():
+    """
+    Create and return a boto3 S3 client using the AWS credentials from the Django settings.
+
+    This function first initializes an S3 client with the access key, secret key,
+    session token, and region read from the `DJANGO_OUTPUT_S3_AWS_*` settings. It then
+    validates the client by listing the available S3 buckets. If an error occurs during
+    this process (for example, due to missing or invalid credentials), it falls back to
+    creating an S3 client without explicitly provided credentials, which may rely on
+    other configuration sources (e.g., IAM roles), and validates it the same way.
+
+    Returns:
+        boto3.client: A configured S3 client instance.
+
+    Raises:
+        Any exception raised while creating or validating the fallback client
+        (e.g., ClientError, NoCredentialsError, or ParamValidationError); the
+        fallback attempt is not guarded.
+    """
+    s3_client = None
+    try:
+        s3_client = boto3.client(
+            "s3",
+            aws_access_key_id=settings.DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=settings.DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY,
+            aws_session_token=settings.DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN,
+            region_name=settings.DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION,
+        )
+        # Creating the client does not validate the credentials; this call does
+        s3_client.list_buckets()
+    except (ClientError, NoCredentialsError, ParamValidationError, ValueError):
+        # Retry without explicit credentials; errors here propagate to the caller
+        s3_client = boto3.client("s3")
+        s3_client.list_buckets()
+
+    return s3_client
+
+
+def _upload_to_s3(tenant_id: str, zip_path: str, scan_id: str) -> str:
+    """
+    Upload the specified ZIP file to an S3 bucket.
+    If the output bucket setting is not configured,
+    the function returns None without performing an upload.
+    Args:
+        tenant_id (str): The tenant identifier, used as part of the S3 key prefix.
+        zip_path (str): The local file system path to the ZIP file to be uploaded.
+        scan_id (str): The scan identifier, used as part of the S3 key prefix.
+    Returns:
+        str: The S3 URI of the uploaded file (e.g., "s3://<bucket>/<key>") if successful.
+        None: If the output bucket is not configured, or if the upload fails with
+            one of the handled errors.
+    Note:
+        ClientError, NoCredentialsError, ParamValidationError and ValueError are
+        logged and swallowed, not raised.
+        NOTE(review): boto3's `upload_file` may report failures with
+        `S3UploadFailedError`, which is not handled here — confirm.
+    """
+    if not base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET:
+        return
+
+    try:
+        s3 = get_s3_client()
+        # Key layout: <tenant_id>/<scan_id>/<zip file name>
+        s3_key = f"{tenant_id}/{scan_id}/{os.path.basename(zip_path)}"
+        s3.upload_file(
+            Filename=zip_path,
+            Bucket=base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET,
+            Key=s3_key,
+        )
+        return f"s3://{base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET}/{s3_key}"
+    except (ClientError, NoCredentialsError, ParamValidationError, ValueError) as e:
+        logger.error(f"S3 upload failed: {str(e)}")
+
+
+def _generate_output_directory(
+    output_directory, prowler_provider: object, tenant_id: str, scan_id: str
+) -> str:
+    """
+    Generate a file system path for the output files of a prowler scan.
+
+    This function constructs the output path by combining a base
+    temporary output directory, the tenant ID, the scan ID, and details about
+    the prowler provider along with a timestamp. Only the parent directory
+    (`<output_directory>/<tenant_id>/<scan_id>`) is created on disk; the last
+    component of the returned path is a file name prefix.
+
+    Note:
+        This function depends on one external variable:
+          - `output_file_timestamp`: A timestamp (as a string) used to uniquely identify the output.
+
+    Args:
+        output_directory (str): The base output directory.
+        prowler_provider (object): An identifier or descriptor for the prowler provider.
+                                   It is interpolated into the file name prefix (e.g., "aws").
+        tenant_id (str): The unique identifier for the tenant.
+        scan_id (str): The unique identifier for the scan.
+
+    Returns:
+        str: The constructed file system path prefix for the prowler scan output files.
+
+    Example:
+        >>> _generate_output_directory("/tmp", "aws", "tenant-1234", "scan-5678")
+        '/tmp/tenant-1234/scan-5678/prowler-output-aws-<output_file_timestamp>'
+    """
+    path = (
+        f"{output_directory}/{tenant_id}/{scan_id}/prowler-output-"
+        f"{prowler_provider}-{output_file_timestamp}"
+    )
+    # Create everything except the last component, which is a file name prefix
+    os.makedirs("/".join(path.split("/")[:-1]), exist_ok=True)
+
+    return path
@@ -1,14 +1,28 @@
-from celery import shared_task
+from shutil import rmtree
+
+from celery import chain, shared_task
+from celery.utils.log import get_task_logger
 from config.celery import RLSTask
+from config.django.base import DJANGO_FINDINGS_BATCH_SIZE, DJANGO_TMP_OUTPUT_DIRECTORY
 from django_celery_beat.models import PeriodicTask
 from tasks.jobs.connection import check_provider_connection
 from tasks.jobs.deletion import delete_provider, delete_tenant
+from tasks.jobs.export import (
+    OUTPUT_FORMATS_MAPPING,
+    _compress_output_files,
+    _generate_output_directory,
+    _upload_to_s3,
+)
 from tasks.jobs.scan import aggregate_findings, perform_prowler_scan
-from tasks.utils import get_next_execution_datetime
+from tasks.utils import batched, get_next_execution_datetime

 from api.db_utils import rls_transaction
 from api.decorators import set_tenant
-from api.models import Scan, StateChoices
+from api.models import Finding, Provider, Scan, ScanSummary, StateChoices
+from api.utils import initialize_prowler_provider
+from prowler.lib.outputs.finding import Finding as FindingOutput
+
+logger = get_task_logger(__name__)


@shared_task(base=RLSTask, name="provider-connection-check")
@@ -68,13 +82,20 @@ def perform_scan_task(
    Returns:
        dict: The result of the scan execution, typically including the status and results of the performed checks.
    """
-    return perform_prowler_scan(
+    result = perform_prowler_scan(
        tenant_id=tenant_id,
        scan_id=scan_id,
        provider_id=provider_id,
        checks_to_execute=checks_to_execute,
    )

+    chain(
+        perform_scan_summary_task.si(tenant_id, scan_id),
+        generate_outputs.si(scan_id, provider_id, tenant_id=tenant_id),
+    ).apply_async()
+
+    return result
+

@shared_task(base=RLSTask, bind=True, name="scan-perform-scheduled", queue="scans")
 def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
@@ -135,12 +156,11 @@ def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
                scheduler_task_id=periodic_task_instance.id,
            )

-    perform_scan_summary_task.apply_async(
-        kwargs={
-            "tenant_id": tenant_id,
-            "scan_id": str(scan_instance.id),
-        }
-    )
+    chain(
+        perform_scan_summary_task.si(tenant_id, scan_instance.id),
+        generate_outputs.si(str(scan_instance.id), provider_id, tenant_id=tenant_id),
+    ).apply_async()
+
    return result


@@ -152,3 +172,108 @@ def perform_scan_summary_task(tenant_id: str, scan_id: str):
@shared_task(name="tenant-deletion", queue="deletion")
 def delete_tenant_task(tenant_id: str):
    """
    Celery task registered as "tenant-deletion" on the "deletion" queue.

    Delegates to ``delete_tenant`` using ``tenant_id`` as the primary key and
    returns whatever that call returns.
    """
    return delete_tenant(pk=tenant_id)
+
+
+@shared_task(
+    base=RLSTask,
+    name="scan-report",
+    queue="scan-reports",
+)
+@set_tenant(keep_tenant=True)
+def generate_outputs(scan_id: str, provider_id: str, tenant_id: str):
+    """
+    Generate the report files of a scan, compress them and upload them to S3.
+
+    Findings of the scan are read from the database in batches of
+    ``DJANGO_FINDINGS_BATCH_SIZE`` items, and every batch is written to each
+    format declared in ``OUTPUT_FORMATS_MAPPING``. One writer instance per
+    output class is created on the first batch and reused for the following
+    ones; its ``close_file`` flag is ``True`` only while the final batch is
+    being written. The generated files are then compressed and handed to
+    ``_upload_to_s3``, and the scan's ``output_location`` is updated.
+
+    Args:
+        scan_id (str): The scan identifier.
+        provider_id (str): The provider id used to initialize the Prowler provider.
+        tenant_id (str): The tenant identifier.
+
+    Returns:
+        dict: ``{"upload": bool, "scan_id": str, "provider_id": str}``, where
+            ``upload`` tells whether ``_upload_to_s3`` returned a truthy location.
+    """
+    # Fetch the provider row once: it is needed both to initialize the
+    # Prowler provider and to read its UID
+    provider_instance = Provider.objects.get(id=provider_id)
+    prowler_provider = initialize_prowler_provider(provider_instance)
+    provider_uid = provider_instance.uid
+
+    # Generate and ensure the output directory exists
+    output_directory = _generate_output_directory(
+        DJANGO_TMP_OUTPUT_DIRECTORY, provider_uid, tenant_id, scan_id
+    )
+
+    # Define auxiliary variables
+    output_writers = {}
+    scan_summary = FindingOutput._transform_findings_stats(
+        ScanSummary.objects.filter(scan_id=scan_id)
+    )
+
+    # Retrieve findings queryset
+    findings_qs = Finding.all_objects.filter(scan_id=scan_id).order_by("uid")
+
+    # Process findings in batches
+    for batch, is_last_batch in batched(
+        findings_qs.iterator(), DJANGO_FINDINGS_BATCH_SIZE
+    ):
+        finding_outputs = [
+            FindingOutput.transform_api_finding(finding, prowler_provider)
+            for finding in batch
+        ]
+
+        # Generate output files
+        for mode, config in OUTPUT_FORMATS_MAPPING.items():
+            kwargs = dict(config.get("kwargs", {}))
+            if mode == "html":
+                kwargs["provider"] = prowler_provider
+                kwargs["stats"] = scan_summary
+
+            writer_class = config["class"]
+            writer = output_writers.get(writer_class)
+            if writer is None:
+                # First batch for this format: the constructor receives the findings
+                writer = writer_class(
+                    findings=finding_outputs,
+                    file_path=output_directory,
+                    file_extension=config["suffix"],
+                    from_cli=False,
+                )
+                output_writers[writer_class] = writer
+            else:
+                # Later batches: feed the new findings to the existing writer
+                writer.transform(finding_outputs)
+            writer.close_file = is_last_batch
+
+            # Write the current batch using the writer
+            writer.batch_write_data_to_file(**kwargs)
+
+            # TODO: Refactor the output classes to avoid this manual reset
+            writer._data = []
+
+    # Compress output files
+    output_directory = _compress_output_files(output_directory)
+
+    # Save to configured storage; a falsy result means nothing was uploaded
+    # and the local path is kept as the output location
+    uploaded_location = _upload_to_s3(tenant_id, output_directory, scan_id)
+    uploaded = bool(uploaded_location)
+
+    if uploaded:
+        output_directory = uploaded_location
+        # Remove the local files after upload
+        # NOTE(review): this removes the whole DJANGO_TMP_OUTPUT_DIRECTORY, not
+        # only this scan's files - confirm it cannot race with other report
+        # tasks that are still writing under the same root
+        rmtree(DJANGO_TMP_OUTPUT_DIRECTORY, ignore_errors=True)
+
+    # Update the scan instance with the output path
+    Scan.all_objects.filter(id=scan_id).update(output_location=output_directory)
+
+    logger.info(
+        "Scan output files generated, output location: %s", output_directory
+    )
+
+    return {
+        "upload": uploaded,
+        "scan_id": scan_id,
+        "provider_id": provider_id,
+    }
@@ -4,7 +4,7 @@ from unittest.mock import patch
 import pytest
 from django_celery_beat.models import IntervalSchedule, PeriodicTask
 from django_celery_results.models import TaskResult
-from tasks.utils import get_next_execution_datetime
+from tasks.utils import batched, get_next_execution_datetime


@pytest.mark.django_db
@@ -74,3 +74,29 @@ class TestGetNextExecutionDatetime:
            get_next_execution_datetime(
                task_id=task_result.task_id, provider_id="nonexistent"
            )
+
+
+class TestBatchedFunction:
+    """Pin the (batch, is_last_batch) pairs produced by ``batched``."""
+
+    def test_empty_iterable(self):
+        # No items at all: a single empty batch flagged as the last one
+        assert list(batched([], 3)) == [([], True)]
+
+    def test_exact_batches(self):
+        # Length is a multiple of the size: a trailing empty batch closes the stream
+        batches = list(batched([1, 2, 3, 4], 2))
+        assert batches == [([1, 2], False), ([3, 4], False), ([], True)]
+
+    def test_inexact_batches(self):
+        # The remainder travels in the final batch
+        batches = list(batched([1, 2, 3, 4, 5], 2))
+        assert batches == [([1, 2], False), ([3, 4], False), ([5], True)]
+
+    def test_batch_size_one(self):
+        # Every item is its own batch, followed by the empty closing batch
+        batches = list(batched([1, 2, 3], 1))
+        assert batches == [([1], False), ([2], False), ([3], False), ([], True)]
+
+    def test_batch_size_greater_than_length(self):
+        # Fewer items than the size: everything fits in the final batch
+        assert list(batched([1, 2, 3], 5)) == [([1, 2, 3], True)]
@@ -24,3 +24,27 @@ def get_next_execution_datetime(task_id: int, provider_id: str) -> datetime:
    )

    return current_scheduled_time + timedelta(**{interval.period: interval.every})
+
+
+def batched(iterable, batch_size):
+    """
+    Split an iterable into consecutive batches, flagging the final one.
+
+    Full batches are yielded with ``False`` as soon as they reach
+    ``batch_size`` items. After the iterable is exhausted, whatever is left
+    (possibly an empty list) is yielded once with ``True``, so consumers
+    always receive exactly one final batch.
+
+    Args:
+        iterable: An iterable source of items.
+        batch_size (int): The number of items per batch.
+
+    Yields:
+        tuple: A pair (batch, is_last_batch) where:
+            - batch (list): Up to ``batch_size`` items; the final batch holds
+              the remainder and may be empty.
+            - is_last_batch (bool): True only for the final batch.
+    """
+    pending = []
+    for element in iterable:
+        pending.append(element)
+        if len(pending) != batch_size:
+            continue
+        # The batch is full: emit it and start a fresh list, so the one
+        # already handed to the consumer is never mutated afterwards
+        yield pending, False
+        pending = []
+
+    # The source is exhausted: emit the remainder as the closing batch
+    yield pending, True