fix(api): redirect scan report and compliance downloads to presigned S3 URLs (#10927)

Josema Camacho
2026-04-29 13:19:19 +02:00
committed by GitHub
parent a981dc64a7
commit 5d90352a0f
5 changed files with 1793 additions and 125 deletions
+2 -1
@@ -7,6 +7,7 @@ All notable changes to the **Prowler API** are documented in this file.
### 🐞 Fixed
- Attack Paths: AWS scans no longer fail when enabled regions cannot be retrieved, and scans stuck in `scheduled` state are now cleaned up after the stale threshold [(#10917)](https://github.com/prowler-cloud/prowler/pull/10917)
- Scan report and compliance downloads now redirect to a presigned S3 URL instead of streaming through the API worker, preventing gunicorn timeouts on large files [(#10927)](https://github.com/prowler-cloud/prowler/pull/10927)
---
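The core of this fix can be sketched in a few lines. The snippet below is a condensed, illustrative version of the S3 branch described in the bullet above, not the actual view code; the helper name `redirect_to_report` and the already-resolved `bucket`/`key`/`filename` arguments are assumptions. It mirrors the HEAD-then-presign-then-302 flow this PR introduces.

```python
from django.http import HttpResponseRedirect

def redirect_to_report(s3_client, bucket, key, filename, content_type):
    # HEAD first so a missing report still surfaces as a 404 from the API
    # rather than as a broken presigned link.
    s3_client.head_object(Bucket=bucket, Key=key)
    url = s3_client.generate_presigned_url(
        "get_object",
        Params={
            "Bucket": bucket,
            "Key": key,
            "ResponseContentDisposition": f'attachment; filename="{filename}"',
            "ResponseContentType": content_type,
        },
        ExpiresIn=300,  # short-lived: five minutes
    )
    # The worker never reads the object body, so large reports cannot stall gunicorn.
    return HttpResponseRedirect(url)
```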
@@ -20,7 +21,7 @@ All notable changes to the **Prowler API** are documented in this file.
### 🔄 Changed
- Allows tenant owners to expel users from their organizations [(#10787)](https://github.com/prowler-cloud/prowler/pull/10787)
- `aggregate_findings`, `aggregate_attack_surface`, `aggregate_scan_resource_group_summaries` and `aggregate_scan_category_summaries` now upsert via `bulk_create(update_conflicts=True, ...)` instead of the prior `ignore_conflicts=True` / plain INSERT / `already backfilled` short-circuit. Re-runs triggered by the post-mute reaggregation pipeline no longer trip the `unique_*_per_scan` constraints nor silently drop updates, and are race-safe under concurrent writers (e.g. scan completion overlapping with a fresh mute rule) [(#10843)](https://github.com/prowler-cloud/prowler/pull/10843)
- Rename the scan-category and scan-resource-group summary aggregators from `backfill_*` to `aggregate_*` [(#10843)](https://github.com/prowler-cloud/prowler/pull/10843)
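As a rough illustration of the upsert pattern mentioned in the `bulk_create` entry above, here is a hypothetical sketch of the `update_conflicts=True` form (Django 4.1+ on PostgreSQL). The model, function, and field names are invented for the example; the real aggregators and schema in #10843 differ.

```python
from django.db import models

class CategorySummary(models.Model):
    # Hypothetical model: one summary row per (scan, category).
    scan_id = models.UUIDField()
    category = models.CharField(max_length=64)
    findings_count = models.IntegerField(default=0)

    class Meta:
        constraints = [
            models.UniqueConstraint(
                fields=["scan_id", "category"], name="unique_category_per_scan"
            )
        ]

def upsert_category_summaries(scan_id, rows):
    objs = [
        CategorySummary(scan_id=scan_id, category=r["category"], findings_count=r["count"])
        for r in rows
    ]
    # update_conflicts turns the insert into an upsert: rows hitting the
    # per-scan unique constraint are updated in place instead of being
    # skipped (ignore_conflicts) or raising IntegrityError, so re-runs and
    # concurrent writers stay consistent.
    CategorySummary.objects.bulk_create(
        objs,
        update_conflicts=True,
        unique_fields=["scan_id", "category"],
        update_fields=["findings_count"],
    )
```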
+2 -2
@@ -52,7 +52,7 @@ class ApiConfig(AppConfig):
"check_and_fix_socialaccount_sites_migration",
]
# Skip Neo4j initialization during tests, some Django commands, and Celery
# Skip eager Neo4j init for tests, some Django commands, and Celery (prefork pool: driver must stay lazy, no post_fork hook)
if getattr(settings, "TESTING", False) or (
len(sys.argv) > 1
and (
@@ -64,7 +64,7 @@ class ApiConfig(AppConfig):
)
):
logger.info(
"Skipping Neo4j initialization because tests, some Django commands or Celery"
"Skipping eager Neo4j init: tests, some Django commands, or Celery prefork pool (driver stays lazy)"
)
else:
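The comment about the Celery prefork pool refers to keeping the Neo4j driver lazy. A minimal sketch of that pattern, assuming the standard `neo4j` Python driver (this is not Prowler's actual module; the URI, credentials, and `get_driver` helper are illustrative):

```python
from neo4j import GraphDatabase

_driver = None  # intentionally not created at import / AppConfig.ready() time

def get_driver(uri="bolt://localhost:7687", auth=("neo4j", "password")):
    # Each Celery prefork child creates its own driver on first use, so no
    # connection opened in the parent process is inherited across fork().
    global _driver
    if _driver is None:
        _driver = GraphDatabase.driver(uri, auth=auth)
    return _driver
```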
File diff suppressed because it is too large
+59 -28
@@ -3841,9 +3841,14 @@ class TestScanViewSet:
"prowler-output-123_threatscore_report.pdf",
)
presigned_url = (
"https://test-bucket.s3.amazonaws.com/"
"tenant-id/scan-id/threatscore/prowler-output-123_threatscore_report.pdf"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
mock_s3_client = Mock()
mock_s3_client.list_objects_v2.return_value = {"Contents": [{"Key": pdf_key}]}
mock_s3_client.get_object.return_value = {"Body": io.BytesIO(b"pdf-bytes")}
mock_s3_client.generate_presigned_url.return_value = presigned_url
mock_env_str.return_value = bucket
mock_get_s3_client.return_value = mock_s3_client
@@ -3852,19 +3857,26 @@ class TestScanViewSet:
url = reverse("scan-threatscore", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["Content-Type"] == "application/pdf"
assert response["Content-Disposition"].endswith(
'"prowler-output-123_threatscore_report.pdf"'
)
assert response.content == b"pdf-bytes"
assert response.status_code == status.HTTP_302_FOUND
assert response["Location"] == presigned_url
mock_s3_client.list_objects_v2.assert_called_once()
mock_s3_client.get_object.assert_called_once_with(Bucket=bucket, Key=pdf_key)
mock_s3_client.generate_presigned_url.assert_called_once_with(
"get_object",
Params={
"Bucket": bucket,
"Key": pdf_key,
"ResponseContentDisposition": (
'attachment; filename="prowler-output-123_threatscore_report.pdf"'
),
"ResponseContentType": "application/pdf",
},
ExpiresIn=300,
)
def test_report_s3_success(self, authenticated_client, scans_fixture, monkeypatch):
"""
When output_location is an S3 URL and the S3 client returns the file successfully,
the view should return the ZIP file with HTTP 200 and proper headers.
When output_location is an S3 URL and the object exists,
the view should return a 302 redirect to a presigned S3 URL.
"""
scan = scans_fixture[0]
bucket = "test-bucket"
@@ -3878,22 +3890,33 @@ class TestScanViewSet:
type("env", (), {"str": lambda self, *args, **kwargs: "test-bucket"})(),
)
presigned_url = (
"https://test-bucket.s3.amazonaws.com/report.zip"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
class FakeS3Client:
def get_object(self, Bucket, Key):
def head_object(self, Bucket, Key):
assert Bucket == bucket
assert Key == key
return {"Body": io.BytesIO(b"s3 zip content")}
return {}
def generate_presigned_url(self, ClientMethod, Params, ExpiresIn):
assert ClientMethod == "get_object"
assert Params["Bucket"] == bucket
assert Params["Key"] == key
assert Params["ResponseContentDisposition"] == (
'attachment; filename="report.zip"'
)
assert ExpiresIn == 300
return presigned_url
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == 200
expected_filename = os.path.basename("report.zip")
content_disposition = response.get("Content-Disposition")
assert content_disposition.startswith('attachment; filename="')
assert f'filename="{expected_filename}"' in content_disposition
assert response.content == b"s3 zip content"
assert response.status_code == status.HTTP_302_FOUND
assert response["Location"] == presigned_url
def test_report_s3_success_no_local_files(
self, authenticated_client, scans_fixture, monkeypatch
@@ -4032,23 +4055,31 @@ class TestScanViewSet:
)
match_key = "path/compliance/mitre_attack_aws.csv"
presigned_url = (
"https://test-bucket.s3.amazonaws.com/path/compliance/mitre_attack_aws.csv"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
class FakeS3Client:
def list_objects_v2(self, Bucket, Prefix):
return {"Contents": [{"Key": match_key}]}
def get_object(self, Bucket, Key):
return {"Body": io.BytesIO(b"ignored")}
def generate_presigned_url(self, ClientMethod, Params, ExpiresIn):
assert ClientMethod == "get_object"
assert Params["Key"] == match_key
assert Params["ResponseContentDisposition"] == (
'attachment; filename="mitre_attack_aws.csv"'
)
assert ExpiresIn == 300
return presigned_url
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
framework = match_key.split("/")[-1].split(".")[0]
url = reverse("scan-compliance", kwargs={"pk": scan.id, "name": framework})
resp = authenticated_client.get(url)
assert resp.status_code == status.HTTP_200_OK
cd = resp["Content-Disposition"]
assert cd.startswith('attachment; filename="')
assert cd.endswith('filename="mitre_attack_aws.csv"')
assert resp.status_code == status.HTTP_302_FOUND
assert resp["Location"] == presigned_url
def test_compliance_s3_not_found(
self, authenticated_client, scans_fixture, monkeypatch
@@ -4251,8 +4282,8 @@ class TestScanViewSet:
scan.save()
fake_client = MagicMock()
fake_client.get_object.side_effect = ClientError(
{"Error": {"Code": "NoSuchKey"}}, "GetObject"
fake_client.head_object.side_effect = ClientError(
{"Error": {"Code": "NoSuchKey"}}, "HeadObject"
)
mock_get_s3_client.return_value = fake_client
@@ -4275,8 +4306,8 @@ class TestScanViewSet:
scan.save()
fake_client = MagicMock()
fake_client.get_object.side_effect = ClientError(
{"Error": {"Code": "AccessDenied"}}, "GetObject"
fake_client.head_object.side_effect = ClientError(
{"Error": {"Code": "AccessDenied"}}, "HeadObject"
)
mock_get_s3_client.return_value = fake_client
+111 -37
@@ -53,7 +53,7 @@ from django.db.models import (
)
from django.db.models.fields.json import KeyTextTransform
from django.db.models.functions import Cast, Coalesce, RowNumber
from django.http import HttpResponse, QueryDict
from django.http import HttpResponse, HttpResponseBase, HttpResponseRedirect, QueryDict
from django.shortcuts import redirect
from django.urls import reverse
from django.utils.dateparse import parse_date
@@ -2080,24 +2080,38 @@ class ScanViewSet(BaseRLSViewSet):
},
)
def _load_file(self, path_pattern, s3=False, bucket=None, list_objects=False):
def _load_file(
self,
path_pattern,
s3=False,
bucket=None,
list_objects=False,
content_type=None,
):
"""
Loads a binary file (e.g., ZIP or CSV) and returns its content and filename.
Resolve a report file location and return the bytes (filesystem) or a redirect (S3).
Depending on the input parameters, this method supports loading:
- From S3 using a direct key.
- From S3 by listing objects under a prefix and matching suffix.
- From the local filesystem using glob pattern matching.
- From S3 using a direct key; returns a 302 redirect to a short-lived presigned URL.
- From S3 by listing objects under a prefix and matching a suffix; returns a 302 redirect to a short-lived presigned URL.
- From the local filesystem using glob pattern matching; returns the file bytes.
The S3 branch never streams bytes through the worker; this prevents gunicorn
worker timeouts on large reports.
Args:
path_pattern (str): The key or glob pattern representing the file location.
s3 (bool, optional): Whether the file is stored in S3. Defaults to False.
bucket (str, optional): The name of the S3 bucket, required if `s3=True`. Defaults to None.
list_objects (bool, optional): If True and `s3=True`, list objects by prefix to find the file. Defaults to False.
content_type (str, optional): On the S3 branch, forwarded as `ResponseContentType`
so the presigned download advertises the same Content-Type the API used to send.
Ignored on the filesystem branch.
Returns:
tuple[bytes, str]: A tuple containing the file content as bytes and the filename if successful.
Response: A DRF `Response` object with an appropriate status and error detail if an error occurs.
tuple[bytes, str]: For the filesystem branch, the file content and filename.
HttpResponseRedirect: For the S3 branch on success, a 302 redirect to a presigned `GetObject` URL.
Response: For any error path, a DRF `Response` with an appropriate status and detail.
"""
if s3:
try:
@@ -2144,25 +2158,45 @@ class ScanViewSet(BaseRLSViewSet):
# path_pattern here is a prefix; the compliance view builds the correct suffix match beforehand
key = keys[0]
else:
# path_pattern is exact key
# path_pattern is exact key; HEAD before presigning to preserve the 404 contract.
key = path_pattern
try:
s3_obj = client.get_object(Bucket=bucket, Key=key)
except ClientError as e:
code = e.response.get("Error", {}).get("Code")
if code == "NoSuchKey":
try:
client.head_object(Bucket=bucket, Key=key)
except ClientError as e:
code = e.response.get("Error", {}).get("Code")
if code in ("NoSuchKey", "404"):
return Response(
{
"detail": "The scan has no reports, or the report generation task has not started yet."
},
status=status.HTTP_404_NOT_FOUND,
)
return Response(
{
"detail": "The scan has no reports, or the report generation task has not started yet."
},
status=status.HTTP_404_NOT_FOUND,
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
return Response(
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
content = s3_obj["Body"].read()
filename = os.path.basename(key)
# escape quotes and strip CR/LF so a malformed key cannot break out of the header
safe_filename = (
filename.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\r", "")
.replace("\n", "")
)
params = {
"Bucket": bucket,
"Key": key,
"ResponseContentDisposition": f'attachment; filename="{safe_filename}"',
}
if content_type:
params["ResponseContentType"] = content_type
url = client.generate_presigned_url(
"get_object",
Params=params,
ExpiresIn=300,
)
return HttpResponseRedirect(url)
else:
files = glob.glob(path_pattern)
if not files:
@@ -2205,12 +2239,16 @@ class ScanViewSet(BaseRLSViewSet):
bucket = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET", "")
key_prefix = scan.output_location.removeprefix(f"s3://{bucket}/")
loader = self._load_file(
key_prefix, s3=True, bucket=bucket, list_objects=False
key_prefix,
s3=True,
bucket=bucket,
list_objects=False,
content_type="application/x-zip-compressed",
)
else:
loader = self._load_file(scan.output_location, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2248,13 +2286,19 @@ class ScanViewSet(BaseRLSViewSet):
prefix = os.path.join(
os.path.dirname(key_prefix), "compliance", f"{name}.csv"
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="text/csv",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "compliance", f"*_{name}.csv")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2287,13 +2331,19 @@ class ScanViewSet(BaseRLSViewSet):
"cis",
"*_cis_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "cis", "*_cis_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2327,13 +2377,19 @@ class ScanViewSet(BaseRLSViewSet):
"threatscore",
"*_threatscore_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "threatscore", "*_threatscore_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2367,13 +2423,19 @@ class ScanViewSet(BaseRLSViewSet):
"ens",
"*_ens_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "ens", "*_ens_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2406,13 +2468,19 @@ class ScanViewSet(BaseRLSViewSet):
"nis2",
"*_nis2_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "nis2", "*_nis2_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2445,13 +2513,19 @@ class ScanViewSet(BaseRLSViewSet):
"csa",
"*_csa_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "csa", "*_csa_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
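For API consumers, the practical change is that the report, compliance, CIS, ThreatScore, ENS, NIS2, and CSA download endpoints now answer with a 302 instead of streaming bytes. A hypothetical client-side sketch follows; the base URL, endpoint path, scan id, and token handling are illustrative, not the exact API contract.

```python
import requests

API = "https://api.example.com/api/v1"        # illustrative base URL
HEADERS = {"Authorization": "Bearer <token>"}  # illustrative auth

# Handle the redirect explicitly: the Location header carries a short-lived
# presigned S3 URL that needs no API credentials.
resp = requests.get(
    f"{API}/scans/<scan-id>/report", headers=HEADERS, allow_redirects=False
)

if resp.status_code == 302:
    download = requests.get(resp.headers["Location"], timeout=300)
    download.raise_for_status()
    with open("report.zip", "wb") as fh:
        fh.write(download.content)
```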