fix(api): redirect scan report and compliance downloads to presigned S3 URLs (#10927)

Josema Camacho
2026-04-29 13:19:19 +02:00
committed by GitHub
parent a981dc64a7
commit 5d90352a0f
5 changed files with 1793 additions and 125 deletions
+2 -1
@@ -7,6 +7,7 @@ All notable changes to the **Prowler API** are documented in this file.
### 🐞 Fixed
- Attack Paths: AWS scans no longer fail when enabled regions cannot be retrieved, and scans stuck in `scheduled` state are now cleaned up after the stale threshold [(#10917)](https://github.com/prowler-cloud/prowler/pull/10917)
- Scan report and compliance downloads now redirect to a presigned S3 URL instead of streaming through the API worker, preventing gunicorn timeouts on large files [(#10927)](https://github.com/prowler-cloud/prowler/pull/10927)
---
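The core of this fix can be sketched in a few lines. The snippet below is a condensed, illustrative version of the S3 branch described in the bullet above, not the actual view code; the helper name `redirect_to_report` and the already-resolved `bucket`/`key`/`filename` arguments are assumptions. It mirrors the HEAD-then-presign-then-302 flow this PR introduces.

```python
from django.http import HttpResponseRedirect

def redirect_to_report(s3_client, bucket, key, filename, content_type):
    # HEAD first so a missing report still surfaces as a 404 from the API
    # rather than as a broken presigned link.
    s3_client.head_object(Bucket=bucket, Key=key)
    url = s3_client.generate_presigned_url(
        "get_object",
        Params={
            "Bucket": bucket,
            "Key": key,
            "ResponseContentDisposition": f'attachment; filename="{filename}"',
            "ResponseContentType": content_type,
        },
        ExpiresIn=300,  # short-lived: five minutes
    )
    # The worker never reads the object body, so large reports cannot stall gunicorn.
    return HttpResponseRedirect(url)
```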
@@ -20,7 +21,7 @@ All notable changes to the **Prowler API** are documented in this file.
### 🔄 Changed
- Allows tenant owners to expel users from their organizations [(#10787)](https://github.com/prowler-cloud/prowler/pull/10787)
- `aggregate_findings`, `aggregate_attack_surface`, `aggregate_scan_resource_group_summaries` and `aggregate_scan_category_summaries` now upsert via `bulk_create(update_conflicts=True, ...)` instead of the prior `ignore_conflicts=True` / plain INSERT / `already backfilled` short-circuit. Re-runs triggered by the post-mute reaggregation pipeline no longer trip the `unique_*_per_scan` constraints nor silently drop updates, and are race-safe under concurrent writers (e.g. scan completion overlapping with a fresh mute rule) [(#10843)](https://github.com/prowler-cloud/prowler/pull/10843)
- Rename the scan-category and scan-resource-group summary aggregators from `backfill_*` to `aggregate_*` [(#10843)](https://github.com/prowler-cloud/prowler/pull/10843)
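As a rough illustration of the upsert pattern mentioned in the `bulk_create` entry above, here is a hypothetical sketch of the `update_conflicts=True` form (Django 4.1+ on PostgreSQL). The model, function, and field names are invented for the example; the real aggregators and schema in #10843 differ.

```python
from django.db import models

class CategorySummary(models.Model):
    # Hypothetical model: one summary row per (scan, category).
    scan_id = models.UUIDField()
    category = models.CharField(max_length=64)
    findings_count = models.IntegerField(default=0)

    class Meta:
        constraints = [
            models.UniqueConstraint(
                fields=["scan_id", "category"], name="unique_category_per_scan"
            )
        ]

def upsert_category_summaries(scan_id, rows):
    objs = [
        CategorySummary(scan_id=scan_id, category=r["category"], findings_count=r["count"])
        for r in rows
    ]
    # update_conflicts turns the insert into an upsert: rows hitting the
    # per-scan unique constraint are updated in place instead of being
    # skipped (ignore_conflicts) or raising IntegrityError, so re-runs and
    # concurrent writers stay consistent.
    CategorySummary.objects.bulk_create(
        objs,
        update_conflicts=True,
        unique_fields=["scan_id", "category"],
        update_fields=["findings_count"],
    )
```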
+2 -2
@@ -52,7 +52,7 @@ class ApiConfig(AppConfig):
"check_and_fix_socialaccount_sites_migration",
]
# Skip Neo4j initialization during tests, some Django commands, and Celery
# Skip eager Neo4j init for tests, some Django commands, and Celery (prefork pool: driver must stay lazy, no post_fork hook)
if getattr(settings, "TESTING", False) or (
len(sys.argv) > 1
and (
@@ -64,7 +64,7 @@ class ApiConfig(AppConfig):
)
):
logger.info(
"Skipping Neo4j initialization because tests, some Django commands or Celery"
"Skipping eager Neo4j init: tests, some Django commands, or Celery prefork pool (driver stays lazy)"
)
else:
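The comment about the Celery prefork pool refers to keeping the Neo4j driver lazy. A minimal sketch of that pattern, assuming the standard `neo4j` Python driver (this is not Prowler's actual module; the URI, credentials, and `get_driver` helper are illustrative):

```python
from neo4j import GraphDatabase

_driver = None  # intentionally not created at import / AppConfig.ready() time

def get_driver(uri="bolt://localhost:7687", auth=("neo4j", "password")):
    # Each Celery prefork child creates its own driver on first use, so no
    # connection opened in the parent process is inherited across fork().
    global _driver
    if _driver is None:
        _driver = GraphDatabase.driver(uri, auth=auth)
    return _driver
```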
File diff suppressed because it is too large
+59 -28
@@ -3841,9 +3841,14 @@ class TestScanViewSet:
"prowler-output-123_threatscore_report.pdf",
)
presigned_url = (
"https://test-bucket.s3.amazonaws.com/"
"tenant-id/scan-id/threatscore/prowler-output-123_threatscore_report.pdf"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
mock_s3_client = Mock()
mock_s3_client.list_objects_v2.return_value = {"Contents": [{"Key": pdf_key}]}
mock_s3_client.get_object.return_value = {"Body": io.BytesIO(b"pdf-bytes")}
mock_s3_client.generate_presigned_url.return_value = presigned_url
mock_env_str.return_value = bucket
mock_get_s3_client.return_value = mock_s3_client
@@ -3852,19 +3857,26 @@ class TestScanViewSet:
url = reverse("scan-threatscore", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == status.HTTP_200_OK
assert response["Content-Type"] == "application/pdf"
assert response["Content-Disposition"].endswith(
'"prowler-output-123_threatscore_report.pdf"'
)
assert response.content == b"pdf-bytes"
assert response.status_code == status.HTTP_302_FOUND
assert response["Location"] == presigned_url
mock_s3_client.list_objects_v2.assert_called_once()
mock_s3_client.get_object.assert_called_once_with(Bucket=bucket, Key=pdf_key)
mock_s3_client.generate_presigned_url.assert_called_once_with(
"get_object",
Params={
"Bucket": bucket,
"Key": pdf_key,
"ResponseContentDisposition": (
'attachment; filename="prowler-output-123_threatscore_report.pdf"'
),
"ResponseContentType": "application/pdf",
},
ExpiresIn=300,
)
def test_report_s3_success(self, authenticated_client, scans_fixture, monkeypatch):
"""
When output_location is an S3 URL and the S3 client returns the file successfully,
the view should return the ZIP file with HTTP 200 and proper headers.
When output_location is an S3 URL and the object exists,
the view should return a 302 redirect to a presigned S3 URL.
"""
scan = scans_fixture[0]
bucket = "test-bucket"
@@ -3878,22 +3890,33 @@ class TestScanViewSet:
type("env", (), {"str": lambda self, *args, **kwargs: "test-bucket"})(),
)
presigned_url = (
"https://test-bucket.s3.amazonaws.com/report.zip"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
class FakeS3Client:
def get_object(self, Bucket, Key):
def head_object(self, Bucket, Key):
assert Bucket == bucket
assert Key == key
return {"Body": io.BytesIO(b"s3 zip content")}
return {}
def generate_presigned_url(self, ClientMethod, Params, ExpiresIn):
assert ClientMethod == "get_object"
assert Params["Bucket"] == bucket
assert Params["Key"] == key
assert Params["ResponseContentDisposition"] == (
'attachment; filename="report.zip"'
)
assert ExpiresIn == 300
return presigned_url
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
url = reverse("scan-report", kwargs={"pk": scan.id})
response = authenticated_client.get(url)
assert response.status_code == 200
expected_filename = os.path.basename("report.zip")
content_disposition = response.get("Content-Disposition")
assert content_disposition.startswith('attachment; filename="')
assert f'filename="{expected_filename}"' in content_disposition
assert response.content == b"s3 zip content"
assert response.status_code == status.HTTP_302_FOUND
assert response["Location"] == presigned_url
def test_report_s3_success_no_local_files(
self, authenticated_client, scans_fixture, monkeypatch
@@ -4032,23 +4055,31 @@ class TestScanViewSet:
)
match_key = "path/compliance/mitre_attack_aws.csv"
presigned_url = (
"https://test-bucket.s3.amazonaws.com/path/compliance/mitre_attack_aws.csv"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Expires=300"
)
class FakeS3Client:
def list_objects_v2(self, Bucket, Prefix):
return {"Contents": [{"Key": match_key}]}
def get_object(self, Bucket, Key):
return {"Body": io.BytesIO(b"ignored")}
def generate_presigned_url(self, ClientMethod, Params, ExpiresIn):
assert ClientMethod == "get_object"
assert Params["Key"] == match_key
assert Params["ResponseContentDisposition"] == (
'attachment; filename="mitre_attack_aws.csv"'
)
assert ExpiresIn == 300
return presigned_url
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
framework = match_key.split("/")[-1].split(".")[0]
url = reverse("scan-compliance", kwargs={"pk": scan.id, "name": framework})
resp = authenticated_client.get(url)
assert resp.status_code == status.HTTP_200_OK
cd = resp["Content-Disposition"]
assert cd.startswith('attachment; filename="')
assert cd.endswith('filename="mitre_attack_aws.csv"')
assert resp.status_code == status.HTTP_302_FOUND
assert resp["Location"] == presigned_url
def test_compliance_s3_not_found(
self, authenticated_client, scans_fixture, monkeypatch
@@ -4251,8 +4282,8 @@ class TestScanViewSet:
scan.save()
fake_client = MagicMock()
fake_client.get_object.side_effect = ClientError(
{"Error": {"Code": "NoSuchKey"}}, "GetObject"
fake_client.head_object.side_effect = ClientError(
{"Error": {"Code": "NoSuchKey"}}, "HeadObject"
)
mock_get_s3_client.return_value = fake_client
@@ -4275,8 +4306,8 @@ class TestScanViewSet:
scan.save()
fake_client = MagicMock()
fake_client.get_object.side_effect = ClientError(
{"Error": {"Code": "AccessDenied"}}, "GetObject"
fake_client.head_object.side_effect = ClientError(
{"Error": {"Code": "AccessDenied"}}, "HeadObject"
)
mock_get_s3_client.return_value = fake_client
+111 -37
@@ -53,7 +53,7 @@ from django.db.models import (
)
from django.db.models.fields.json import KeyTextTransform
from django.db.models.functions import Cast, Coalesce, RowNumber
from django.http import HttpResponse, QueryDict
from django.http import HttpResponse, HttpResponseBase, HttpResponseRedirect, QueryDict
from django.shortcuts import redirect
from django.urls import reverse
from django.utils.dateparse import parse_date
@@ -2080,24 +2080,38 @@ class ScanViewSet(BaseRLSViewSet):
},
)
def _load_file(self, path_pattern, s3=False, bucket=None, list_objects=False):
def _load_file(
self,
path_pattern,
s3=False,
bucket=None,
list_objects=False,
content_type=None,
):
"""
Loads a binary file (e.g., ZIP or CSV) and returns its content and filename.
Resolve a report file location and return the bytes (filesystem) or a redirect (S3).
Depending on the input parameters, this method supports loading:
- From S3 using a direct key.
- From S3 by listing objects under a prefix and matching suffix.
- From the local filesystem using glob pattern matching.
- From S3 using a direct key; returns a 302 redirect to a short-lived presigned URL.
- From S3 by listing objects under a prefix and matching a suffix; returns a 302 redirect to a short-lived presigned URL.
- From the local filesystem using glob pattern matching; returns the file bytes.
The S3 branch never streams bytes through the worker; this prevents gunicorn
worker timeouts on large reports.
Args:
path_pattern (str): The key or glob pattern representing the file location.
s3 (bool, optional): Whether the file is stored in S3. Defaults to False.
bucket (str, optional): The name of the S3 bucket, required if `s3=True`. Defaults to None.
list_objects (bool, optional): If True and `s3=True`, list objects by prefix to find the file. Defaults to False.
content_type (str, optional): On the S3 branch, forwarded as `ResponseContentType`
so the presigned download advertises the same Content-Type the API used to send.
Ignored on the filesystem branch.
Returns:
tuple[bytes, str]: A tuple containing the file content as bytes and the filename if successful.
Response: A DRF `Response` object with an appropriate status and error detail if an error occurs.
tuple[bytes, str]: For the filesystem branch, the file content and filename.
HttpResponseRedirect: For the S3 branch on success, a 302 redirect to a presigned `GetObject` URL.
Response: For any error path, a DRF `Response` with an appropriate status and detail.
"""
if s3:
try:
@@ -2144,25 +2158,45 @@ class ScanViewSet(BaseRLSViewSet):
# path_pattern here is a prefix; the compliance view builds the correct suffix match beforehand
key = keys[0]
else:
# path_pattern is exact key
# path_pattern is exact key; HEAD before presigning to preserve the 404 contract.
key = path_pattern
try:
s3_obj = client.get_object(Bucket=bucket, Key=key)
except ClientError as e:
code = e.response.get("Error", {}).get("Code")
if code == "NoSuchKey":
try:
client.head_object(Bucket=bucket, Key=key)
except ClientError as e:
code = e.response.get("Error", {}).get("Code")
if code in ("NoSuchKey", "404"):
return Response(
{
"detail": "The scan has no reports, or the report generation task has not started yet."
},
status=status.HTTP_404_NOT_FOUND,
)
return Response(
{
"detail": "The scan has no reports, or the report generation task has not started yet."
},
status=status.HTTP_404_NOT_FOUND,
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
return Response(
{"detail": "There is a problem with credentials."},
status=status.HTTP_403_FORBIDDEN,
)
content = s3_obj["Body"].read()
filename = os.path.basename(key)
# escape quotes and strip CR/LF so a malformed key cannot break out of the header
safe_filename = (
filename.replace("\\", "\\\\")
.replace('"', '\\"')
.replace("\r", "")
.replace("\n", "")
)
params = {
"Bucket": bucket,
"Key": key,
"ResponseContentDisposition": f'attachment; filename="{safe_filename}"',
}
if content_type:
params["ResponseContentType"] = content_type
url = client.generate_presigned_url(
"get_object",
Params=params,
ExpiresIn=300,
)
return HttpResponseRedirect(url)
else:
files = glob.glob(path_pattern)
if not files:
@@ -2205,12 +2239,16 @@ class ScanViewSet(BaseRLSViewSet):
bucket = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET", "")
key_prefix = scan.output_location.removeprefix(f"s3://{bucket}/")
loader = self._load_file(
key_prefix, s3=True, bucket=bucket, list_objects=False
key_prefix,
s3=True,
bucket=bucket,
list_objects=False,
content_type="application/x-zip-compressed",
)
else:
loader = self._load_file(scan.output_location, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2248,13 +2286,19 @@ class ScanViewSet(BaseRLSViewSet):
prefix = os.path.join(
os.path.dirname(key_prefix), "compliance", f"{name}.csv"
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="text/csv",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "compliance", f"*_{name}.csv")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2287,13 +2331,19 @@ class ScanViewSet(BaseRLSViewSet):
"cis",
"*_cis_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "cis", "*_cis_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2327,13 +2377,19 @@ class ScanViewSet(BaseRLSViewSet):
"threatscore",
"*_threatscore_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "threatscore", "*_threatscore_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2367,13 +2423,19 @@ class ScanViewSet(BaseRLSViewSet):
"ens",
"*_ens_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "ens", "*_ens_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2406,13 +2468,19 @@ class ScanViewSet(BaseRLSViewSet):
"nis2",
"*_nis2_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "nis2", "*_nis2_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
@@ -2445,13 +2513,19 @@ class ScanViewSet(BaseRLSViewSet):
"csa",
"*_csa_report.pdf",
)
loader = self._load_file(prefix, s3=True, bucket=bucket, list_objects=True)
loader = self._load_file(
prefix,
s3=True,
bucket=bucket,
list_objects=True,
content_type="application/pdf",
)
else:
base = os.path.dirname(scan.output_location)
pattern = os.path.join(base, "csa", "*_csa_report.pdf")
loader = self._load_file(pattern, s3=False)
if isinstance(loader, Response):
if isinstance(loader, HttpResponseBase):
return loader
content, filename = loader
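For API consumers, the practical change is that the report, compliance, CIS, ThreatScore, ENS, NIS2, and CSA download endpoints now answer with a 302 instead of streaming bytes. A hypothetical client-side sketch follows; the base URL, endpoint path, scan id, and token handling are illustrative, not the exact API contract.

```python
import requests

API = "https://api.example.com/api/v1"        # illustrative base URL
HEADERS = {"Authorization": "Bearer <token>"}  # illustrative auth

# Handle the redirect explicitly: the Location header carries a short-lived
# presigned S3 URL that needs no API credentials.
resp = requests.get(
    f"{API}/scans/<scan-id>/report", headers=HEADERS, allow_redirects=False
)

if resp.status_code == 302:
    download = requests.get(resp.headers["Location"], timeout=300)
    download.raise_for_status()
    with open("report.zip", "wb") as fh:
        fh.write(download.content)
```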