mirror of
https://github.com/prowler-cloud/prowler.git
synced 2026-03-22 03:08:23 +00:00
feat(export): add API export system (#6878)
This commit is contained in:
committed by
GitHub
parent
c4528200b0
commit
669ec74e67
@@ -8,6 +8,7 @@ All notable changes to the **Prowler API** are documented in this file.
|
||||
|
||||
### Added
|
||||
- Social login integration with Google and GitHub [(#6906)](https://github.com/prowler-cloud/prowler/pull/6906)
|
||||
- Add API scan report system, now all scans launched from the API will generate a compressed file with the report in OCSF, CSV and HTML formats [(#6878)](https://github.com/prowler-cloud/prowler/pull/6878).
|
||||
- Configurable Sentry integration [(#6874)](https://github.com/prowler-cloud/prowler/pull/6874)
|
||||
|
||||
### Changed
|
||||
|
||||
@@ -28,7 +28,7 @@ start_prod_server() {
|
||||
|
||||
start_worker() {
|
||||
echo "Starting the worker..."
|
||||
poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,deletion -E --max-tasks-per-child 1
|
||||
poetry run python -m celery -A config.celery worker -l "${DJANGO_LOGGING_LEVEL:-info}" -Q celery,scans,scan-reports,deletion -E --max-tasks-per-child 1
|
||||
}
|
||||
|
||||
start_worker_beat() {
|
||||
|
||||
@@ -7,7 +7,7 @@ from rest_framework_json_api.serializers import ValidationError
|
||||
from api.db_utils import POSTGRES_TENANT_VAR, SET_CONFIG_QUERY
|
||||
|
||||
|
||||
def set_tenant(func):
|
||||
def set_tenant(func=None, *, keep_tenant=False):
|
||||
"""
|
||||
Decorator to set the tenant context for a Celery task based on the provided tenant_id.
|
||||
|
||||
@@ -40,20 +40,29 @@ def set_tenant(func):
|
||||
# The tenant context will be set before the task logic executes.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
@transaction.atomic
|
||||
def wrapper(*args, **kwargs):
|
||||
try:
|
||||
tenant_id = kwargs.pop("tenant_id")
|
||||
except KeyError:
|
||||
raise KeyError("This task requires the tenant_id")
|
||||
try:
|
||||
uuid.UUID(tenant_id)
|
||||
except ValueError:
|
||||
raise ValidationError("Tenant ID must be a valid UUID")
|
||||
with connection.cursor() as cursor:
|
||||
cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
@transaction.atomic
|
||||
def wrapper(*args, **kwargs):
|
||||
try:
|
||||
if not keep_tenant:
|
||||
tenant_id = kwargs.pop("tenant_id")
|
||||
else:
|
||||
tenant_id = kwargs["tenant_id"]
|
||||
except KeyError:
|
||||
raise KeyError("This task requires the tenant_id")
|
||||
try:
|
||||
uuid.UUID(tenant_id)
|
||||
except ValueError:
|
||||
raise ValidationError("Tenant ID must be a valid UUID")
|
||||
with connection.cursor() as cursor:
|
||||
cursor.execute(SET_CONFIG_QUERY, [POSTGRES_TENANT_VAR, tenant_id])
|
||||
|
||||
return func(*args, **kwargs)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
return wrapper
|
||||
|
||||
if func is None:
|
||||
return decorator
|
||||
else:
|
||||
return decorator(func)
|
||||
|
||||
15
api/src/backend/api/migrations/0012_scan_report_output.py
Normal file
15
api/src/backend/api/migrations/0012_scan_report_output.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("api", "0011_findings_performance_indexes_parent"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name="scan",
|
||||
name="output_location",
|
||||
field=models.CharField(blank=True, max_length=200, null=True),
|
||||
),
|
||||
]
|
||||
@@ -414,6 +414,7 @@ class Scan(RowLevelSecurityProtectedModel):
|
||||
scheduler_task = models.ForeignKey(
|
||||
PeriodicTask, on_delete=models.CASCADE, null=True, blank=True
|
||||
)
|
||||
output_location = models.CharField(blank=True, null=True, max_length=200)
|
||||
# TODO: mutelist foreign key
|
||||
|
||||
class Meta(RowLevelSecurityProtectedModel.Meta):
|
||||
|
||||
@@ -4105,6 +4105,43 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ScanUpdateResponse'
|
||||
description: ''
|
||||
/api/v1/scans/{id}/report:
|
||||
get:
|
||||
operationId: scans_report_retrieve
|
||||
description: Returns a ZIP file containing the requested report
|
||||
summary: Download ZIP report
|
||||
parameters:
|
||||
- in: query
|
||||
name: fields[scan-reports]
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
enum:
|
||||
- id
|
||||
description: endpoint return only specific fields in the response on a per-type
|
||||
basis by including a fields[TYPE] query parameter.
|
||||
explode: false
|
||||
- in: path
|
||||
name: id
|
||||
schema:
|
||||
type: string
|
||||
format: uuid
|
||||
description: A UUID string identifying this scan.
|
||||
required: true
|
||||
tags:
|
||||
- Scan
|
||||
security:
|
||||
- jwtAuth: []
|
||||
responses:
|
||||
'200':
|
||||
description: Report obtained successfully
|
||||
'202':
|
||||
description: The task is in progress
|
||||
'403':
|
||||
description: There is a problem with credentials
|
||||
'404':
|
||||
description: The scan has no reports
|
||||
/api/v1/schedules/daily:
|
||||
post:
|
||||
operationId: schedules_daily_create
|
||||
|
||||
@@ -274,9 +274,10 @@ class TestValidateInvitation:
|
||||
expired_time = datetime.now(timezone.utc) - timedelta(days=1)
|
||||
invitation.expires_at = expired_time
|
||||
|
||||
with patch("api.utils.Invitation.objects.using") as mock_using, patch(
|
||||
"api.utils.datetime"
|
||||
) as mock_datetime:
|
||||
with (
|
||||
patch("api.utils.Invitation.objects.using") as mock_using,
|
||||
patch("api.utils.datetime") as mock_datetime,
|
||||
):
|
||||
mock_db = mock_using.return_value
|
||||
mock_db.get.return_value = invitation
|
||||
mock_datetime.now.return_value = datetime.now(timezone.utc)
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import glob
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from unittest.mock import ANY, Mock, patch
|
||||
|
||||
import jwt
|
||||
import pytest
|
||||
from botocore.exceptions import NoCredentialsError
|
||||
from conftest import API_JSON_CONTENT_TYPE, TEST_PASSWORD, TEST_USER
|
||||
from django.conf import settings
|
||||
from django.urls import reverse
|
||||
@@ -20,6 +24,7 @@ from api.models import (
|
||||
RoleProviderGroupRelationship,
|
||||
Scan,
|
||||
StateChoices,
|
||||
Task,
|
||||
User,
|
||||
UserRoleRelationship,
|
||||
)
|
||||
@@ -2079,9 +2084,9 @@ class TestScanViewSet:
|
||||
("started_at.gte", "2024-01-01", 3),
|
||||
("started_at.lte", "2024-01-01", 0),
|
||||
("trigger", Scan.TriggerChoices.MANUAL, 1),
|
||||
("state", StateChoices.AVAILABLE, 2),
|
||||
("state", StateChoices.AVAILABLE, 1),
|
||||
("state", StateChoices.FAILED, 1),
|
||||
("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 3),
|
||||
("state.in", f"{StateChoices.FAILED},{StateChoices.AVAILABLE}", 2),
|
||||
("trigger", Scan.TriggerChoices.MANUAL, 1),
|
||||
]
|
||||
),
|
||||
@@ -2156,6 +2161,159 @@ class TestScanViewSet:
|
||||
response = authenticated_client.get(reverse("scan-list"), {"sort": "invalid"})
|
||||
assert response.status_code == status.HTTP_400_BAD_REQUEST
|
||||
|
||||
def test_report_executing(self, authenticated_client, scans_fixture):
|
||||
"""
|
||||
When the scan is still executing (state == EXECUTING), the view should return
|
||||
the task data with HTTP 202 and a Content-Location header.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
scan.state = StateChoices.EXECUTING
|
||||
scan.save()
|
||||
|
||||
task = Task.objects.create(tenant_id=scan.tenant_id)
|
||||
dummy_task_data = {"id": str(task.id), "state": StateChoices.EXECUTING}
|
||||
|
||||
scan.task = task
|
||||
scan.save()
|
||||
|
||||
with patch(
|
||||
"api.v1.views.TaskSerializer",
|
||||
return_value=type("DummySerializer", (), {"data": dummy_task_data}),
|
||||
):
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == status.HTTP_202_ACCEPTED
|
||||
assert "Content-Location" in response
|
||||
assert dummy_task_data["id"] in response["Content-Location"]
|
||||
|
||||
def test_report_celery_task_executing(self, authenticated_client, scans_fixture):
|
||||
"""
|
||||
When the scan is not executing but a related celery task exists and is running,
|
||||
the view should return that task data with HTTP 202.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
scan.state = StateChoices.COMPLETED
|
||||
scan.output_location = "dummy"
|
||||
scan.save()
|
||||
|
||||
dummy_task = Task.objects.create(tenant_id=scan.tenant_id)
|
||||
dummy_task.id = "dummy-task-id"
|
||||
dummy_task_data = {"id": dummy_task.id, "state": StateChoices.EXECUTING}
|
||||
|
||||
with patch("api.v1.views.Task.objects.get", return_value=dummy_task), patch(
|
||||
"api.v1.views.TaskSerializer",
|
||||
return_value=type("DummySerializer", (), {"data": dummy_task_data}),
|
||||
):
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == status.HTTP_202_ACCEPTED
|
||||
assert "Content-Location" in response
|
||||
assert dummy_task_data["id"] in response["Content-Location"]
|
||||
|
||||
def test_report_no_output_location(self, authenticated_client, scans_fixture):
|
||||
"""
|
||||
If the scan does not have an output_location, the view should return a 404.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
scan.state = StateChoices.COMPLETED
|
||||
scan.output_location = ""
|
||||
scan.save()
|
||||
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == status.HTTP_404_NOT_FOUND
|
||||
assert response.json()["errors"]["detail"] == "The scan has no reports."
|
||||
|
||||
def test_report_s3_no_credentials(
|
||||
self, authenticated_client, scans_fixture, monkeypatch
|
||||
):
|
||||
"""
|
||||
When output_location is an S3 URL and get_s3_client() raises a credentials exception,
|
||||
the view should return HTTP 403 with the proper error message.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
bucket = "test-bucket"
|
||||
key = "report.zip"
|
||||
scan.output_location = f"s3://{bucket}/{key}"
|
||||
scan.state = StateChoices.COMPLETED
|
||||
scan.save()
|
||||
|
||||
def fake_get_s3_client():
|
||||
raise NoCredentialsError()
|
||||
|
||||
monkeypatch.setattr("api.v1.views.get_s3_client", fake_get_s3_client)
|
||||
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == status.HTTP_403_FORBIDDEN
|
||||
assert (
|
||||
response.json()["errors"]["detail"]
|
||||
== "There is a problem with credentials."
|
||||
)
|
||||
|
||||
def test_report_s3_success(self, authenticated_client, scans_fixture, monkeypatch):
|
||||
"""
|
||||
When output_location is an S3 URL and the S3 client returns the file successfully,
|
||||
the view should return the ZIP file with HTTP 200 and proper headers.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
bucket = "test-bucket"
|
||||
key = "report.zip"
|
||||
scan.output_location = f"s3://{bucket}/{key}"
|
||||
scan.state = StateChoices.COMPLETED
|
||||
scan.save()
|
||||
|
||||
monkeypatch.setattr(
|
||||
"api.v1.views.env", type("env", (), {"str": lambda self, key: bucket})()
|
||||
)
|
||||
|
||||
class FakeS3Client:
|
||||
def get_object(self, Bucket, Key):
|
||||
assert Bucket == bucket
|
||||
assert Key == key
|
||||
return {"Body": io.BytesIO(b"s3 zip content")}
|
||||
|
||||
monkeypatch.setattr("api.v1.views.get_s3_client", lambda: FakeS3Client())
|
||||
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == 200
|
||||
expected_filename = os.path.basename("report.zip")
|
||||
content_disposition = response.get("Content-Disposition")
|
||||
assert content_disposition.startswith('attachment; filename="')
|
||||
assert f'filename="{expected_filename}"' in content_disposition
|
||||
assert response.content == b"s3 zip content"
|
||||
|
||||
def test_report_local_file(
|
||||
self, authenticated_client, scans_fixture, tmp_path, monkeypatch
|
||||
):
|
||||
"""
|
||||
When output_location is a local file path, the view should read the file from disk
|
||||
and return it with proper headers.
|
||||
"""
|
||||
scan = scans_fixture[0]
|
||||
file_content = b"local zip file content"
|
||||
file_path = tmp_path / "report.zip"
|
||||
file_path.write_bytes(file_content)
|
||||
|
||||
scan.output_location = str(file_path)
|
||||
scan.state = StateChoices.COMPLETED
|
||||
scan.save()
|
||||
|
||||
monkeypatch.setattr(
|
||||
glob,
|
||||
"glob",
|
||||
lambda pattern: [str(file_path)] if pattern == str(file_path) else [],
|
||||
)
|
||||
|
||||
url = reverse("scan-report", kwargs={"pk": scan.id})
|
||||
response = authenticated_client.get(url)
|
||||
assert response.status_code == 200
|
||||
assert response.content == file_content
|
||||
content_disposition = response.get("Content-Disposition")
|
||||
assert content_disposition.startswith('attachment; filename="')
|
||||
assert f'filename="{file_path.name}"' in content_disposition
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
class TestTaskViewSet:
|
||||
|
||||
@@ -939,6 +939,14 @@ class ScanTaskSerializer(RLSSerializer):
|
||||
]
|
||||
|
||||
|
||||
class ScanReportSerializer(serializers.Serializer):
|
||||
id = serializers.CharField(source="scan")
|
||||
|
||||
class Meta:
|
||||
resource_name = "scan-reports"
|
||||
fields = ["id"]
|
||||
|
||||
|
||||
class ResourceTagSerializer(RLSSerializer):
|
||||
"""
|
||||
Serializer for the ResourceTag model
|
||||
|
||||
@@ -1,6 +1,11 @@
|
||||
import glob
|
||||
import os
|
||||
|
||||
from allauth.socialaccount.providers.github.views import GitHubOAuth2Adapter
|
||||
from allauth.socialaccount.providers.google.views import GoogleOAuth2Adapter
|
||||
from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
|
||||
from celery.result import AsyncResult
|
||||
from config.env import env
|
||||
from config.settings.social_login import (
|
||||
GITHUB_OAUTH_CALLBACK_URL,
|
||||
GOOGLE_OAUTH_CALLBACK_URL,
|
||||
@@ -12,6 +17,7 @@ from django.contrib.postgres.search import SearchQuery
|
||||
from django.db import transaction
|
||||
from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, Subquery, Sum
|
||||
from django.db.models.functions import Coalesce
|
||||
from django.http import HttpResponse
|
||||
from django.urls import reverse
|
||||
from django.utils.decorators import method_decorator
|
||||
from django.views.decorators.cache import cache_control
|
||||
@@ -38,11 +44,11 @@ from rest_framework.permissions import SAFE_METHODS
|
||||
from rest_framework_json_api.views import RelationshipView, Response
|
||||
from rest_framework_simplejwt.exceptions import InvalidToken, TokenError
|
||||
from tasks.beat import schedule_provider_scan
|
||||
from tasks.jobs.export import get_s3_client
|
||||
from tasks.tasks import (
|
||||
check_provider_connection_task,
|
||||
delete_provider_task,
|
||||
delete_tenant_task,
|
||||
perform_scan_summary_task,
|
||||
perform_scan_task,
|
||||
)
|
||||
|
||||
@@ -121,6 +127,7 @@ from api.v1.serializers import (
|
||||
RoleSerializer,
|
||||
RoleUpdateSerializer,
|
||||
ScanCreateSerializer,
|
||||
ScanReportSerializer,
|
||||
ScanSerializer,
|
||||
ScanUpdateSerializer,
|
||||
ScheduleDailyCreateSerializer,
|
||||
@@ -1116,6 +1123,18 @@ class ProviderViewSet(BaseRLSViewSet):
|
||||
request=ScanCreateSerializer,
|
||||
responses={202: OpenApiResponse(response=TaskSerializer)},
|
||||
),
|
||||
report=extend_schema(
|
||||
tags=["Scan"],
|
||||
summary="Download ZIP report",
|
||||
description="Returns a ZIP file containing the requested report",
|
||||
request=ScanReportSerializer,
|
||||
responses={
|
||||
200: OpenApiResponse(description="Report obtained successfully"),
|
||||
202: OpenApiResponse(description="The task is in progress"),
|
||||
403: OpenApiResponse(description="There is a problem with credentials"),
|
||||
404: OpenApiResponse(description="The scan has no reports"),
|
||||
},
|
||||
),
|
||||
)
|
||||
@method_decorator(CACHE_DECORATOR, name="list")
|
||||
@method_decorator(CACHE_DECORATOR, name="retrieve")
|
||||
@@ -1164,6 +1183,10 @@ class ScanViewSet(BaseRLSViewSet):
|
||||
return ScanCreateSerializer
|
||||
elif self.action == "partial_update":
|
||||
return ScanUpdateSerializer
|
||||
elif self.action == "report":
|
||||
if hasattr(self, "response_serializer_class"):
|
||||
return self.response_serializer_class
|
||||
return ScanReportSerializer
|
||||
return super().get_serializer_class()
|
||||
|
||||
def partial_update(self, request, *args, **kwargs):
|
||||
@@ -1181,6 +1204,93 @@ class ScanViewSet(BaseRLSViewSet):
|
||||
)
|
||||
return Response(data=read_serializer.data, status=status.HTTP_200_OK)
|
||||
|
||||
@action(detail=True, methods=["get"], url_name="report")
|
||||
def report(self, request, pk=None):
|
||||
scan_instance = self.get_object()
|
||||
|
||||
if scan_instance.state == StateChoices.EXECUTING:
|
||||
# If the scan is still running, return the task
|
||||
prowler_task = Task.objects.get(id=scan_instance.task.id)
|
||||
self.response_serializer_class = TaskSerializer
|
||||
output_serializer = self.get_serializer(prowler_task)
|
||||
return Response(
|
||||
data=output_serializer.data,
|
||||
status=status.HTTP_202_ACCEPTED,
|
||||
headers={
|
||||
"Content-Location": reverse(
|
||||
"task-detail", kwargs={"pk": output_serializer.data["id"]}
|
||||
)
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
output_celery_task = Task.objects.get(
|
||||
task_runner_task__task_name="scan-report",
|
||||
task_runner_task__task_args__contains=pk,
|
||||
)
|
||||
self.response_serializer_class = TaskSerializer
|
||||
output_serializer = self.get_serializer(output_celery_task)
|
||||
if output_serializer.data["state"] == StateChoices.EXECUTING:
|
||||
# If the task is still running, return the task
|
||||
return Response(
|
||||
data=output_serializer.data,
|
||||
status=status.HTTP_202_ACCEPTED,
|
||||
headers={
|
||||
"Content-Location": reverse(
|
||||
"task-detail", kwargs={"pk": output_serializer.data["id"]}
|
||||
)
|
||||
},
|
||||
)
|
||||
except Task.DoesNotExist:
|
||||
# If the task does not exist, it means that the task is removed from the database
|
||||
pass
|
||||
|
||||
output_location = scan_instance.output_location
|
||||
if not output_location:
|
||||
return Response(
|
||||
{"detail": "The scan has no reports."},
|
||||
status=status.HTTP_404_NOT_FOUND,
|
||||
)
|
||||
|
||||
if scan_instance.output_location.startswith("s3://"):
|
||||
try:
|
||||
s3_client = get_s3_client()
|
||||
except (ClientError, NoCredentialsError, ParamValidationError):
|
||||
return Response(
|
||||
{"detail": "There is a problem with credentials."},
|
||||
status=status.HTTP_403_FORBIDDEN,
|
||||
)
|
||||
|
||||
bucket_name = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET")
|
||||
key = output_location[len(f"s3://{bucket_name}/") :]
|
||||
try:
|
||||
s3_object = s3_client.get_object(Bucket=bucket_name, Key=key)
|
||||
except ClientError as e:
|
||||
error_code = e.response.get("Error", {}).get("Code")
|
||||
if error_code == "NoSuchKey":
|
||||
return Response(
|
||||
{"detail": "The scan has no reports."},
|
||||
status=status.HTTP_404_NOT_FOUND,
|
||||
)
|
||||
return Response(
|
||||
{"detail": "There is a problem with credentials."},
|
||||
status=status.HTTP_403_FORBIDDEN,
|
||||
)
|
||||
file_content = s3_object["Body"].read()
|
||||
filename = os.path.basename(output_location.split("/")[-1])
|
||||
else:
|
||||
zip_files = glob.glob(output_location)
|
||||
file_path = zip_files[0]
|
||||
with open(file_path, "rb") as f:
|
||||
file_content = f.read()
|
||||
filename = os.path.basename(file_path)
|
||||
|
||||
response = HttpResponse(
|
||||
file_content, content_type="application/x-zip-compressed"
|
||||
)
|
||||
response["Content-Disposition"] = f'attachment; filename="{filename}"'
|
||||
return response
|
||||
|
||||
def create(self, request, *args, **kwargs):
|
||||
input_serializer = self.get_serializer(data=request.data)
|
||||
input_serializer.is_valid(raise_exception=True)
|
||||
@@ -1195,10 +1305,6 @@ class ScanViewSet(BaseRLSViewSet):
|
||||
# Disabled for now
|
||||
# checks_to_execute=scan.scanner_args.get("checks_to_execute"),
|
||||
},
|
||||
link=perform_scan_summary_task.si(
|
||||
tenant_id=self.request.tenant_id,
|
||||
scan_id=str(scan.id),
|
||||
),
|
||||
)
|
||||
|
||||
scan.task_id = task.id
|
||||
|
||||
@@ -221,3 +221,18 @@ CACHE_STALE_WHILE_REVALIDATE = env.int("DJANGO_STALE_WHILE_REVALIDATE", 60)
|
||||
TESTING = False
|
||||
|
||||
FINDINGS_MAX_DAYS_IN_RANGE = env.int("DJANGO_FINDINGS_MAX_DAYS_IN_RANGE", 7)
|
||||
|
||||
|
||||
# API export settings
|
||||
DJANGO_TMP_OUTPUT_DIRECTORY = env.str(
|
||||
"DJANGO_TMP_OUTPUT_DIRECTORY", "/tmp/prowler_api_output"
|
||||
)
|
||||
DJANGO_FINDINGS_BATCH_SIZE = env.str("DJANGO_FINDINGS_BATCH_SIZE", 1000)
|
||||
|
||||
DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET = env.str("DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET", "")
|
||||
DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID = env.str("DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID", "")
|
||||
DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY = env.str(
|
||||
"DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY", ""
|
||||
)
|
||||
DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN = env.str("DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN", "")
|
||||
DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION = env.str("DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION", "")
|
||||
|
||||
@@ -486,7 +486,7 @@ def scans_fixture(tenants_fixture, providers_fixture):
|
||||
name="Scan 1",
|
||||
provider=provider,
|
||||
trigger=Scan.TriggerChoices.MANUAL,
|
||||
state=StateChoices.AVAILABLE,
|
||||
state=StateChoices.COMPLETED,
|
||||
tenant_id=tenant.id,
|
||||
started_at="2024-01-02T00:00:00Z",
|
||||
)
|
||||
|
||||
156
api/src/backend/tasks/jobs/export.py
Normal file
156
api/src/backend/tasks/jobs/export.py
Normal file
@@ -0,0 +1,156 @@
|
||||
import os
|
||||
import zipfile
|
||||
|
||||
import boto3
|
||||
import config.django.base as base
|
||||
from botocore.exceptions import ClientError, NoCredentialsError, ParamValidationError
|
||||
from celery.utils.log import get_task_logger
|
||||
from django.conf import settings
|
||||
|
||||
from prowler.config.config import (
|
||||
csv_file_suffix,
|
||||
html_file_suffix,
|
||||
json_ocsf_file_suffix,
|
||||
output_file_timestamp,
|
||||
)
|
||||
from prowler.lib.outputs.csv.csv import CSV
|
||||
from prowler.lib.outputs.html.html import HTML
|
||||
from prowler.lib.outputs.ocsf.ocsf import OCSF
|
||||
|
||||
logger = get_task_logger(__name__)
|
||||
|
||||
|
||||
# Predefined mapping for output formats and their configurations
|
||||
OUTPUT_FORMATS_MAPPING = {
|
||||
"csv": {
|
||||
"class": CSV,
|
||||
"suffix": csv_file_suffix,
|
||||
"kwargs": {},
|
||||
},
|
||||
"json-ocsf": {"class": OCSF, "suffix": json_ocsf_file_suffix, "kwargs": {}},
|
||||
"html": {"class": HTML, "suffix": html_file_suffix, "kwargs": {"stats": {}}},
|
||||
}
|
||||
|
||||
|
||||
def _compress_output_files(output_directory: str) -> str:
|
||||
"""
|
||||
Compress output files from all configured output formats into a ZIP archive.
|
||||
Args:
|
||||
output_directory (str): The directory where the output files are located.
|
||||
The function looks up all known suffixes in OUTPUT_FORMATS_MAPPING
|
||||
and compresses those files into a single ZIP.
|
||||
Returns:
|
||||
str: The full path to the newly created ZIP archive.
|
||||
"""
|
||||
zip_path = f"{output_directory}.zip"
|
||||
|
||||
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
|
||||
for suffix in [config["suffix"] for config in OUTPUT_FORMATS_MAPPING.values()]:
|
||||
zipf.write(
|
||||
f"{output_directory}{suffix}",
|
||||
f"output/{output_directory.split('/')[-1]}{suffix}",
|
||||
)
|
||||
|
||||
return zip_path
|
||||
|
||||
|
||||
def get_s3_client():
|
||||
"""
|
||||
Create and return a boto3 S3 client using AWS credentials from environment variables.
|
||||
|
||||
This function attempts to initialize an S3 client by reading the AWS access key, secret key,
|
||||
session token, and region from environment variables. It then validates the client by listing
|
||||
available S3 buckets. If an error occurs during this process (for example, due to missing or
|
||||
invalid credentials), it falls back to creating an S3 client without explicitly provided credentials,
|
||||
which may rely on other configuration sources (e.g., IAM roles).
|
||||
|
||||
Returns:
|
||||
boto3.client: A configured S3 client instance.
|
||||
|
||||
Raises:
|
||||
ClientError, NoCredentialsError, or ParamValidationError if both attempts to create a client fail.
|
||||
"""
|
||||
s3_client = None
|
||||
try:
|
||||
s3_client = boto3.client(
|
||||
"s3",
|
||||
aws_access_key_id=settings.DJANGO_OUTPUT_S3_AWS_ACCESS_KEY_ID,
|
||||
aws_secret_access_key=settings.DJANGO_OUTPUT_S3_AWS_SECRET_ACCESS_KEY,
|
||||
aws_session_token=settings.DJANGO_OUTPUT_S3_AWS_SESSION_TOKEN,
|
||||
region_name=settings.DJANGO_OUTPUT_S3_AWS_DEFAULT_REGION,
|
||||
)
|
||||
s3_client.list_buckets()
|
||||
except (ClientError, NoCredentialsError, ParamValidationError, ValueError):
|
||||
s3_client = boto3.client("s3")
|
||||
s3_client.list_buckets()
|
||||
|
||||
return s3_client
|
||||
|
||||
|
||||
def _upload_to_s3(tenant_id: str, zip_path: str, scan_id: str) -> str:
|
||||
"""
|
||||
Upload the specified ZIP file to an S3 bucket.
|
||||
If the S3 bucket environment variables are not configured,
|
||||
the function returns None without performing an upload.
|
||||
Args:
|
||||
tenant_id (str): The tenant identifier, used as part of the S3 key prefix.
|
||||
zip_path (str): The local file system path to the ZIP file to be uploaded.
|
||||
scan_id (str): The scan identifier, used as part of the S3 key prefix.
|
||||
Returns:
|
||||
str: The S3 URI of the uploaded file (e.g., "s3://<bucket>/<key>") if successful.
|
||||
None: If the required environment variables for the S3 bucket are not set.
|
||||
Raises:
|
||||
botocore.exceptions.ClientError: If the upload attempt to S3 fails for any reason.
|
||||
"""
|
||||
if not base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET:
|
||||
return
|
||||
|
||||
try:
|
||||
s3 = get_s3_client()
|
||||
s3_key = f"{tenant_id}/{scan_id}/{os.path.basename(zip_path)}"
|
||||
s3.upload_file(
|
||||
Filename=zip_path,
|
||||
Bucket=base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET,
|
||||
Key=s3_key,
|
||||
)
|
||||
return f"s3://{base.DJANGO_OUTPUT_S3_AWS_OUTPUT_BUCKET}/{s3_key}"
|
||||
except (ClientError, NoCredentialsError, ParamValidationError, ValueError) as e:
|
||||
logger.error(f"S3 upload failed: {str(e)}")
|
||||
|
||||
|
||||
def _generate_output_directory(
|
||||
output_directory, prowler_provider: object, tenant_id: str, scan_id: str
|
||||
) -> str:
|
||||
"""
|
||||
Generate a file system path for the output directory of a prowler scan.
|
||||
|
||||
This function constructs the output directory path by combining a base
|
||||
temporary output directory, the tenant ID, the scan ID, and details about
|
||||
the prowler provider along with a timestamp. The resulting path is used to
|
||||
store the output files of a prowler scan.
|
||||
|
||||
Note:
|
||||
This function depends on one external variable:
|
||||
- `output_file_timestamp`: A timestamp (as a string) used to uniquely identify the output.
|
||||
|
||||
Args:
|
||||
output_directory (str): The base output directory.
|
||||
prowler_provider (object): An identifier or descriptor for the prowler provider.
|
||||
Typically, this is a string indicating the provider (e.g., "aws").
|
||||
tenant_id (str): The unique identifier for the tenant.
|
||||
scan_id (str): The unique identifier for the scan.
|
||||
|
||||
Returns:
|
||||
str: The constructed file system path for the prowler scan output directory.
|
||||
|
||||
Example:
|
||||
>>> _generate_output_directory("/tmp", "aws", "tenant-1234", "scan-5678")
|
||||
'/tmp/tenant-1234/aws/scan-5678/prowler-output-2023-02-15T12:34:56'
|
||||
"""
|
||||
path = (
|
||||
f"{output_directory}/{tenant_id}/{scan_id}/prowler-output-"
|
||||
f"{prowler_provider}-{output_file_timestamp}"
|
||||
)
|
||||
os.makedirs("/".join(path.split("/")[:-1]), exist_ok=True)
|
||||
|
||||
return path
|
||||
@@ -1,14 +1,28 @@
|
||||
from celery import shared_task
|
||||
from shutil import rmtree
|
||||
|
||||
from celery import chain, shared_task
|
||||
from celery.utils.log import get_task_logger
|
||||
from config.celery import RLSTask
|
||||
from config.django.base import DJANGO_FINDINGS_BATCH_SIZE, DJANGO_TMP_OUTPUT_DIRECTORY
|
||||
from django_celery_beat.models import PeriodicTask
|
||||
from tasks.jobs.connection import check_provider_connection
|
||||
from tasks.jobs.deletion import delete_provider, delete_tenant
|
||||
from tasks.jobs.export import (
|
||||
OUTPUT_FORMATS_MAPPING,
|
||||
_compress_output_files,
|
||||
_generate_output_directory,
|
||||
_upload_to_s3,
|
||||
)
|
||||
from tasks.jobs.scan import aggregate_findings, perform_prowler_scan
|
||||
from tasks.utils import get_next_execution_datetime
|
||||
from tasks.utils import batched, get_next_execution_datetime
|
||||
|
||||
from api.db_utils import rls_transaction
|
||||
from api.decorators import set_tenant
|
||||
from api.models import Scan, StateChoices
|
||||
from api.models import Finding, Provider, Scan, ScanSummary, StateChoices
|
||||
from api.utils import initialize_prowler_provider
|
||||
from prowler.lib.outputs.finding import Finding as FindingOutput
|
||||
|
||||
logger = get_task_logger(__name__)
|
||||
|
||||
|
||||
@shared_task(base=RLSTask, name="provider-connection-check")
|
||||
@@ -68,13 +82,20 @@ def perform_scan_task(
|
||||
Returns:
|
||||
dict: The result of the scan execution, typically including the status and results of the performed checks.
|
||||
"""
|
||||
return perform_prowler_scan(
|
||||
result = perform_prowler_scan(
|
||||
tenant_id=tenant_id,
|
||||
scan_id=scan_id,
|
||||
provider_id=provider_id,
|
||||
checks_to_execute=checks_to_execute,
|
||||
)
|
||||
|
||||
chain(
|
||||
perform_scan_summary_task.si(tenant_id, scan_id),
|
||||
generate_outputs.si(scan_id, provider_id, tenant_id=tenant_id),
|
||||
).apply_async()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@shared_task(base=RLSTask, bind=True, name="scan-perform-scheduled", queue="scans")
|
||||
def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
|
||||
@@ -135,12 +156,11 @@ def perform_scheduled_scan_task(self, tenant_id: str, provider_id: str):
|
||||
scheduler_task_id=periodic_task_instance.id,
|
||||
)
|
||||
|
||||
perform_scan_summary_task.apply_async(
|
||||
kwargs={
|
||||
"tenant_id": tenant_id,
|
||||
"scan_id": str(scan_instance.id),
|
||||
}
|
||||
)
|
||||
chain(
|
||||
perform_scan_summary_task.si(tenant_id, scan_instance.id),
|
||||
generate_outputs.si(str(scan_instance.id), provider_id, tenant_id=tenant_id),
|
||||
).apply_async()
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@@ -152,3 +172,108 @@ def perform_scan_summary_task(tenant_id: str, scan_id: str):
|
||||
@shared_task(name="tenant-deletion", queue="deletion")
|
||||
def delete_tenant_task(tenant_id: str):
|
||||
return delete_tenant(pk=tenant_id)
|
||||
|
||||
|
||||
@shared_task(
|
||||
base=RLSTask,
|
||||
name="scan-report",
|
||||
queue="scan-reports",
|
||||
)
|
||||
@set_tenant(keep_tenant=True)
|
||||
def generate_outputs(scan_id: str, provider_id: str, tenant_id: str):
|
||||
"""
|
||||
Process findings in batches and generate output files in multiple formats.
|
||||
|
||||
This function retrieves findings associated with a scan, processes them
|
||||
in batches of 50, and writes each batch to the corresponding output files.
|
||||
It reuses output writer instances across batches, updates them with each
|
||||
batch of transformed findings, and uses a flag to indicate when the final
|
||||
batch is being processed. Finally, the output files are compressed and
|
||||
uploaded to S3.
|
||||
|
||||
Args:
|
||||
tenant_id (str): The tenant identifier.
|
||||
scan_id (str): The scan identifier.
|
||||
provider_id (str): The provider_id id to be used in generating outputs.
|
||||
"""
|
||||
# Initialize the prowler provider
|
||||
prowler_provider = initialize_prowler_provider(Provider.objects.get(id=provider_id))
|
||||
|
||||
# Get the provider UID
|
||||
provider_uid = Provider.objects.get(id=provider_id).uid
|
||||
|
||||
# Generate and ensure the output directory exists
|
||||
output_directory = _generate_output_directory(
|
||||
DJANGO_TMP_OUTPUT_DIRECTORY, provider_uid, tenant_id, scan_id
|
||||
)
|
||||
|
||||
# Define auxiliary variables
|
||||
output_writers = {}
|
||||
scan_summary = FindingOutput._transform_findings_stats(
|
||||
ScanSummary.objects.filter(scan_id=scan_id)
|
||||
)
|
||||
|
||||
# Retrieve findings queryset
|
||||
findings_qs = Finding.all_objects.filter(scan_id=scan_id).order_by("uid")
|
||||
|
||||
# Process findings in batches
|
||||
for batch, is_last_batch in batched(
|
||||
findings_qs.iterator(), DJANGO_FINDINGS_BATCH_SIZE
|
||||
):
|
||||
finding_outputs = [
|
||||
FindingOutput.transform_api_finding(finding, prowler_provider)
|
||||
for finding in batch
|
||||
]
|
||||
|
||||
# Generate output files
|
||||
for mode, config in OUTPUT_FORMATS_MAPPING.items():
|
||||
kwargs = dict(config.get("kwargs", {}))
|
||||
if mode == "html":
|
||||
kwargs["provider"] = prowler_provider
|
||||
kwargs["stats"] = scan_summary
|
||||
|
||||
writer_class = config["class"]
|
||||
if writer_class in output_writers:
|
||||
writer = output_writers[writer_class]
|
||||
writer.transform(finding_outputs)
|
||||
writer.close_file = is_last_batch
|
||||
else:
|
||||
writer = writer_class(
|
||||
findings=finding_outputs,
|
||||
file_path=output_directory,
|
||||
file_extension=config["suffix"],
|
||||
from_cli=False,
|
||||
)
|
||||
writer.close_file = is_last_batch
|
||||
output_writers[writer_class] = writer
|
||||
|
||||
# Write the current batch using the writer
|
||||
writer.batch_write_data_to_file(**kwargs)
|
||||
|
||||
# TODO: Refactor the output classes to avoid this manual reset
|
||||
writer._data = []
|
||||
|
||||
# Compress output files
|
||||
output_directory = _compress_output_files(output_directory)
|
||||
|
||||
# Save to configured storage
|
||||
uploaded = _upload_to_s3(tenant_id, output_directory, scan_id)
|
||||
|
||||
if uploaded:
|
||||
output_directory = uploaded
|
||||
uploaded = True
|
||||
# Remove the local files after upload
|
||||
rmtree(DJANGO_TMP_OUTPUT_DIRECTORY, ignore_errors=True)
|
||||
else:
|
||||
uploaded = False
|
||||
|
||||
# Update the scan instance with the output path
|
||||
Scan.all_objects.filter(id=scan_id).update(output_location=output_directory)
|
||||
|
||||
logger.info(f"Scan output files generated, output location: {output_directory}")
|
||||
|
||||
return {
|
||||
"upload": uploaded,
|
||||
"scan_id": scan_id,
|
||||
"provider_id": provider_id,
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
from django_celery_beat.models import IntervalSchedule, PeriodicTask
|
||||
from django_celery_results.models import TaskResult
|
||||
from tasks.utils import get_next_execution_datetime
|
||||
from tasks.utils import batched, get_next_execution_datetime
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
@@ -74,3 +74,29 @@ class TestGetNextExecutionDatetime:
|
||||
get_next_execution_datetime(
|
||||
task_id=task_result.task_id, provider_id="nonexistent"
|
||||
)
|
||||
|
||||
|
||||
class TestBatchedFunction:
|
||||
def test_empty_iterable(self):
|
||||
result = list(batched([], 3))
|
||||
assert result == [([], True)]
|
||||
|
||||
def test_exact_batches(self):
|
||||
result = list(batched([1, 2, 3, 4], 2))
|
||||
expected = [([1, 2], False), ([3, 4], False), ([], True)]
|
||||
assert result == expected
|
||||
|
||||
def test_inexact_batches(self):
|
||||
result = list(batched([1, 2, 3, 4, 5], 2))
|
||||
expected = [([1, 2], False), ([3, 4], False), ([5], True)]
|
||||
assert result == expected
|
||||
|
||||
def test_batch_size_one(self):
|
||||
result = list(batched([1, 2, 3], 1))
|
||||
expected = [([1], False), ([2], False), ([3], False), ([], True)]
|
||||
assert result == expected
|
||||
|
||||
def test_batch_size_greater_than_length(self):
|
||||
result = list(batched([1, 2, 3], 5))
|
||||
expected = [([1, 2, 3], True)]
|
||||
assert result == expected
|
||||
|
||||
@@ -24,3 +24,27 @@ def get_next_execution_datetime(task_id: int, provider_id: str) -> datetime:
|
||||
)
|
||||
|
||||
return current_scheduled_time + timedelta(**{interval.period: interval.every})
|
||||
|
||||
|
||||
def batched(iterable, batch_size):
|
||||
"""
|
||||
Yield successive batches from an iterable.
|
||||
|
||||
Args:
|
||||
iterable: An iterable source of items.
|
||||
batch_size (int): The number of items per batch.
|
||||
|
||||
Yields:
|
||||
tuple: A pair (batch, is_last_batch) where:
|
||||
- batch (list): A list of items (with length equal to batch_size,
|
||||
except possibly for the last batch).
|
||||
- is_last_batch (bool): True if this is the final batch, False otherwise.
|
||||
"""
|
||||
batch = []
|
||||
for item in iterable:
|
||||
batch.append(item)
|
||||
if len(batch) == batch_size:
|
||||
yield batch, False
|
||||
batch = []
|
||||
|
||||
yield batch, True
|
||||
|
||||
Reference in New Issue
Block a user