feat(mcp): migrate documentation search from ReadTheDocs to Mintlify API (#8916)

2025-12-19 05:17:47 +00:00 · 2025-10-15 17:40:18 +02:00
parent db5bab51ae
commit ec75b5d0a3
6 changed files with 101 additions and 86 deletions
--- a/mcp_server/CHANGELOG.md
+++ b/mcp_server/CHANGELOG.md
@@ -13,3 +13,4 @@ All notable changes to the **Prowler MCP Server** are documented in this file.
 - Add new MCP Server for Prowler Documentation [(#8795)](https://github.com/prowler-cloud/prowler/pull/8795)
 - API key support for STDIO mode and enhanced HTTP mode authentication [(#8823)](https://github.com/prowler-cloud/prowler/pull/8823)
 - Add health check endpoint [(#8905)](https://github.com/prowler-cloud/prowler/pull/8905)
+- Update Prowler Documentation MCP Server to use Mintlify API [(#8915)](https://github.com/prowler-cloud/prowler/pull/8915)
--- a/mcp_server/README.md
+++ b/mcp_server/README.md
@@ -2,16 +2,18 @@

 > ⚠️ **Preview Feature**: This MCP server is currently in preview and under active development. Features and functionality may change. We welcome your feedback—please report any issues on [GitHub](https://github.com/prowler-cloud/prowler/issues) or join our [Slack community](https://goto.prowler.com/slack) to discuss and share your thoughts.

-Access the entire Prowler ecosystem through the Model Context Protocol (MCP). This server provides two main capabilities:
+Access the entire Prowler ecosystem through the Model Context Protocol (MCP). This server provides three main capabilities:

 - **Prowler Cloud and Prowler App (Self-Managed)**: Full access to Prowler Cloud platform and Prowler Self-Managed for managing providers, running scans, and analyzing security findings
 - **Prowler Hub**: Access to Prowler's security checks, fixers, and compliance frameworks catalog
+- **Prowler Documentation**: Search and retrieve official Prowler documentation


 ## Requirements

 - Python 3.12+
 - Network access to `https://hub.prowler.com` (for Prowler Hub)
+- Network access to `https://prowler.mintlify.app` (for Prowler Documentation)
 - Network access to Prowler Cloud and Prowler App (Self-Managed) API (it can be Prowler Cloud API or self-hosted Prowler App API)
 - Prowler Cloud account credentials (for Prowler Cloud and Prowler App (Self-Managed) features)

@@ -169,6 +171,13 @@ All tools are exposed under the `prowler_hub` prefix.
 - `prowler_hub_list_providers`: List Prowler official providers and their services.
 - `prowler_hub_get_artifacts_count`: Return total artifact count (checks + frameworks).

+### Prowler Documentation
+
+All tools are exposed under the `prowler_docs` prefix.
+
+- `prowler_docs_search`: Search the official Prowler documentation using full-text search. Returns relevant documentation pages with highlighted snippets and relevance scores.
+- `prowler_docs_get_document`: Retrieve the full markdown content of a specific documentation file using the path from search results.
+
 ### Prowler Cloud and Prowler App (Self-Managed)

 All tools are exposed under the `prowler_app` prefix.
@@ -218,7 +227,7 @@ All tools are exposed under the `prowler_app` prefix.
 ### Prowler Cloud and Prowler App (Self-Managed) Authentication

 > [!IMPORTANT]
-> Authentication is not needed for using Prowler Hub features.
+> Authentication is not needed for using Prowler Hub or Prowler Documentation features.

 The Prowler MCP server supports different authentication in Prowler Cloud and Prowler App (Self-Managed) methods depending on the transport mode:

--- a/mcp_server/prowler_mcp_server/prowler_documentation/search_engine.py
+++ b/mcp_server/prowler_mcp_server/prowler_documentation/search_engine.py
@@ -1,7 +1,7 @@
-import urllib.parse
 from typing import List, Optional

-import requests
+import httpx
+from prowler_mcp_server import __version__
 from pydantic import BaseModel, Field


@@ -12,25 +12,51 @@ class SearchResult(BaseModel):
    title: str = Field(description="Document title")
    url: str = Field(description="Documentation URL")
    highlights: List[str] = Field(
-        description="Highlighted content snippets showing query matches with <span> tags",
+        description="Highlighted content snippets showing query matches with <mark><b> tags",
        default_factory=list,
    )
+    score: float = Field(
+        description="Relevance score for the search result", default=0.0
+    )


 class ProwlerDocsSearchEngine:
-    """Prowler documentation search using ReadTheDocs API."""
+    """Prowler documentation search using Mintlify API."""

    def __init__(self):
        """Initialize the search engine."""
-        self.api_base_url = "https://docs.prowler.com/_/api/v3/search/"
-        self.project_name = "prowler-prowler"
-        self.github_raw_base = (
-            "https://raw.githubusercontent.com/prowler-cloud/prowler/master/docs"
+        self.api_base_url = (
+            "https://api.mintlifytrieve.com/api/chunk_group/group_oriented_autocomplete"
+        )
+        self.dataset_id = "0096ba11-3f72-463b-9d95-b788495ac392"
+        self.api_key = "tr-T6JLeTkFXeNbNPyhijtI9XhIncydQQ3O"
+        self.docs_base_url = "https://prowler.mintlify.app"
+
+        # HTTP client for Mintlify API
+        self.mintlify_client = httpx.Client(
+            timeout=30.0,
+            headers={
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+                "User-Agent": f"prowler-mcp-server/{__version__}",
+                "TR-Dataset": self.dataset_id,
+                "Authorization": self.api_key,
+                "X-API-Version": "V2",
+            },
+        )
+
+        # HTTP client for Mintlify documentation
+        self.docs_client = httpx.Client(
+            timeout=30.0,
+            headers={
+                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                "User-Agent": f"prowler-mcp-server/{__version__}",
+            },
        )

    def search(self, query: str, page_size: int = 5) -> List[SearchResult]:
        """
-        Search documentation using ReadTheDocs API.
+        Search documentation using Mintlify API.

        Args:
            query: Search query string
@@ -40,53 +66,69 @@ class ProwlerDocsSearchEngine:
            List of search results
        """
        try:
-            # Construct the search query with project filter
-            search_query = f"project:{self.project_name} {query}"
+            # Construct request body
+            payload = {
+                "query": query,
+                "search_type": "fulltext",
+                "extend_results": True,
+                "highlight_options": {
+                    "highlight_window": 10,
+                    "highlight_max_num": 1,
+                    "highlight_max_length": 2,
+                    "highlight_strategy": "exactmatch",
+                    "highlight_delimiters": ["?", ",", ".", "!", "\n"],
+                },
+                "score_threshold": 0.2,
+                "filters": {"must_not": [{"field": "tag_set", "match": ["code"]}]},
+                "page_size": page_size,
+                "group_size": 3,
+            }

-            # Make request to ReadTheDocs API with page_size to limit results
-            params = {"q": search_query, "page_size": page_size}
-            response = requests.get(
+            # Make request to Mintlify API
+            response = self.mintlify_client.post(
                self.api_base_url,
-                params=params,
-                timeout=10,
+                json=payload,
            )
            response.raise_for_status()
-
            data = response.json()

            # Parse results
            results = []
-            for hit in data.get("results", []):
-                # Extract relevant fields from API response
-                blocks = hit.get("blocks", [])
-                # Get the document path from the hit's path field
-                hit_path = hit.get("path", "")
-                doc_path = self._extract_doc_path(hit_path)
+            for result in data.get("results", []):
+                group = result.get("group", {})
+                chunks = result.get("chunks", [])
+
+                # Get document path and title from group
+                doc_path = group.get("name", "")
+                group_title = group.get("name", "").replace("/", " / ").title()
+
+                # If chunks exist, use the first chunk's title from metadata
+                title = group_title
+                if chunks:
+                    first_chunk = chunks[0].get("chunk", {})
+                    metadata = first_chunk.get("metadata", {})
+                    title = metadata.get("title", group_title)

                # Construct full URL to docs
-                domain = hit.get("domain", "https://docs.prowler.com")
-                full_url = f"{domain}{hit_path}" if hit_path else ""
+                full_url = f"{self.docs_base_url}/{doc_path}"

-                # Extract highlights from API response
+                # Extract highlights and scores from chunks
                highlights = []
-
-                # Add title highlights
-                page_highlights = hit.get("highlights", {})
-                if page_highlights.get("title"):
-                    highlights.extend(page_highlights["title"])
-
-                # Add block content highlights (up to 3 snippets)
-                for block in blocks[:3]:
-                    block_highlights = block.get("highlights", {})
-                    if block_highlights.get("content"):
-                        highlights.extend(block_highlights["content"])
+                max_score = 0.0
+                for chunk_data in chunks:
+                    chunk_highlights = chunk_data.get("highlights", [])
+                    highlights.extend(chunk_highlights)
+                    # Track the highest score among all chunks in this group
+                    chunk_score = chunk_data.get("score", 0.0)
+                    max_score = max(max_score, chunk_score)

                results.append(
                    SearchResult(
                        path=doc_path,
-                        title=hit.get("title", ""),
+                        title=title,
                        url=full_url,
                        highlights=highlights,
+                        score=max_score,
                    )
                )

@@ -99,7 +141,7 @@ class ProwlerDocsSearchEngine:

    def get_document(self, doc_path: str) -> Optional[str]:
        """
-        Get full document content from GitHub raw API.
+        Get full document content from Mintlify documentation.

        Args:
            doc_path: Path to the documentation file (e.g., "getting-started/installation")
@@ -111,15 +153,15 @@ class ProwlerDocsSearchEngine:
            # Clean up the path
            doc_path = doc_path.rstrip("/")

-            # Add .md extension if not present
+            # Add .md extension if not present (Mintlify serves both .md and .mdx)
            if not doc_path.endswith(".md"):
                doc_path = f"{doc_path}.md"

-            # Construct GitHub raw URL
-            url = f"{self.github_raw_base}/{doc_path}"
+            # Construct Mintlify URL
+            url = f"{self.docs_base_url}/{doc_path}"

-            # Fetch the raw markdown
-            response = requests.get(url, timeout=10)
+            # Fetch the documentation page
+            response = self.docs_client.get(url)
            response.raise_for_status()

            return response.text
@@ -127,34 +169,3 @@ class ProwlerDocsSearchEngine:
        except Exception as e:
            print(f"Error fetching document: {e}")
            return None
-
-    def _extract_doc_path(self, url: str) -> str:
-        """
-        Extract the document path from a full URL.
-
-        Args:
-            url: Full documentation URL
-
-        Returns:
-            Document path relative to docs base
-        """
-        if not url:
-            return ""
-
-        # Parse URL and extract path
-        try:
-            parsed = urllib.parse.urlparse(url)
-            path = parsed.path
-
-            # Remove the base path prefix if present
-            base_path = "/projects/prowler-open-source/en/latest/"
-            if path.startswith(base_path):
-                path = path[len(base_path) :]
-
-            # Remove .html extension
-            if path.endswith(".html"):
-                path = path[:-5]
-
-            return path.lstrip("/")
-        except Exception:
-            return url
--- a/mcp_server/prowler_mcp_server/prowler_documentation/server.py
+++ b/mcp_server/prowler_mcp_server/prowler_documentation/server.py
@@ -23,18 +23,15 @@ def search(
    to find relevant information about security checks, cloud providers,
    compliance frameworks, and usage instructions.

-    Supports advanced search syntax:
-    - Exact phrases: "custom css"
-    - Prefix search: test*
-    - Fuzzy search: doks~1
-    - Proximity search: "dashboard admin"~2
+    Uses fulltext search to find the most relevant documentation pages
+    based on your query.

    Args:
        query: The search query
        page_size: Number of top results to return (default: 5)

    Returns:
-        List of search results with highlights showing matched terms (in <span> tags)
+        List of search results with highlights showing matched terms (in <mark><b> tags)
    """
    return prowler_docs_search_engine.search(query, page_size)

--- a/mcp_server/pyproject.toml
+++ b/mcp_server/pyproject.toml
@@ -6,7 +6,6 @@ requires = ["setuptools>=61.0", "wheel"]
 dependencies = [
  "fastmcp>=2.11.3",
  "httpx>=0.27.0",
-  "requests>=2.31.0"
 ]
 description = "MCP server for Prowler ecosystem"
 name = "prowler-mcp"
--- a/mcp_server/uv.lock
+++ b/mcp_server/uv.lock
@@ -634,14 +634,12 @@ source = { editable = "." }
 dependencies = [
    { name = "fastmcp" },
    { name = "httpx" },
-    { name = "requests" },
 ]

 [package.metadata]
 requires-dist = [
    { name = "fastmcp", specifier = ">=2.11.3" },
    { name = "httpx", specifier = ">=0.27.0" },
-    { name = "requests", specifier = ">=2.31.0" },
 ]

 [[package]]