mirror of https://github.com/langchain-ai/mcpdoc.git, synced 2025-10-19 03:18:14 +03:00

Add local llms.txt file reading (#14)

Add ability to read llms.txt from local files.

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>

README.md (14 lines changed)
````diff
@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
 #### Choose an `llms.txt` file to use.
 
 * For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
 
+> **Note: Security and Domain Access Control**
+>
+> For security reasons, mcpdoc implements strict domain access controls:
+>
+> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
+>
+> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
+>
+> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
+>    - Use `--allowed-domains domain1.com domain2.com` to add specific domains
+>    - Use `--allowed-domains '*'` to allow all domains (use with caution)
+>
+> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
+
 #### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
 ```bash
 uvx --from mcpdoc mcpdoc \
````
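The domain that gets auto-allowed in rule 1 is derived from the llms.txt URL itself by the `extract_domain` helper whose body appears later in this commit. A quick standalone illustration, using only the helper's body as shown in the diff:

```python
from urllib.parse import urlparse


def extract_domain(url: str) -> str:
    """Return the 'scheme://netloc/' prefix that fetched URLs must start with."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"


# A remote llms.txt source auto-allows only its own domain:
print(extract_domain("https://langchain-ai.github.io/langgraph/llms.txt"))
# https://langchain-ai.github.io/
```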
Command-line interface changes:

```diff
@@ -25,6 +25,9 @@ Examples:
   # Directly specifying llms.txt URLs with optional names
   mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
 
+  # Using a local file (absolute or relative path)
+  mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
+
   # Using a YAML config file
   mcpdoc --yaml sample_config.yaml
 
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
         "-u",
         type=str,
         nargs="+",
-        help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
+        help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
     )
 
     parser.add_argument(
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
         "--allowed-domains",
         type=str,
         nargs="*",
-        help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
+        help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
     )
     parser.add_argument(
         "--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
 
 
 def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
-    """Create doc sources from a list of URLs with optional names.
+    """Create doc sources from a list of URLs or file paths with optional names.
 
     Args:
-        urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
+        urls: List of llms.txt URLs or file paths with optional names
+            (format: 'url_or_path' or 'name:url_or_path')
 
     Returns:
         List of DocSource objects
```
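The body of `create_doc_sources_from_urls` lies outside this hunk, so its exact parsing is not shown. A minimal sketch of how the `'name:url_or_path'` splitting could work, assuming `DocSource` is a dict-like record with an optional `name` and a required `llms_txt` key (consistent with how entries are accessed in `mcpdoc/main.py` below); this is an illustration, not the committed implementation:

```python
from typing import TypedDict


class DocSource(TypedDict, total=False):
    name: str      # optional display name
    llms_txt: str  # URL or local file path of the llms.txt


def create_doc_sources_from_urls(urls: list[str]) -> list[DocSource]:
    """Sketch: split on the first ':' only when it is a name prefix, not a scheme."""
    sources: list[DocSource] = []
    for item in urls:
        if ":" in item and not item.startswith(("http:", "https:", "file:", "/")):
            name, _, url_or_path = item.partition(":")
            sources.append({"name": name, "llms_txt": url_or_path})
        else:
            sources.append({"llms_txt": item})
    return sources


# "LocalDocs:/path/to/llms.txt" -> named local path; a bare URL stays unnamed.
assert create_doc_sources_from_urls(["LocalDocs:/path/to/llms.txt"]) == [
    {"name": "LocalDocs", "llms_txt": "/path/to/llms.txt"}
]
```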
mcpdoc/main.py (175 lines changed)

```diff
@@ -1,5 +1,6 @@
 """MCP Llms-txt server for docs."""
 
+import os
 from urllib.parse import urlparse
 
 import httpx
```
```diff
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
     return f"{parsed.scheme}://{parsed.netloc}/"
 
 
+def _is_http_or_https(url: str) -> bool:
+    """Check if the URL is an HTTP or HTTPS URL."""
+    return url.startswith(("http:", "https:"))
+
+
+def _get_fetch_description(has_local_sources: bool) -> str:
+    """Get fetch docs tool description."""
+    description = [
+        "Fetch and parse documentation from a given URL or local file.",
+        "",
+        "Use this tool after list_doc_sources to:",
+        "1. First fetch the llms.txt file from a documentation source",
+        "2. Analyze the URLs listed in the llms.txt file",
+        "3. Then fetch specific documentation pages relevant to the user's question",
+        "",
+    ]
+
+    if has_local_sources:
+        description.extend(
+            [
+                "Args:",
+                "    url: The URL or file path to fetch documentation from. Can be:",
+                "    - URL from an allowed domain",
+                "    - A local file path (absolute or relative)",
+                "    - A file:// URL (e.g., file:///path/to/llms.txt)",
+            ]
+        )
+    else:
+        description.extend(
+            [
+                "Args:",
+                "    url: The URL to fetch documentation from.",
+            ]
+        )
+
+    description.extend(
+        [
+            "",
+            "Returns:",
+            "    The fetched documentation content converted to markdown, or an error message",  # noqa: E501
+            "    if the request fails or the URL is not from an allowed domain.",
+        ]
+    )
+
+    return "\n".join(description)
+
+
+def _normalize_path(path: str) -> str:
+    """Accept paths in file:/// or relative format and map to absolute paths."""
+    return (
+        os.path.abspath(path[7:])
+        if path.startswith("file://")
+        else os.path.abspath(path)
+    )
+
+
 def create_server(
-    doc_source: list[DocSource],
+    doc_sources: list[DocSource],
     *,
     follow_redirects: bool = False,
     timeout: float = 10,
```
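These new helpers are pure functions, so their behavior is easy to check in isolation; for instance (example values are illustrative):

```python
from mcpdoc.main import _get_fetch_description, _is_http_or_https, _normalize_path

# _is_http_or_https gates which sources are treated as remote:
assert _is_http_or_https("https://example.com/llms.txt") is True
assert _is_http_or_https("file:///docs/llms.txt") is False

# _normalize_path strips a file:// prefix and resolves to an absolute path
# (POSIX-style paths assumed here):
assert _normalize_path("file:///docs/llms.txt") == "/docs/llms.txt"

# The fetch_docs tool description adapts to whether any local sources exist:
print(_get_fetch_description(has_local_sources=True))   # mentions file paths / file:// URLs
print(_get_fetch_description(has_local_sources=False))  # URL-only variant
```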
```diff
@@ -45,7 +102,7 @@ def create_server(
     """Create the server and generate documentation retrieval tools.
 
     Args:
-        doc_source: List of documentation sources to make available
+        doc_sources: List of documentation sources to make available
         follow_redirects: Whether to follow HTTP redirects when fetching docs
         timeout: HTTP request timeout in seconds
         settings: Additional settings to pass to FastMCP
```
```diff
@@ -68,61 +125,95 @@ def create_server(
     )
     httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
 
-    @server.tool()
-    def list_doc_sources() -> str:
-        """List all available documentation sources.
-
-        This is the first tool you should call in the documentation workflow.
-        It provides URLs to llms.txt files that the user has made available.
-
-        Returns:
-            A string containing a formatted list of documentation sources with their URLs
-        """
-        content = ""
-        for entry in doc_source:
-            name = entry.get("name", "") or extract_domain(entry["llms_txt"])
-            content += f"{name}\n"
-            content += "URL: " + entry["llms_txt"] + "\n\n"
-        return content
+    local_sources = []
+    remote_sources = []
+
+    for entry in doc_sources:
+        url = entry["llms_txt"]
+        if _is_http_or_https(url):
+            remote_sources.append(entry)
+        else:
+            local_sources.append(entry)
+
+    # Let's verify that all local sources exist
+    for entry in local_sources:
+        path = entry["llms_txt"]
+        abs_path = _normalize_path(path)
+        if not os.path.exists(abs_path):
+            raise FileNotFoundError(f"Local file not found: {abs_path}")
 
-    # Parse the domain names in the llms.txt URLs
-    domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
+    # Parse the domain names in the llms.txt URLs and identify local file paths
+    domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
 
-    # Add additional allowed domains if specified
+    # Add additional allowed domains if specified, or set to '*' if we have local files
     if allowed_domains:
         if "*" in allowed_domains:
             domains = {"*"}  # Special marker for allowing all domains
         else:
             domains.update(allowed_domains)
 
+    allowed_local_files = set(
+        _normalize_path(entry["llms_txt"]) for entry in local_sources
+    )
+
     @server.tool()
-    async def fetch_docs(url: str) -> str:
-        """Fetch and parse documentation from a given URL.
+    def list_doc_sources() -> str:
+        """List all available documentation sources.
 
-        Use this tool after list_doc_sources to:
-        1. First fetch the llms.txt file from a documentation source
-        2. Analyze the URLs listed in the llms.txt file
-        3. Then fetch specific documentation pages relevant to the user's question
-
-        Args:
-            url: The URL to fetch documentation from. Must be from an allowed domain.
+        This is the first tool you should call in the documentation workflow.
+        It provides URLs to llms.txt files or local file paths that the user has made available.
 
         Returns:
-            The fetched documentation content converted to markdown, or an error message
-            if the request fails or the URL is not from an allowed domain.
+            A string containing a formatted list of documentation sources with their URLs or file paths
         """
-        nonlocal domains
-        if "*" not in domains and not any(url.startswith(domain) for domain in domains):
-            return (
-                "Error: URL not allowed. Must start with one of the following domains: "
-                + ", ".join(domains)
-            )
+        content = ""
+        for entry_ in doc_sources:
+            url_or_path = entry_["llms_txt"]
 
-        try:
-            response = await httpx_client.get(url, timeout=timeout)
-            response.raise_for_status()
-            return markdownify(response.text)
-        except (httpx.HTTPStatusError, httpx.RequestError) as e:
-            return f"Encountered an HTTP error with code {e.response.status_code}"
+            if _is_http_or_https(url_or_path):
+                name = entry_.get("name", extract_domain(url_or_path))
+                content += f"{name}\nURL: {url_or_path}\n\n"
+            else:
+                path = _normalize_path(url_or_path)
+                name = entry_.get("name", path)
+                content += f"{name}\nPath: {path}\n\n"
+        return content
+
+    fetch_docs_description = _get_fetch_description(
+        has_local_sources=bool(local_sources)
+    )
+
+    @server.tool(description=fetch_docs_description)
+    async def fetch_docs(url: str) -> str:
+        nonlocal domains
+        # Handle local file paths (either as file:// URLs or direct filesystem paths)
+        if not _is_http_or_https(url):
+            abs_path = _normalize_path(url)
+            if abs_path not in allowed_local_files:
+                raise ValueError(
+                    f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
+                )
+            try:
+                with open(abs_path, "r", encoding="utf-8") as f:
+                    content = f.read()
+                return markdownify(content)
+            except Exception as e:
+                return f"Error reading local file: {str(e)}"
+        else:
+            # Otherwise treat as URL
+            if "*" not in domains and not any(
+                url.startswith(domain) for domain in domains
+            ):
+                return (
+                    "Error: URL not allowed. Must start with one of the following domains: "
+                    + ", ".join(domains)
+                )
 
+            try:
+                response = await httpx_client.get(url, timeout=timeout)
+                response.raise_for_status()
+                return markdownify(response.text)
+            except (httpx.HTTPStatusError, httpx.RequestError) as e:
+                return f"Encountered an HTTP error: {str(e)}"
 
     return server
```
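Taken together, a minimal sketch of standing up a server over one remote and one local source; the names and paths here are illustrative, the local file must actually exist (otherwise `create_server` now raises `FileNotFoundError`), and the final line assumes the returned FastMCP instance is run the usual way over stdio:

```python
from mcpdoc.main import create_server

server = create_server(
    [
        {
            "name": "LangGraph",
            "llms_txt": "https://langchain-ai.github.io/langgraph/llms.txt",
        },
        # Local sources are served only if their normalized path was registered:
        {"name": "LocalDocs", "llms_txt": "/path/to/llms.txt"},
    ],
    follow_redirects=True,
    timeout=10.0,
)
# MCP clients typically talk to the server over stdio:
server.run(transport="stdio")
```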
tests/unit_tests/test_main.py (new file, 71 lines)

```python
"""Tests for mcpdoc.main module."""

import pytest

from mcpdoc.main import (
    _get_fetch_description,
    _is_http_or_https,
    extract_domain,
)


def test_extract_domain() -> None:
    """Test extract_domain function."""
    # Test with https URL
    assert extract_domain("https://example.com/page") == "https://example.com/"

    # Test with http URL
    assert extract_domain("http://test.org/docs/index.html") == "http://test.org/"

    # Test with URL that has port
    assert extract_domain("https://localhost:8080/api") == "https://localhost:8080/"

    # Check trailing slash
    assert extract_domain("https://localhost:8080") == "https://localhost:8080/"

    # Test with URL that has subdomain
    assert extract_domain("https://docs.python.org/3/") == "https://docs.python.org/"


@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        (
            "ftp://example.com",
            False,
        ),  # Not HTTP or HTTPS, even though it's not a local file
    ],
)
def test_is_http_or_https(url, expected):
    """Test _is_http_or_https function."""
    assert _is_http_or_https(url) is expected


@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources, expected_substrings):
    """Test _get_fetch_description function."""
    description = _get_fetch_description(has_local_sources)

    # Common assertions for both cases
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Specific assertions based on has_local_sources
    for substring in expected_substrings:
        if has_local_sources:
            assert substring in description
        else:
            # For the False case, we only check that "local file path"
            # and "file://" are NOT present
            if substring in ["local file path", "file://"]:
                assert substring not in description
```
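`_normalize_path` has no dedicated test in this commit; a hedged sketch of what one could look like (not part of the diff, POSIX paths assumed):

```python
import os

from mcpdoc.main import _normalize_path


def test_normalize_path() -> None:
    """Sketch: file:// URLs and relative paths both map to absolute paths."""
    assert _normalize_path("file:///docs/llms.txt") == "/docs/llms.txt"
    assert _normalize_path("docs/llms.txt") == os.path.abspath("docs/llms.txt")
    assert os.path.isabs(_normalize_path("./llms.txt"))
```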