Add local llms.txt file reading (#14)

Add ability to read llms.txt from local files.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
Authored by Lance Martin on 2025-03-27 10:22:42 -07:00 and committed via GitHub.
parent 1bc11f5ea1
commit c2977b3602
4 changed files with 226 additions and 46 deletions

View File

@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
#### Choose an `llms.txt` file to use.
* For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
> **Note: Security and Domain Access Control**
>
> For security reasons, mcpdoc implements strict domain access controls:
>
> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
>
> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
>
> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
> - Use `--allowed-domains domain1.com domain2.com` to add specific domains
> - Use `--allowed-domains '*'` to allow all domains (use with caution)
>
> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
#### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
```bash
uvx --from mcpdoc mcpdoc \

View File

@@ -25,6 +25,9 @@ Examples:
# Directly specifying llms.txt URLs with optional names
mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
# Using a local file (absolute or relative path)
mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
# Using a YAML config file
mcpdoc --yaml sample_config.yaml
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
"-u",
type=str,
nargs="+",
help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
)
parser.add_argument(
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
"--allowed-domains",
type=str,
nargs="*",
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
)
parser.add_argument(
"--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
"""Create doc sources from a list of URLs with optional names.
"""Create doc sources from a list of URLs or file paths with optional names.
Args:
urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
urls: List of llms.txt URLs or file paths with optional names
(format: 'url_or_path' or 'name:url_or_path')
Returns:
List of DocSource objects

View File

@@ -1,5 +1,6 @@
"""MCP Llms-txt server for docs."""
import os
from urllib.parse import urlparse
import httpx
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
return f"{parsed.scheme}://{parsed.netloc}/"
def _is_http_or_https(url: str) -> bool:
"""Check if the URL is an HTTP or HTTPS URL."""
return url.startswith(("http:", "https:"))
def _get_fetch_description(has_local_sources: bool) -> str:
"""Get fetch docs tool description."""
description = [
"Fetch and parse documentation from a given URL or local file.",
"",
"Use this tool after list_doc_sources to:",
"1. First fetch the llms.txt file from a documentation source",
"2. Analyze the URLs listed in the llms.txt file",
"3. Then fetch specific documentation pages relevant to the user's question",
"",
]
if has_local_sources:
description.extend(
[
"Args:",
" url: The URL or file path to fetch documentation from. Can be:",
" - URL from an allowed domain",
" - A local file path (absolute or relative)",
" - A file:// URL (e.g., file:///path/to/llms.txt)",
]
)
else:
description.extend(
[
"Args:",
" url: The URL to fetch documentation from.",
]
)
description.extend(
[
"",
"Returns:",
" The fetched documentation content converted to markdown, or an error message", # noqa: E501
" if the request fails or the URL is not from an allowed domain.",
]
)
return "\n".join(description)
def _normalize_path(path: str) -> str:
"""Accept paths in file:/// or relative format and map to absolute paths."""
return (
os.path.abspath(path[7:])
if path.startswith("file://")
else os.path.abspath(path)
)
def create_server(
doc_source: list[DocSource],
doc_sources: list[DocSource],
*,
follow_redirects: bool = False,
timeout: float = 10,
@@ -45,7 +102,7 @@ def create_server(
"""Create the server and generate documentation retrieval tools.
Args:
doc_source: List of documentation sources to make available
doc_sources: List of documentation sources to make available
follow_redirects: Whether to follow HTTP redirects when fetching docs
timeout: HTTP request timeout in seconds
settings: Additional settings to pass to FastMCP
@@ -68,61 +125,95 @@ def create_server(
)
httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
@server.tool()
def list_doc_sources() -> str:
"""List all available documentation sources.
local_sources = []
remote_sources = []
This is the first tool you should call in the documentation workflow.
It provides URLs to llms.txt files that the user has made available.
for entry in doc_sources:
url = entry["llms_txt"]
if _is_http_or_https(url):
remote_sources.append(entry)
else:
local_sources.append(entry)
Returns:
A string containing a formatted list of documentation sources with their URLs
"""
content = ""
for entry in doc_source:
name = entry.get("name", "") or extract_domain(entry["llms_txt"])
content += f"{name}\n"
content += "URL: " + entry["llms_txt"] + "\n\n"
return content
# Let's verify that all local sources exist
for entry in local_sources:
path = entry["llms_txt"]
abs_path = _normalize_path(path)
if not os.path.exists(abs_path):
raise FileNotFoundError(f"Local file not found: {abs_path}")
# Parse the domain names in the llms.txt URLs
domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
# Parse the domain names in the llms.txt URLs and identify local file paths
domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
# Add additional allowed domains if specified
# Add additional allowed domains if specified, or set to '*' if we have local files
if allowed_domains:
if "*" in allowed_domains:
domains = {"*"} # Special marker for allowing all domains
else:
domains.update(allowed_domains)
allowed_local_files = set(
_normalize_path(entry["llms_txt"]) for entry in local_sources
)
@server.tool()
async def fetch_docs(url: str) -> str:
"""Fetch and parse documentation from a given URL.
def list_doc_sources() -> str:
"""List all available documentation sources.
Use this tool after list_doc_sources to:
1. First fetch the llms.txt file from a documentation source
2. Analyze the URLs listed in the llms.txt file
3. Then fetch specific documentation pages relevant to the user's question
Args:
url: The URL to fetch documentation from. Must be from an allowed domain.
This is the first tool you should call in the documentation workflow.
It provides URLs to llms.txt files or local file paths that the user has made available.
Returns:
The fetched documentation content converted to markdown, or an error message
if the request fails or the URL is not from an allowed domain.
A string containing a formatted list of documentation sources with their URLs or file paths
"""
nonlocal domains
if "*" not in domains and not any(url.startswith(domain) for domain in domains):
return (
"Error: URL not allowed. Must start with one of the following domains: "
+ ", ".join(domains)
)
content = ""
for entry_ in doc_sources:
url_or_path = entry_["llms_txt"]
try:
response = await httpx_client.get(url, timeout=timeout)
response.raise_for_status()
return markdownify(response.text)
except (httpx.HTTPStatusError, httpx.RequestError) as e:
return f"Encountered an HTTP error with code {e.response.status_code}"
if _is_http_or_https(url_or_path):
name = entry_.get("name", extract_domain(url_or_path))
content += f"{name}\nURL: {url_or_path}\n\n"
else:
path = _normalize_path(url_or_path)
name = entry_.get("name", path)
content += f"{name}\nPath: {path}\n\n"
return content
fetch_docs_description = _get_fetch_description(
has_local_sources=bool(local_sources)
)
@server.tool(description=fetch_docs_description)
async def fetch_docs(url: str) -> str:
nonlocal domains
# Handle local file paths (either as file:// URLs or direct filesystem paths)
if not _is_http_or_https(url):
abs_path = _normalize_path(url)
if abs_path not in allowed_local_files:
raise ValueError(
f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
)
try:
with open(abs_path, "r", encoding="utf-8") as f:
content = f.read()
return markdownify(content)
except Exception as e:
return f"Error reading local file: {str(e)}"
else:
# Otherwise treat as URL
if "*" not in domains and not any(
url.startswith(domain) for domain in domains
):
return (
"Error: URL not allowed. Must start with one of the following domains: "
+ ", ".join(domains)
)
try:
response = await httpx_client.get(url, timeout=timeout)
response.raise_for_status()
return markdownify(response.text)
except (httpx.HTTPStatusError, httpx.RequestError) as e:
return f"Encountered an HTTP error: {str(e)}"
return server

View File

@@ -0,0 +1,71 @@
"""Tests for mcpdoc.main module."""
import pytest
from mcpdoc.main import (
_get_fetch_description,
_is_http_or_https,
extract_domain,
)
def test_extract_domain() -> None:
    """extract_domain should return scheme://netloc/ with a trailing slash."""
    # Table of input URL -> expected normalized domain, covering https/http,
    # explicit ports, missing trailing slash, and subdomains.
    expectations = {
        "https://example.com/page": "https://example.com/",
        "http://test.org/docs/index.html": "http://test.org/",
        "https://localhost:8080/api": "https://localhost:8080/",
        "https://localhost:8080": "https://localhost:8080/",
        "https://docs.python.org/3/": "https://docs.python.org/",
    }
    for url, expected_domain in expectations.items():
        assert extract_domain(url) == expected_domain
@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        # ftp is rejected: not HTTP/HTTPS, even though it is not a local file.
        ("ftp://example.com", False),
    ],
)
def test_is_http_or_https(url: str, expected: bool) -> None:
    """_is_http_or_https should flag only http/https URLs."""
    result = _is_http_or_https(url)
    assert result is expected
@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources, expected_substrings) -> None:
    """Test _get_fetch_description output for local and remote-only modes.

    Fix: the original version only checked absence when the parametrized
    substring was "local file path" or "file://", which never happens for
    the has_local_sources=False case — so its expected substrings were
    never asserted present and the absence checks were dead code.
    """
    description = _get_fetch_description(has_local_sources)

    # Common assertions for both cases.
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Every parametrized substring must appear in the description.
    for substring in expected_substrings:
        assert substring in description

    # Local-only hints must not leak into the remote-only description.
    if not has_local_sources:
        for local_marker in ("local file path", "file://"):
            assert local_marker not in description