mirror of
https://github.com/langchain-ai/mcpdoc.git
synced 2025-10-19 03:18:14 +03:00
Add local llms.txt file reading (#14)
Add ability to read llms.txt from local files. --------- Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
14
README.md
14
README.md
@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
#### Choose an `llms.txt` file to use.
|
||||
* For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
|
||||
|
||||
> **Note: Security and Domain Access Control**
|
||||
>
|
||||
> For security reasons, mcpdoc implements strict domain access controls:
|
||||
>
|
||||
> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
|
||||
>
|
||||
> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
|
||||
>
|
||||
> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
|
||||
> - Use `--allowed-domains domain1.com domain2.com` to add specific domains
|
||||
> - Use `--allowed-domains '*'` to allow all domains (use with caution)
|
||||
>
|
||||
> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
|
||||
|
||||
#### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
|
||||
```bash
|
||||
uvx --from mcpdoc mcpdoc \
|
||||
|
||||
@@ -25,6 +25,9 @@ Examples:
|
||||
# Directly specifying llms.txt URLs with optional names
|
||||
mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
|
||||
|
||||
# Using a local file (absolute or relative path)
|
||||
mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
|
||||
|
||||
# Using a YAML config file
|
||||
mcpdoc --yaml sample_config.yaml
|
||||
|
||||
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
|
||||
"-u",
|
||||
type=str,
|
||||
nargs="+",
|
||||
help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
|
||||
help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
|
||||
"--allowed-domains",
|
||||
type=str,
|
||||
nargs="*",
|
||||
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
|
||||
help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
|
||||
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
|
||||
|
||||
|
||||
def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
|
||||
"""Create doc sources from a list of URLs with optional names.
|
||||
"""Create doc sources from a list of URLs or file paths with optional names.
|
||||
|
||||
Args:
|
||||
urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
|
||||
urls: List of llms.txt URLs or file paths with optional names
|
||||
(format: 'url_or_path' or 'name:url_or_path')
|
||||
|
||||
Returns:
|
||||
List of DocSource objects
|
||||
|
||||
175
mcpdoc/main.py
175
mcpdoc/main.py
@@ -1,5 +1,6 @@
|
||||
"""MCP Llms-txt server for docs."""
|
||||
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
|
||||
return f"{parsed.scheme}://{parsed.netloc}/"
|
||||
|
||||
|
||||
def _is_http_or_https(url: str) -> bool:
|
||||
"""Check if the URL is an HTTP or HTTPS URL."""
|
||||
return url.startswith(("http:", "https:"))
|
||||
|
||||
|
||||
def _get_fetch_description(has_local_sources: bool) -> str:
|
||||
"""Get fetch docs tool description."""
|
||||
description = [
|
||||
"Fetch and parse documentation from a given URL or local file.",
|
||||
"",
|
||||
"Use this tool after list_doc_sources to:",
|
||||
"1. First fetch the llms.txt file from a documentation source",
|
||||
"2. Analyze the URLs listed in the llms.txt file",
|
||||
"3. Then fetch specific documentation pages relevant to the user's question",
|
||||
"",
|
||||
]
|
||||
|
||||
if has_local_sources:
|
||||
description.extend(
|
||||
[
|
||||
"Args:",
|
||||
" url: The URL or file path to fetch documentation from. Can be:",
|
||||
" - URL from an allowed domain",
|
||||
" - A local file path (absolute or relative)",
|
||||
" - A file:// URL (e.g., file:///path/to/llms.txt)",
|
||||
]
|
||||
)
|
||||
else:
|
||||
description.extend(
|
||||
[
|
||||
"Args:",
|
||||
" url: The URL to fetch documentation from.",
|
||||
]
|
||||
)
|
||||
|
||||
description.extend(
|
||||
[
|
||||
"",
|
||||
"Returns:",
|
||||
" The fetched documentation content converted to markdown, or an error message", # noqa: E501
|
||||
" if the request fails or the URL is not from an allowed domain.",
|
||||
]
|
||||
)
|
||||
|
||||
return "\n".join(description)
|
||||
|
||||
|
||||
def _normalize_path(path: str) -> str:
|
||||
"""Accept paths in file:/// or relative format and map to absolute paths."""
|
||||
return (
|
||||
os.path.abspath(path[7:])
|
||||
if path.startswith("file://")
|
||||
else os.path.abspath(path)
|
||||
)
|
||||
|
||||
|
||||
def create_server(
|
||||
doc_source: list[DocSource],
|
||||
doc_sources: list[DocSource],
|
||||
*,
|
||||
follow_redirects: bool = False,
|
||||
timeout: float = 10,
|
||||
@@ -45,7 +102,7 @@ def create_server(
|
||||
"""Create the server and generate documentation retrieval tools.
|
||||
|
||||
Args:
|
||||
doc_source: List of documentation sources to make available
|
||||
doc_sources: List of documentation sources to make available
|
||||
follow_redirects: Whether to follow HTTP redirects when fetching docs
|
||||
timeout: HTTP request timeout in seconds
|
||||
settings: Additional settings to pass to FastMCP
|
||||
@@ -68,61 +125,95 @@ def create_server(
|
||||
)
|
||||
httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
|
||||
|
||||
@server.tool()
|
||||
def list_doc_sources() -> str:
|
||||
"""List all available documentation sources.
|
||||
local_sources = []
|
||||
remote_sources = []
|
||||
|
||||
This is the first tool you should call in the documentation workflow.
|
||||
It provides URLs to llms.txt files that the user has made available.
|
||||
for entry in doc_sources:
|
||||
url = entry["llms_txt"]
|
||||
if _is_http_or_https(url):
|
||||
remote_sources.append(entry)
|
||||
else:
|
||||
local_sources.append(entry)
|
||||
|
||||
Returns:
|
||||
A string containing a formatted list of documentation sources with their URLs
|
||||
"""
|
||||
content = ""
|
||||
for entry in doc_source:
|
||||
name = entry.get("name", "") or extract_domain(entry["llms_txt"])
|
||||
content += f"{name}\n"
|
||||
content += "URL: " + entry["llms_txt"] + "\n\n"
|
||||
return content
|
||||
# Let's verify that all local sources exist
|
||||
for entry in local_sources:
|
||||
path = entry["llms_txt"]
|
||||
abs_path = _normalize_path(path)
|
||||
if not os.path.exists(abs_path):
|
||||
raise FileNotFoundError(f"Local file not found: {abs_path}")
|
||||
|
||||
# Parse the domain names in the llms.txt URLs
|
||||
domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
|
||||
# Parse the domain names in the llms.txt URLs and identify local file paths
|
||||
domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
|
||||
|
||||
# Add additional allowed domains if specified
|
||||
# Add additional allowed domains if specified, or set to '*' if we have local files
|
||||
if allowed_domains:
|
||||
if "*" in allowed_domains:
|
||||
domains = {"*"} # Special marker for allowing all domains
|
||||
else:
|
||||
domains.update(allowed_domains)
|
||||
|
||||
allowed_local_files = set(
|
||||
_normalize_path(entry["llms_txt"]) for entry in local_sources
|
||||
)
|
||||
|
||||
@server.tool()
|
||||
async def fetch_docs(url: str) -> str:
|
||||
"""Fetch and parse documentation from a given URL.
|
||||
def list_doc_sources() -> str:
|
||||
"""List all available documentation sources.
|
||||
|
||||
Use this tool after list_doc_sources to:
|
||||
1. First fetch the llms.txt file from a documentation source
|
||||
2. Analyze the URLs listed in the llms.txt file
|
||||
3. Then fetch specific documentation pages relevant to the user's question
|
||||
|
||||
Args:
|
||||
url: The URL to fetch documentation from. Must be from an allowed domain.
|
||||
This is the first tool you should call in the documentation workflow.
|
||||
It provides URLs to llms.txt files or local file paths that the user has made available.
|
||||
|
||||
Returns:
|
||||
The fetched documentation content converted to markdown, or an error message
|
||||
if the request fails or the URL is not from an allowed domain.
|
||||
A string containing a formatted list of documentation sources with their URLs or file paths
|
||||
"""
|
||||
nonlocal domains
|
||||
if "*" not in domains and not any(url.startswith(domain) for domain in domains):
|
||||
return (
|
||||
"Error: URL not allowed. Must start with one of the following domains: "
|
||||
+ ", ".join(domains)
|
||||
)
|
||||
content = ""
|
||||
for entry_ in doc_sources:
|
||||
url_or_path = entry_["llms_txt"]
|
||||
|
||||
try:
|
||||
response = await httpx_client.get(url, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return markdownify(response.text)
|
||||
except (httpx.HTTPStatusError, httpx.RequestError) as e:
|
||||
return f"Encountered an HTTP error with code {e.response.status_code}"
|
||||
if _is_http_or_https(url_or_path):
|
||||
name = entry_.get("name", extract_domain(url_or_path))
|
||||
content += f"{name}\nURL: {url_or_path}\n\n"
|
||||
else:
|
||||
path = _normalize_path(url_or_path)
|
||||
name = entry_.get("name", path)
|
||||
content += f"{name}\nPath: {path}\n\n"
|
||||
return content
|
||||
|
||||
fetch_docs_description = _get_fetch_description(
|
||||
has_local_sources=bool(local_sources)
|
||||
)
|
||||
|
||||
@server.tool(description=fetch_docs_description)
|
||||
async def fetch_docs(url: str) -> str:
|
||||
nonlocal domains
|
||||
# Handle local file paths (either as file:// URLs or direct filesystem paths)
|
||||
if not _is_http_or_https(url):
|
||||
abs_path = _normalize_path(url)
|
||||
if abs_path not in allowed_local_files:
|
||||
raise ValueError(
|
||||
f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
|
||||
)
|
||||
try:
|
||||
with open(abs_path, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
return markdownify(content)
|
||||
except Exception as e:
|
||||
return f"Error reading local file: {str(e)}"
|
||||
else:
|
||||
# Otherwise treat as URL
|
||||
if "*" not in domains and not any(
|
||||
url.startswith(domain) for domain in domains
|
||||
):
|
||||
return (
|
||||
"Error: URL not allowed. Must start with one of the following domains: "
|
||||
+ ", ".join(domains)
|
||||
)
|
||||
|
||||
try:
|
||||
response = await httpx_client.get(url, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return markdownify(response.text)
|
||||
except (httpx.HTTPStatusError, httpx.RequestError) as e:
|
||||
return f"Encountered an HTTP error: {str(e)}"
|
||||
|
||||
return server
|
||||
|
||||
71
tests/unit_tests/test_main.py
Normal file
71
tests/unit_tests/test_main.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""Tests for mcpdoc.main module."""
|
||||
|
||||
import pytest
|
||||
|
||||
from mcpdoc.main import (
|
||||
_get_fetch_description,
|
||||
_is_http_or_https,
|
||||
extract_domain,
|
||||
)
|
||||
|
||||
|
||||
def test_extract_domain() -> None:
    """extract_domain should keep scheme + netloc and add a trailing slash."""
    # (input URL, expected domain) pairs covering https, http, explicit
    # ports, a missing trailing slash, and subdomains.
    cases = [
        ("https://example.com/page", "https://example.com/"),
        ("http://test.org/docs/index.html", "http://test.org/"),
        ("https://localhost:8080/api", "https://localhost:8080/"),
        ("https://localhost:8080", "https://localhost:8080/"),
        ("https://docs.python.org/3/", "https://docs.python.org/"),
    ]
    for url, expected in cases:
        assert extract_domain(url) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        # Not HTTP or HTTPS, even though it's not a local file
        ("ftp://example.com", False),
    ],
)
def test_is_http_or_https(url: str, expected: bool) -> None:
    """_is_http_or_https should classify URLs by scheme prefix alone."""
    assert _is_http_or_https(url) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources: bool, expected_substrings: list) -> None:
    """_get_fetch_description should tailor the Args section to source kinds."""
    description = _get_fetch_description(has_local_sources)

    # Assertions shared by both variants.
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Markers that must appear only when local sources are configured.
    local_only_markers = ("local file path", "file://")
    for substring in expected_substrings:
        if has_local_sources:
            assert substring in description
        elif substring in local_only_markers:
            # Without local sources, the local-path hints must be absent.
            assert substring not in description
|
||||
Reference in New Issue
Block a user