mirror of https://github.com/langchain-ai/mcpdoc.git, synced 2025-10-19 03:18:14 +03:00

Add local llms.txt file reading (#14)

Add ability to read llms.txt from local files.

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>

README.md (14 lines changed)
````diff
@@ -20,6 +20,20 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
 #### Choose an `llms.txt` file to use.
 
 * For example, [here's](https://langchain-ai.github.io/langgraph/llms.txt) the LangGraph `llms.txt` file.
 
+> **Note: Security and Domain Access Control**
+>
+> For security reasons, mcpdoc implements strict domain access controls:
+>
+> 1. **Remote llms.txt files**: When you specify a remote llms.txt URL (e.g., `https://langchain-ai.github.io/langgraph/llms.txt`), mcpdoc automatically adds only that specific domain (`langchain-ai.github.io`) to the allowed domains list. This means the tool can only fetch documentation from URLs on that domain.
+>
+> 2. **Local llms.txt files**: When using a local file, NO domains are automatically added to the allowed list. You MUST explicitly specify which domains to allow using the `--allowed-domains` parameter.
+>
+> 3. **Adding additional domains**: To allow fetching from domains beyond those automatically included:
+>    - Use `--allowed-domains domain1.com domain2.com` to add specific domains
+>    - Use `--allowed-domains '*'` to allow all domains (use with caution)
+>
+> This security measure prevents unauthorized access to domains not explicitly approved by the user, ensuring that documentation can only be retrieved from trusted sources.
+
 #### (Optional) Test the MCP server locally with your `llms.txt` file of choice:
 ```bash
 uvx --from mcpdoc mcpdoc \
````
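The domain that gets auto-allowed in rule 1 is derived from the llms.txt URL itself by the `extract_domain` helper whose body appears later in this commit. A quick standalone illustration, using only the helper's body as shown in the diff:

```python
from urllib.parse import urlparse


def extract_domain(url: str) -> str:
    """Return the 'scheme://netloc/' prefix that fetched URLs must start with."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"


# A remote llms.txt source auto-allows only its own domain:
print(extract_domain("https://langchain-ai.github.io/langgraph/llms.txt"))
# https://langchain-ai.github.io/
```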
Command-line interface changes:

```diff
@@ -25,6 +25,9 @@ Examples:
   # Directly specifying llms.txt URLs with optional names
   mcpdoc --urls LangGraph:https://langchain-ai.github.io/langgraph/llms.txt
 
+  # Using a local file (absolute or relative path)
+  mcpdoc --urls LocalDocs:/path/to/llms.txt --allowed-domains '*'
+
   # Using a YAML config file
   mcpdoc --yaml sample_config.yaml
 
@@ -72,7 +75,7 @@ def parse_args() -> argparse.Namespace:
         "-u",
         type=str,
         nargs="+",
-        help="List of llms.txt URLs with optional names (format: 'url' or 'name:url')",
+        help="List of llms.txt URLs or file paths with optional names (format: 'url_or_path' or 'name:url_or_path')",
     )
 
     parser.add_argument(
@@ -84,7 +87,7 @@ def parse_args() -> argparse.Namespace:
         "--allowed-domains",
         type=str,
         nargs="*",
-        help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains",
+        help="Additional allowed domains to fetch documentation from. Use '*' to allow all domains.",
     )
     parser.add_argument(
         "--timeout", type=float, default=10.0, help="HTTP request timeout in seconds"
@@ -163,10 +166,11 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]:
 
 
 def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]:
-    """Create doc sources from a list of URLs with optional names.
+    """Create doc sources from a list of URLs or file paths with optional names.
 
     Args:
-        urls: List of llms.txt URLs with optional names (format: 'url' or 'name:url')
+        urls: List of llms.txt URLs or file paths with optional names
+            (format: 'url_or_path' or 'name:url_or_path')
 
     Returns:
         List of DocSource objects
```
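The body of `create_doc_sources_from_urls` lies outside this hunk, so its exact parsing is not shown. A minimal sketch of how the `'name:url_or_path'` splitting could work, assuming `DocSource` is a dict-like record with an optional `name` and a required `llms_txt` key (consistent with how entries are accessed in `mcpdoc/main.py` below); this is an illustration, not the committed implementation:

```python
from typing import TypedDict


class DocSource(TypedDict, total=False):
    name: str      # optional display name
    llms_txt: str  # URL or local file path of the llms.txt


def create_doc_sources_from_urls(urls: list[str]) -> list[DocSource]:
    """Sketch: split on the first ':' only when it is a name prefix, not a scheme."""
    sources: list[DocSource] = []
    for item in urls:
        if ":" in item and not item.startswith(("http:", "https:", "file:", "/")):
            name, _, url_or_path = item.partition(":")
            sources.append({"name": name, "llms_txt": url_or_path})
        else:
            sources.append({"llms_txt": item})
    return sources


# "LocalDocs:/path/to/llms.txt" -> named local path; a bare URL stays unnamed.
assert create_doc_sources_from_urls(["LocalDocs:/path/to/llms.txt"]) == [
    {"name": "LocalDocs", "llms_txt": "/path/to/llms.txt"}
]
```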
mcpdoc/main.py (175 lines changed)

```diff
@@ -1,5 +1,6 @@
 """MCP Llms-txt server for docs."""
 
+import os
 from urllib.parse import urlparse
 
 import httpx
```
```diff
@@ -34,8 +35,64 @@ def extract_domain(url: str) -> str:
     return f"{parsed.scheme}://{parsed.netloc}/"
 
 
+def _is_http_or_https(url: str) -> bool:
+    """Check if the URL is an HTTP or HTTPS URL."""
+    return url.startswith(("http:", "https:"))
+
+
+def _get_fetch_description(has_local_sources: bool) -> str:
+    """Get fetch docs tool description."""
+    description = [
+        "Fetch and parse documentation from a given URL or local file.",
+        "",
+        "Use this tool after list_doc_sources to:",
+        "1. First fetch the llms.txt file from a documentation source",
+        "2. Analyze the URLs listed in the llms.txt file",
+        "3. Then fetch specific documentation pages relevant to the user's question",
+        "",
+    ]
+
+    if has_local_sources:
+        description.extend(
+            [
+                "Args:",
+                "    url: The URL or file path to fetch documentation from. Can be:",
+                "    - URL from an allowed domain",
+                "    - A local file path (absolute or relative)",
+                "    - A file:// URL (e.g., file:///path/to/llms.txt)",
+            ]
+        )
+    else:
+        description.extend(
+            [
+                "Args:",
+                "    url: The URL to fetch documentation from.",
+            ]
+        )
+
+    description.extend(
+        [
+            "",
+            "Returns:",
+            "    The fetched documentation content converted to markdown, or an error message",  # noqa: E501
+            "    if the request fails or the URL is not from an allowed domain.",
+        ]
+    )
+
+    return "\n".join(description)
+
+
+def _normalize_path(path: str) -> str:
+    """Accept paths in file:/// or relative format and map to absolute paths."""
+    return (
+        os.path.abspath(path[7:])
+        if path.startswith("file://")
+        else os.path.abspath(path)
+    )
+
+
 def create_server(
-    doc_source: list[DocSource],
+    doc_sources: list[DocSource],
     *,
     follow_redirects: bool = False,
     timeout: float = 10,
```
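These new helpers are pure functions, so their behavior is easy to check in isolation; for instance (example values are illustrative):

```python
from mcpdoc.main import _get_fetch_description, _is_http_or_https, _normalize_path

# _is_http_or_https gates which sources are treated as remote:
assert _is_http_or_https("https://example.com/llms.txt") is True
assert _is_http_or_https("file:///docs/llms.txt") is False

# _normalize_path strips a file:// prefix and resolves to an absolute path
# (POSIX-style paths assumed here):
assert _normalize_path("file:///docs/llms.txt") == "/docs/llms.txt"

# The fetch_docs tool description adapts to whether any local sources exist:
print(_get_fetch_description(has_local_sources=True))   # mentions file paths / file:// URLs
print(_get_fetch_description(has_local_sources=False))  # URL-only variant
```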
```diff
@@ -45,7 +102,7 @@ def create_server(
     """Create the server and generate documentation retrieval tools.
 
     Args:
-        doc_source: List of documentation sources to make available
+        doc_sources: List of documentation sources to make available
         follow_redirects: Whether to follow HTTP redirects when fetching docs
         timeout: HTTP request timeout in seconds
         settings: Additional settings to pass to FastMCP
```
```diff
@@ -68,61 +125,95 @@ def create_server(
     )
     httpx_client = httpx.AsyncClient(follow_redirects=follow_redirects, timeout=timeout)
 
-    @server.tool()
-    def list_doc_sources() -> str:
-        """List all available documentation sources.
-
-        This is the first tool you should call in the documentation workflow.
-        It provides URLs to llms.txt files that the user has made available.
-
-        Returns:
-            A string containing a formatted list of documentation sources with their URLs
-        """
-        content = ""
-        for entry in doc_source:
-            name = entry.get("name", "") or extract_domain(entry["llms_txt"])
-            content += f"{name}\n"
-            content += "URL: " + entry["llms_txt"] + "\n\n"
-        return content
+    local_sources = []
+    remote_sources = []
+
+    for entry in doc_sources:
+        url = entry["llms_txt"]
+        if _is_http_or_https(url):
+            remote_sources.append(entry)
+        else:
+            local_sources.append(entry)
+
+    # Let's verify that all local sources exist
+    for entry in local_sources:
+        path = entry["llms_txt"]
+        abs_path = _normalize_path(path)
+        if not os.path.exists(abs_path):
+            raise FileNotFoundError(f"Local file not found: {abs_path}")
 
-    # Parse the domain names in the llms.txt URLs
-    domains = set(extract_domain(entry["llms_txt"]) for entry in doc_source)
+    # Parse the domain names in the llms.txt URLs and identify local file paths
+    domains = set(extract_domain(entry["llms_txt"]) for entry in remote_sources)
 
-    # Add additional allowed domains if specified
+    # Add additional allowed domains if specified, or set to '*' if we have local files
     if allowed_domains:
         if "*" in allowed_domains:
             domains = {"*"}  # Special marker for allowing all domains
         else:
             domains.update(allowed_domains)
 
+    allowed_local_files = set(
+        _normalize_path(entry["llms_txt"]) for entry in local_sources
+    )
+
     @server.tool()
-    async def fetch_docs(url: str) -> str:
-        """Fetch and parse documentation from a given URL.
+    def list_doc_sources() -> str:
+        """List all available documentation sources.
 
-        Use this tool after list_doc_sources to:
-        1. First fetch the llms.txt file from a documentation source
-        2. Analyze the URLs listed in the llms.txt file
-        3. Then fetch specific documentation pages relevant to the user's question
-
-        Args:
-            url: The URL to fetch documentation from. Must be from an allowed domain.
+        This is the first tool you should call in the documentation workflow.
+        It provides URLs to llms.txt files or local file paths that the user has made available.
 
         Returns:
-            The fetched documentation content converted to markdown, or an error message
-            if the request fails or the URL is not from an allowed domain.
+            A string containing a formatted list of documentation sources with their URLs or file paths
         """
-        nonlocal domains
-        if "*" not in domains and not any(url.startswith(domain) for domain in domains):
-            return (
-                "Error: URL not allowed. Must start with one of the following domains: "
-                + ", ".join(domains)
-            )
+        content = ""
+        for entry_ in doc_sources:
+            url_or_path = entry_["llms_txt"]
 
-        try:
-            response = await httpx_client.get(url, timeout=timeout)
-            response.raise_for_status()
-            return markdownify(response.text)
-        except (httpx.HTTPStatusError, httpx.RequestError) as e:
-            return f"Encountered an HTTP error with code {e.response.status_code}"
+            if _is_http_or_https(url_or_path):
+                name = entry_.get("name", extract_domain(url_or_path))
+                content += f"{name}\nURL: {url_or_path}\n\n"
+            else:
+                path = _normalize_path(url_or_path)
+                name = entry_.get("name", path)
+                content += f"{name}\nPath: {path}\n\n"
+        return content
+
+    fetch_docs_description = _get_fetch_description(
+        has_local_sources=bool(local_sources)
+    )
+
+    @server.tool(description=fetch_docs_description)
+    async def fetch_docs(url: str) -> str:
+        nonlocal domains
+        # Handle local file paths (either as file:// URLs or direct filesystem paths)
+        if not _is_http_or_https(url):
+            abs_path = _normalize_path(url)
+            if abs_path not in allowed_local_files:
+                raise ValueError(
+                    f"Local file not allowed: {abs_path}. Allowed files: {allowed_local_files}"
+                )
+            try:
+                with open(abs_path, "r", encoding="utf-8") as f:
+                    content = f.read()
+                return markdownify(content)
+            except Exception as e:
+                return f"Error reading local file: {str(e)}"
+        else:
+            # Otherwise treat as URL
+            if "*" not in domains and not any(
+                url.startswith(domain) for domain in domains
+            ):
+                return (
+                    "Error: URL not allowed. Must start with one of the following domains: "
+                    + ", ".join(domains)
+                )
 
+            try:
+                response = await httpx_client.get(url, timeout=timeout)
+                response.raise_for_status()
+                return markdownify(response.text)
+            except (httpx.HTTPStatusError, httpx.RequestError) as e:
+                return f"Encountered an HTTP error: {str(e)}"
 
     return server
```
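Taken together, a minimal sketch of standing up a server over one remote and one local source; the names and paths here are illustrative, the local file must actually exist (otherwise `create_server` now raises `FileNotFoundError`), and the final line assumes the returned FastMCP instance is run the usual way over stdio:

```python
from mcpdoc.main import create_server

server = create_server(
    [
        {
            "name": "LangGraph",
            "llms_txt": "https://langchain-ai.github.io/langgraph/llms.txt",
        },
        # Local sources are served only if their normalized path was registered:
        {"name": "LocalDocs", "llms_txt": "/path/to/llms.txt"},
    ],
    follow_redirects=True,
    timeout=10.0,
)
# MCP clients typically talk to the server over stdio:
server.run(transport="stdio")
```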
tests/unit_tests/test_main.py (new file, 71 lines)

```python
"""Tests for mcpdoc.main module."""

import pytest

from mcpdoc.main import (
    _get_fetch_description,
    _is_http_or_https,
    extract_domain,
)


def test_extract_domain() -> None:
    """Test extract_domain function."""
    # Test with https URL
    assert extract_domain("https://example.com/page") == "https://example.com/"

    # Test with http URL
    assert extract_domain("http://test.org/docs/index.html") == "http://test.org/"

    # Test with URL that has port
    assert extract_domain("https://localhost:8080/api") == "https://localhost:8080/"

    # Check trailing slash
    assert extract_domain("https://localhost:8080") == "https://localhost:8080/"

    # Test with URL that has subdomain
    assert extract_domain("https://docs.python.org/3/") == "https://docs.python.org/"


@pytest.mark.parametrize(
    "url,expected",
    [
        ("http://example.com", True),
        ("https://example.com", True),
        ("/path/to/file.txt", False),
        ("file:///path/to/file.txt", False),
        (
            "ftp://example.com",
            False,
        ),  # Not HTTP or HTTPS, even though it's not a local file
    ],
)
def test_is_http_or_https(url, expected):
    """Test _is_http_or_https function."""
    assert _is_http_or_https(url) is expected


@pytest.mark.parametrize(
    "has_local_sources,expected_substrings",
    [
        (True, ["local file path", "file://"]),
        (False, ["URL to fetch"]),
    ],
)
def test_get_fetch_description(has_local_sources, expected_substrings):
    """Test _get_fetch_description function."""
    description = _get_fetch_description(has_local_sources)

    # Common assertions for both cases
    assert "Fetch and parse documentation" in description
    assert "Returns:" in description

    # Specific assertions based on has_local_sources
    for substring in expected_substrings:
        if has_local_sources:
            assert substring in description
        else:
            # For the False case, we only check that "local file path"
            # and "file://" are NOT present
            if substring in ["local file path", "file://"]:
                assert substring not in description
```
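`_normalize_path` has no dedicated test in this commit; a hedged sketch of what one could look like (not part of the diff, POSIX paths assumed):

```python
import os

from mcpdoc.main import _normalize_path


def test_normalize_path() -> None:
    """Sketch: file:// URLs and relative paths both map to absolute paths."""
    assert _normalize_path("file:///docs/llms.txt") == "/docs/llms.txt"
    assert _normalize_path("docs/llms.txt") == os.path.abspath("docs/llms.txt")
    assert os.path.isabs(_normalize_path("./llms.txt"))
```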