mirror of
https://github.com/unclecode/crawl4ai.git
synced 2024-12-22 15:52:24 +03:00
refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml
This commit is contained in:
1
MANIFEST.in
Normal file
1
MANIFEST.in
Normal file
@@ -0,0 +1 @@
|
||||
include requirements.txt
|
||||
@@ -1,48 +0,0 @@
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import sys
|
||||
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
|
||||
PLUGIN = "CustomBuildHook"
|
||||
|
||||
class CustomBuildHook(BuildHookInterface):
    """Build hook that prepares the local Crawl4AI working environment.

    ``initialize`` recreates the ``.crawl4ai`` directory tree, then tries to
    install the Playwright browsers and to initialise the database. The last
    two steps are best-effort: a failure is reported on stdout and does not
    abort the build.
    """

    def initialize(self, version, build_data):
        """Create the folder structure and run the best-effort setup steps.

        Args:
            version: Unused by this hook.
            build_data: Unused by this hook.
        """
        # Create the .crawl4ai folder structure, rooted at
        # CRAWL4_AI_BASE_DIRECTORY when set, otherwise at the home directory.
        base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
        crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
        cache_folder = crawl4ai_folder / "cache"
        content_folders = [
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ]

        # Clean up old cache if exists
        if cache_folder.exists():
            shutil.rmtree(cache_folder)

        # Create new folder structure. parents=True so that a base directory
        # which does not exist yet is created instead of raising
        # FileNotFoundError.
        crawl4ai_folder.mkdir(parents=True, exist_ok=True)
        cache_folder.mkdir(exist_ok=True)
        for folder in content_folders:
            (crawl4ai_folder / folder).mkdir(exist_ok=True)

        # Install Playwright browsers (best-effort: only warn on failure)
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

        # Initialize database (best-effort: only warn on failure). The imports
        # are local so that an import failure is handled like any other error.
        try:
            from crawl4ai.async_database import async_db_manager
            import asyncio

            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")
|
||||
@@ -1,9 +0,0 @@
|
||||
from colorama import Fore, Style
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
def post_install():
    """Print a coloured banner asking the user to install the Playwright browsers."""
    rule = "=" * 40
    banner_rows = (
        f"\n{Fore.YELLOW}{rule}",
        f"{Fore.RED}IMPORTANT: Run this command now:",
        f"{Fore.GREEN}python -m playwright install",
        f"{Fore.YELLOW}{rule}{Style.RESET_ALL}\n",
    )
    # One print per row, so the emitted text matches line for line.
    for row in banner_rows:
        print(row)
|
||||
@@ -1,19 +0,0 @@
|
||||
from colorama import Fore, Style
|
||||
import subprocess
|
||||
import sys
|
||||
import distutils.log as log
|
||||
from pathlib import Path
|
||||
|
||||
def main():
    """Install the Playwright browsers, or tell the user how to do it manually.

    Runs ``python -m playwright install`` with the current interpreter and
    with its output discarded. If that fails for any ordinary reason, a
    coloured banner with the manual command is printed instead of raising.
    """
    try:
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt and
        # SystemExit must propagate so the user can still abort the install.
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")


if __name__ == "__main__":
    main()
|
||||
@@ -1,75 +0,0 @@
|
||||
[build-system]
|
||||
requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "Crawl4AI"
|
||||
dynamic = ["version"]
|
||||
description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
readme = "README.md"
|
||||
license = "Apache-2.0"
|
||||
requires-python = ">=3.7"
|
||||
authors = [
|
||||
{ name = "Unclecode", email = "unclecode@kidocode.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
]
|
||||
dependencies = [
|
||||
"aiosqlite~=0.20",
|
||||
"html2text~=2024.2",
|
||||
"lxml~=5.3",
|
||||
"litellm>=1.53.1",
|
||||
"numpy>=1.26.0,<3",
|
||||
"pillow~=10.4",
|
||||
"playwright>=1.49.0",
|
||||
"python-dotenv~=1.0",
|
||||
"requests~=2.26",
|
||||
"beautifulsoup4~=4.12",
|
||||
"tf-playwright-stealth>=1.1.0",
|
||||
"xxhash~=3.4",
|
||||
"rank-bm25~=0.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"colorama~=0.4",
|
||||
"snowballstemmer~=2.2",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"torch",
|
||||
"nltk",
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/unclecode/crawl4ai"
|
||||
Documentation = "https://crawl4ai.com/mkdocs/"
|
||||
|
||||
[project.scripts]
|
||||
crawl4ai-download-models = "crawl4ai.model_loader:main"
|
||||
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||
crawl4ai-post-install = "crawl4ai.post_install:main"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "crawl4ai/__version__.py"
|
||||
|
||||
[tool.hatch.build.hooks.custom]
|
||||
dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
|
||||
path = "build_hooks.py"
|
||||
|
||||
[project.entry-points.hatch]
|
||||
crawl4ai = "crawl4ai.plugin:post_install"
|
||||
16
requirements.txt
Normal file
16
requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
aiosqlite~=0.20
|
||||
html2text~=2024.2
|
||||
lxml~=5.3
|
||||
litellm>=1.53.1
|
||||
numpy>=1.26.0,<3
|
||||
pillow~=10.4
|
||||
playwright>=1.49.0
|
||||
python-dotenv~=1.0
|
||||
requests~=2.26
|
||||
beautifulsoup4~=4.12
|
||||
tf-playwright-stealth>=1.1.0
|
||||
xxhash~=3.4
|
||||
rank-bm25~=0.2
|
||||
aiofiles>=24.1.0
|
||||
colorama~=0.4
|
||||
snowballstemmer~=2.2
|
||||
136
setup.py
Normal file
136
setup.py
Normal file
@@ -0,0 +1,136 @@
|
||||
from setuptools import setup, find_packages
|
||||
from setuptools.command.install import install
|
||||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
|
||||
# Create the .crawl4ai folder under CRAWL4_AI_BASE_DIRECTORY when that
# variable is set, otherwise under the user's home directory.
# If a cache folder already exists there, it is removed and recreated empty.
# NOTE: this runs at import time, i.e. on every invocation of this script.
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
]

# Clean up old cache if exists
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# Create new folder structure. parents=True so that a base directory which
# does not exist yet is created instead of raising FileNotFoundError.
crawl4ai_folder.mkdir(parents=True, exist_ok=True)
cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
    (crawl4ai_folder / folder).mkdir(exist_ok=True)
|
||||
|
||||
# Read requirements and version. Both files are resolved relative to this
# script's own directory so the result does not depend on the caller's cwd.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

with open(os.path.join(__location__, "requirements.txt"), encoding="utf-8") as f:
    # Blank lines and comment lines are not valid requirement specifiers.
    requirements = [
        line.strip()
        for line in f
        if line.strip() and not line.strip().startswith("#")
    ]

version = None
with open(os.path.join(__location__, "crawl4ai", "__version__.py"), encoding="utf-8") as f:
    for line in f:
        if line.startswith("__version__"):
            # Accept both quote styles: __version__ = "x.y.z" or 'x.y.z'
            version = line.split("=", 1)[1].strip().strip("\"'")
            break

if version is None:
    # Fail here with a clear message rather than with a NameError later on.
    raise RuntimeError("Unable to find __version__ in crawl4ai/__version__.py")

# Define requirements
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
|
||||
|
||||
|
||||
def install_playwright():
    """Install the Playwright browsers using the current interpreter.

    Progress and the outcome are printed to stdout. Failures are reported
    together with the manual command and are not re-raised.
    """
    manual_hint = (
        "Please run 'python -m playwright install' manually after the installation."
    )
    install_cmd = [sys.executable, "-m", "playwright", "install"]

    print("Installing Playwright browsers...")
    try:
        subprocess.check_call(install_cmd)
        print("Playwright installation completed successfully.")
    except Exception as exc:
        # A non-zero exit status is reported as a plain error; anything else
        # (for example a failure to launch the process) as unexpected.
        if isinstance(exc, subprocess.CalledProcessError):
            label = "Error"
        else:
            label = "Unexpected error"
        print(f"{label} during Playwright installation: {exc}")
        print(manual_hint)
|
||||
|
||||
|
||||
def run_migration():
    """Try to initialise the database at install time.

    Progress is printed to stdout. If the database module cannot be imported
    or initialisation fails, a warning is printed and the function returns
    normally.
    """
    try:
        print("Starting database initialization...")
        # Imported lazily: the package may not be importable at this point.
        from crawl4ai.async_database import async_db_manager as db_manager

        init_coro = db_manager.initialize()
        asyncio.run(init_coro)
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as exc:
        print(f"Warning: Database initialization failed: {exc}")
        print("Database will be initialized on first use")
|
||||
|
||||
|
||||
class PostInstallCommand(install):
    """``install`` command that also installs the Playwright browsers."""

    def run(self):
        """Run the regular install step, then install the Playwright browsers."""
        super().run()
        install_playwright()
        # run_migration() is currently disabled in this command.
|
||||
|
||||
|
||||
# Read the long description up front so the file handle is closed promptly.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    # Matches the "Apache Software License" classifier below; the previous
    # value "MIT" contradicted it.
    license="Apache-2.0",
    packages=find_packages(),
    install_requires=default_requirements
    + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    # NOTE(review): some requirements read from requirements.txt (e.g.
    # numpy>=1.26) may not support Python 3.7 - confirm this floor.
    python_requires=">=3.7",
    cmdclass={
        # Hook the Playwright browser download into "install".
        "install": PostInstallCommand,
    },
)
|
||||
Reference in New Issue
Block a user