refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml

This commit is contained in:
UncleCode
2024-11-29 16:01:19 +08:00
parent 449dd7cc0b
commit 12e73d4898
8 changed files with 155 additions and 151 deletions

1
MANIFEST.in Normal file
View File

@@ -0,0 +1 @@
include requirements.txt

View File

@@ -1,48 +0,0 @@
import os
import shutil
from pathlib import Path
import subprocess
import sys
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
PLUGIN = "CustomBuildHook"


class CustomBuildHook(BuildHookInterface):
    """Hatchling build hook: lays out the ~/.crawl4ai data directory,
    installs Playwright browsers, and best-effort seeds the crawl database.
    """

    def initialize(self, version, build_data):
        # Resolve the data root; CRAWL4_AI_BASE_DIRECTORY overrides $HOME.
        override = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        root = Path(override) if override else Path.home()
        data_dir = root / ".crawl4ai"
        cache_dir = data_dir / "cache"

        # Wipe any stale cache left over from a previous install.
        if cache_dir.exists():
            shutil.rmtree(cache_dir)

        # Recreate the expected folder layout.
        data_dir.mkdir(exist_ok=True)
        cache_dir.mkdir(exist_ok=True)
        for name in (
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ):
            (data_dir / name).mkdir(exist_ok=True)

        # Fetch Playwright browsers; a failure is only warned about so the
        # build itself still succeeds.
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

        # Seed the database if the package is importable; otherwise defer.
        try:
            from crawl4ai.async_database import async_db_manager
            import asyncio

            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")

View File

@@ -1,9 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys
def post_install():
    """Print a colored banner reminding the user to install Playwright browsers."""
    rule = f"{Fore.YELLOW}{'='*40}"
    print(f"\n{rule}")
    print(f"{Fore.RED}IMPORTANT: Run this command now:")
    print(f"{Fore.GREEN}python -m playwright install")
    print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")

View File

@@ -1,19 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys
import distutils.log as log
from pathlib import Path
def main():
    """Silently try `playwright install`; on any failure, show the manual command.

    Output of the subprocess is discarded so a successful run stays quiet.
    """
    try:
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; Exception still covers CalledProcessError and
        # OSError (missing interpreter/module).
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")


if __name__ == "__main__":
    main()

View File

@@ -1,75 +0,0 @@
# Hatchling build backend; hatch-fancy-pypi-readme supports dynamic README use.
[build-system]
requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
build-backend = "hatchling.build"
# PEP 621 project metadata; version is dynamic, sourced from
# crawl4ai/__version__.py via [tool.hatch.version] below.
[project]
name = "Crawl4AI"
dynamic = ["version"]
description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.7"
authors = [
{ name = "Unclecode", email = "unclecode@kidocode.com" },
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
]
# Core runtime dependencies (duplicated in requirements.txt).
dependencies = [
"aiosqlite~=0.20",
"html2text~=2024.2",
"lxml~=5.3",
"litellm>=1.53.1",
"numpy>=1.26.0,<3",
"pillow~=10.4",
"playwright>=1.49.0",
"python-dotenv~=1.0",
"requests~=2.26",
"beautifulsoup4~=4.12",
"tf-playwright-stealth>=1.1.0",
"xxhash~=3.4",
"rank-bm25~=0.2",
"aiofiles>=24.1.0",
"colorama~=0.4",
"snowballstemmer~=2.2",
]
# Optional feature sets installable as crawl4ai[torch], crawl4ai[all], etc.
[project.optional-dependencies]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
"torch",
"nltk",
"scikit-learn",
"transformers",
"tokenizers",
"selenium",
]
[project.urls]
Homepage = "https://github.com/unclecode/crawl4ai"
Documentation = "https://crawl4ai.com/mkdocs/"
# Console scripts installed alongside the package.
[project.scripts]
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-post-install = "crawl4ai.post_install:main"
# Version string is read from this file at build time.
[tool.hatch.version]
path = "crawl4ai/__version__.py"
# Custom build hook (build_hooks.py) runs folder setup / Playwright install.
[tool.hatch.build.hooks.custom]
dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
path = "build_hooks.py"
[project.entry-points.hatch]
crawl4ai = "crawl4ai.plugin:post_install"

16
requirements.txt Normal file
View File

@@ -0,0 +1,16 @@
# Runtime dependencies; read by setup.py at build time (via requirements.txt).
aiosqlite~=0.20
html2text~=2024.2
lxml~=5.3
litellm>=1.53.1
numpy>=1.26.0,<3
pillow~=10.4
playwright>=1.49.0
python-dotenv~=1.0
requests~=2.26
beautifulsoup4~=4.12
tf-playwright-stealth>=1.1.0
xxhash~=3.4
rank-bm25~=0.2
aiofiles>=24.1.0
colorama~=0.4
snowballstemmer~=2.2

2
setup.cfg Normal file
View File

@@ -0,0 +1,2 @@
# Ship non-Python files listed in MANIFEST.in (requirements.txt) with the package.
[options]
include_package_data = True

136
setup.py Normal file
View File

@@ -0,0 +1,136 @@
from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil
import subprocess
import sys
import asyncio
# --- Prepare the .crawl4ai data directory --------------------------------
# Root can be overridden with CRAWL4_AI_BASE_DIRECTORY; defaults to $HOME.
# The stale cache folder is removed so cached pages don't survive upgrades.
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
]

# Clean up old cache if exists
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# parents=True so a not-yet-existing custom base directory is created
# instead of raising FileNotFoundError (exist_ok keeps reruns idempotent).
crawl4ai_folder.mkdir(parents=True, exist_ok=True)
cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
    (crawl4ai_folder / folder).mkdir(exist_ok=True)

# --- Read requirements and version ---------------------------------------
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()

# Resolve the version file relative to setup.py as well: the original used a
# cwd-relative path, which breaks when pip builds from another directory.
version = None
with open(os.path.join(__location__, "crawl4ai", "__version__.py")) as f:
    for line in f:
        if line.startswith("__version__"):
            version = line.split("=")[1].strip().strip('"')
            break
if version is None:
    # Fail loudly here rather than with a confusing NameError in setup().
    raise RuntimeError("__version__ not found in crawl4ai/__version__.py")

# Requirement groups (mirroring the optional-dependency sets of the removed
# pyproject.toml).
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
def install_playwright():
    """Run `playwright install` for the current interpreter, reporting progress.

    Any failure is reported but not raised, so package installation proceeds.
    """
    print("Installing Playwright browsers...")
    command = [sys.executable, "-m", "playwright", "install"]
    manual_hint = (
        "Please run 'python -m playwright install' manually after the installation."
    )
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as e:
        # The subprocess ran but exited non-zero.
        print(f"Error during Playwright installation: {e}")
        print(manual_hint)
    except Exception as e:
        # Anything else (e.g. the interpreter could not be spawned).
        print(f"Unexpected error during Playwright installation: {e}")
        print(manual_hint)
    else:
        print("Playwright installation completed successfully.")
def run_migration():
    """Best-effort database initialization during install.

    Every failure mode only prints a warning; the database is then created
    lazily on first use instead.
    """
    try:
        print("Starting database initialization...")
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        print("Database initialization completed successfully.")
    except ImportError:
        # The package isn't importable yet at install time.
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as exc:
        print(f"Warning: Database initialization failed: {exc}")
        print("Database will be initialized on first use")
class PostInstallCommand(install):
    """`install` command variant that fetches Playwright browsers afterwards."""

    def run(self):
        super().run()
        install_playwright()
        # run_migration()
# Packaging manifest. The license string now matches both the Apache
# classifier below and the removed pyproject.toml (`license = "Apache-2.0"`);
# it previously said "MIT".
setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    # Path.read_text closes the file; the bare open() used before leaked it.
    long_description=Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    license="Apache-2.0",
    packages=find_packages(),
    # playwright and aiofiles are already pinned in requirements.txt, so the
    # former `+ ["playwright", "aiofiles"]` only introduced duplicates.
    install_requires=default_requirements,
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    # Hook the custom post-install step (Playwright browser download).
    cmdclass={
        "install": PostInstallCommand,
    },
)