refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml

This commit is contained in:
UncleCode
2024-11-29 16:01:19 +08:00
parent 449dd7cc0b
commit 12e73d4898
8 changed files with 155 additions and 151 deletions

1
MANIFEST.in Normal file
View File

@@ -0,0 +1 @@
include requirements.txt

View File

@@ -1,48 +0,0 @@
import os
import shutil
from pathlib import Path
import subprocess
import sys
from hatchling.builders.hooks.plugin.interface import BuildHookInterface
PLUGIN = "CustomBuildHook"


class CustomBuildHook(BuildHookInterface):
    """Hatchling build hook: lays out the ~/.crawl4ai data directory,
    installs Playwright browsers, and best-effort seeds the crawl database.
    """

    def initialize(self, version, build_data):
        # Resolve the data root; CRAWL4_AI_BASE_DIRECTORY overrides $HOME.
        override = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        root = Path(override) if override else Path.home()
        data_dir = root / ".crawl4ai"
        cache_dir = data_dir / "cache"

        # Wipe any stale cache left over from a previous install.
        if cache_dir.exists():
            shutil.rmtree(cache_dir)

        # Recreate the expected folder layout.
        data_dir.mkdir(exist_ok=True)
        cache_dir.mkdir(exist_ok=True)
        for name in (
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ):
            (data_dir / name).mkdir(exist_ok=True)

        # Fetch Playwright browsers; a failure is only warned about so the
        # build itself still succeeds.
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

        # Seed the database if the package is importable; otherwise defer.
        try:
            from crawl4ai.async_database import async_db_manager
            import asyncio

            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")

View File

@@ -1,9 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys
def post_install():
    """Print a colored banner reminding the user to install Playwright browsers."""
    rule = f"{Fore.YELLOW}{'='*40}"
    print(f"\n{rule}")
    print(f"{Fore.RED}IMPORTANT: Run this command now:")
    print(f"{Fore.GREEN}python -m playwright install")
    print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")

View File

@@ -1,19 +0,0 @@
from colorama import Fore, Style
import subprocess
import sys
import distutils.log as log
from pathlib import Path
def main():
    """Silently try `playwright install`; on any failure, show the manual command.

    Output of the subprocess is discarded so a successful run stays quiet.
    """
    try:
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Was a bare `except:`, which would also swallow KeyboardInterrupt
        # and SystemExit; Exception still covers CalledProcessError and
        # OSError (missing interpreter/module).
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")


if __name__ == "__main__":
    main()

View File

@@ -1,75 +0,0 @@
# Hatchling build backend; hatch-fancy-pypi-readme supports dynamic README use.
[build-system]
requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
build-backend = "hatchling.build"
# PEP 621 project metadata; version is dynamic, sourced from
# crawl4ai/__version__.py via [tool.hatch.version] below.
[project]
name = "Crawl4AI"
dynamic = ["version"]
description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
license = "Apache-2.0"
requires-python = ">=3.7"
authors = [
{ name = "Unclecode", email = "unclecode@kidocode.com" },
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
]
# Core runtime dependencies (duplicated in requirements.txt).
dependencies = [
"aiosqlite~=0.20",
"html2text~=2024.2",
"lxml~=5.3",
"litellm>=1.53.1",
"numpy>=1.26.0,<3",
"pillow~=10.4",
"playwright>=1.49.0",
"python-dotenv~=1.0",
"requests~=2.26",
"beautifulsoup4~=4.12",
"tf-playwright-stealth>=1.1.0",
"xxhash~=3.4",
"rank-bm25~=0.2",
"aiofiles>=24.1.0",
"colorama~=0.4",
"snowballstemmer~=2.2",
]
# Optional feature sets installable as crawl4ai[torch], crawl4ai[all], etc.
[project.optional-dependencies]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
"torch",
"nltk",
"scikit-learn",
"transformers",
"tokenizers",
"selenium",
]
[project.urls]
Homepage = "https://github.com/unclecode/crawl4ai"
Documentation = "https://crawl4ai.com/mkdocs/"
# Console scripts installed alongside the package.
[project.scripts]
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-post-install = "crawl4ai.post_install:main"
# Version string is read from this file at build time.
[tool.hatch.version]
path = "crawl4ai/__version__.py"
# Custom build hook (build_hooks.py) runs folder setup / Playwright install.
[tool.hatch.build.hooks.custom]
dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
path = "build_hooks.py"
[project.entry-points.hatch]
crawl4ai = "crawl4ai.plugin:post_install"

16
requirements.txt Normal file
View File

@@ -0,0 +1,16 @@
# Runtime dependencies; read by setup.py at build time (via requirements.txt).
aiosqlite~=0.20
html2text~=2024.2
lxml~=5.3
litellm>=1.53.1
numpy>=1.26.0,<3
pillow~=10.4
playwright>=1.49.0
python-dotenv~=1.0
requests~=2.26
beautifulsoup4~=4.12
tf-playwright-stealth>=1.1.0
xxhash~=3.4
rank-bm25~=0.2
aiofiles>=24.1.0
colorama~=0.4
snowballstemmer~=2.2

2
setup.cfg Normal file
View File

@@ -0,0 +1,2 @@
# Ship non-Python files listed in MANIFEST.in (requirements.txt) with the package.
[options]
include_package_data = True

136
setup.py Normal file
View File

@@ -0,0 +1,136 @@
from setuptools import setup, find_packages
from setuptools.command.install import install
import os
from pathlib import Path
import shutil
import subprocess
import sys
import asyncio
# --- Prepare the .crawl4ai data directory --------------------------------
# Root can be overridden with CRAWL4_AI_BASE_DIRECTORY; defaults to $HOME.
# The stale cache folder is removed so cached pages don't survive upgrades.
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
]

# Clean up old cache if exists
if cache_folder.exists():
    shutil.rmtree(cache_folder)

# parents=True so a not-yet-existing custom base directory is created
# instead of raising FileNotFoundError (exist_ok keeps reruns idempotent).
crawl4ai_folder.mkdir(parents=True, exist_ok=True)
cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
    (crawl4ai_folder / folder).mkdir(exist_ok=True)

# --- Read requirements and version ---------------------------------------
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()

# Resolve the version file relative to setup.py as well: the original used a
# cwd-relative path, which breaks when pip builds from another directory.
version = None
with open(os.path.join(__location__, "crawl4ai", "__version__.py")) as f:
    for line in f:
        if line.startswith("__version__"):
            version = line.split("=")[1].strip().strip('"')
            break
if version is None:
    # Fail loudly here rather than with a confusing NameError in setup().
    raise RuntimeError("__version__ not found in crawl4ai/__version__.py")

# Requirement groups (mirroring the optional-dependency sets of the removed
# pyproject.toml).
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
def install_playwright():
    """Run `playwright install` for the current interpreter, reporting progress.

    Any failure is reported but not raised, so package installation proceeds.
    """
    print("Installing Playwright browsers...")
    command = [sys.executable, "-m", "playwright", "install"]
    manual_hint = (
        "Please run 'python -m playwright install' manually after the installation."
    )
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as e:
        # The subprocess ran but exited non-zero.
        print(f"Error during Playwright installation: {e}")
        print(manual_hint)
    except Exception as e:
        # Anything else (e.g. the interpreter could not be spawned).
        print(f"Unexpected error during Playwright installation: {e}")
        print(manual_hint)
    else:
        print("Playwright installation completed successfully.")
def run_migration():
    """Best-effort database initialization during install.

    Every failure mode only prints a warning; the database is then created
    lazily on first use instead.
    """
    try:
        print("Starting database initialization...")
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        print("Database initialization completed successfully.")
    except ImportError:
        # The package isn't importable yet at install time.
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as exc:
        print(f"Warning: Database initialization failed: {exc}")
        print("Database will be initialized on first use")
class PostInstallCommand(install):
    """`install` command variant that fetches Playwright browsers afterwards."""

    def run(self):
        super().run()
        install_playwright()
        # run_migration()
# Packaging manifest. The license string now matches both the Apache
# classifier below and the removed pyproject.toml (`license = "Apache-2.0"`);
# it previously said "MIT".
setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    # Path.read_text closes the file; the bare open() used before leaked it.
    long_description=Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    license="Apache-2.0",
    packages=find_packages(),
    # playwright and aiofiles are already pinned in requirements.txt, so the
    # former `+ ["playwright", "aiofiles"]` only introduced duplicates.
    install_requires=default_requirements,
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    # Hook the custom post-install step (Playwright browser download).
    cmdclass={
        "install": PostInstallCommand,
    },
)