packaging

2025-10-22 23:19:46 +03:00 · 2025-10-11 21:33:12 +08:00
parent 60201d365f
commit a391badfe1
37 changed files with 282 additions and 98 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,7 @@
+include README.md
+include LICENSE
+include requirements.txt
+recursive-include longcodezip *.py
+recursive-exclude * __pycache__
+recursive-exclude * *.py[co]
+
--- a/README.md
+++ b/README.md
@@ -21,37 +21,20 @@ LongCodeZip introduces a two-stage code compression framework specifically desig

 The method is plug-and-play and can be integrated with existing code LLMs to achieve significant compression ratios while maintaining or improving task performance.

-## Repository Structure
-
-This repository contains implementations and experiments for three code-related tasks:
-
-```
-LongCodeZip/
-├── repo-qa/                   # Code Retrieval Task
-│   ├── main.py               # Main evaluation script
-│   ├── run.sh                # Experiment runner
-│   ├── code_compressor.py    # Core compression implementation
-│   ├── compute_score.py      # Evaluation metrics
-│   └── ...
-├── long-code-completion/      # Code Completion Task
-│   ├── main.py               # Main evaluation script
-│   ├── run.sh                # Experiment runner
-│   ├── code_compressor.py    # Core compression implementation
-│   ├── utils.py              # Utility functions
-│   └── ...
-├── module-summarization/      # Code Summarization Task
-│   ├── main.py               # Main evaluation script
-│   ├── run.sh                # Experiment runner
-│   ├── code_compressor.py    # Core compression implementation
-│   ├── utils.py              # Utility functions
-│   └── ...
-└── README.md
-```
-
 ## Installation

+You can install directly from the GitHub repository:
+
 ```bash
-pip install -r requirements.txt
+pip install git+https://github.com/YerbaPage/LongCodeZip.git
+```
+
+Or clone and install in development mode:
+
+```bash
+git clone https://github.com/YerbaPage/LongCodeZip.git
+cd LongCodeZip
+pip install -e .
 ```

 ## Quick Demo
@@ -62,36 +45,21 @@ We provide a simple demo (`demo.py`) to help you get started with LongCodeZip:
 python demo.py
 ```

-This demo showcases the core compression functionality by compressing a simple code snippet containing multiple functions (add, quick_sort, search_with_binary_search) based on a query about quick sort. The compressor will:
-1. Rank functions by relevance to the query
-2. Apply fine-grained compression to maximize information density
-3. Generate a compressed prompt suitable for code LLMs
-
-**Example output:**
-```python
-# Original: ~150 tokens
-# Compressed: ~64 tokens (target)
-# Selected: quick_sort function (most relevant to query)
-```
-
-## Core API Usage
-
-LongCodeZip provides a simple and powerful API for compressing long code contexts. Here's how to use it:
-
-### Basic Example
+## Basic Example

 ```python
-from longcodezip import CodeCompressor
+from longcodezip import LongCodeZip

 # Initialize the compressor
-compressor = CodeCompressor(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")
+compressor = LongCodeZip(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")

 # Compress code with a query
 result = compressor.compress_code_file(
-    code=your_code_string,
-    query="What does this function do?",
-    instruction="Answer the question based on the code.",
+    code=<your_code_string>,
+    query=<your_query>,
+    instruction=<your_instruction>,
    rate=0.5,  # Keep 50% of tokens
+    rank_only=False, # Set to True to only rank and select contexts without fine-grained compression
 )

 # Access compressed results
@@ -99,41 +67,6 @@ compressed_code = result['compressed_code']
 compressed_prompt = result['compressed_prompt']  # Full prompt with instruction
 compression_ratio = result['compression_ratio']
 ```
-## Usage
-
-### Quick Start
-
-Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
-
-```bash
-cd <task_directory>
-bash run.sh
-```
-
-### Code Retrieval (RepoQA)
-
-Navigate to the `repo-qa` directory and run experiments with different compression ratios:
-
-```bash
-cd repo-qa
-bash run.sh
-```
-
-The script will evaluate LongCodeZip on the RepoQA dataset with compression ratios, running experiments in parallel on multiple GPUs.
-
-**Key Parameters:**
-- `--compression-ratio`: Controls the compression level
-- `--model`: Specifies the base LLM model
-- `--backend`: Backend for model inference (vllm)
-
-### Code Completion
-
-Navigate to the `long-code-completion` directory:
-
-```bash
-cd long-code-completion
-bash run.sh
-```

 ## References

--- a/demo.py
+++ b/demo.py
@@ -1,4 +1,4 @@
-from longcodezip import CodeCompressor
+from longcodezip import LongCodeZip
 from loguru import logger

 if __name__ == "__main__":
@@ -16,7 +16,7 @@ if __name__ == "__main__":
    # Initialize compressor
    logger.info("Initializing compressor...")
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
-    compressor = CodeCompressor(model_name=model_name)
+    compressor = LongCodeZip(model_name=model_name)
    
    # Test function-based code file compression with query
    logger.info("\nTesting function-based code file compression with query...")
@@ -24,7 +24,7 @@ if __name__ == "__main__":
    original_tokens = len(compressor.tokenizer.encode(context))
    target_token = 64
    target_ratio = min(1.0, max(0.0, target_token / original_tokens))
-    logger.info(f"CodeCompressor: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
+    logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")

    logger.info("\nTesting compression with Coarse-grained compression only...")
    result_cond = compressor.compress_code_file(
@@ -35,6 +35,7 @@ if __name__ == "__main__":
        rank_only=True # Coarse-grained compression
    )
    logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
+    logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.3856

    logger.info("\nTesting compression with Coarse-grained and Fine-grained compression...")
    result_cond = compressor.compress_code_file(
@@ -44,4 +45,5 @@ if __name__ == "__main__":
        rate=target_ratio,
        rank_only=False # Corase-grained and Fine-grained compression
    )
-    logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
+    logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
+    logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.1468
--- a/experiments/README.md
+++ b/experiments/README.md
@@ -0,0 +1,37 @@
+# Experiments
+
+This folder contains the old experiments for the three code-related tasks. Some of the code may be outdated after the refactoring.
+
+### Quick Start
+
+Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
+
+```bash
+cd <task_directory>
+bash run.sh
+```
+
+### Code Retrieval (RepoQA)
+
+Navigate to the `repo-qa` directory and run experiments with different compression ratios:
+
+```bash
+cd repo-qa
+bash run.sh
+```
+
+The script will evaluate LongCodeZip on the RepoQA dataset with compression ratios, running experiments in parallel on multiple GPUs.
+
+**Key Parameters:**
+- `--compression-ratio`: Controls the compression level
+- `--model`: Specifies the base LLM model
+- `--backend`: Backend for model inference (vllm)
+
+### Code Completion
+
+Navigate to the `long-code-completion` directory:
+
+```bash
+cd long-code-completion
+bash run.sh
+```
--- a/experiments/long-code-completion/code_compressor.py
+++ b/experiments/long-code-completion/code_compressor.py
--- a/experiments/long-code-completion/compare_empty_line_handling.py
+++ b/experiments/long-code-completion/compare_empty_line_handling.py
--- a/experiments/long-code-completion/main.py
+++ b/experiments/long-code-completion/main.py
--- a/experiments/long-code-completion/run.sh
+++ b/experiments/long-code-completion/run.sh
--- a/experiments/long-code-completion/utils.py
+++ b/experiments/long-code-completion/utils.py
--- a/experiments/module-summarization/code_compressor.py
+++ b/experiments/module-summarization/code_compressor.py
--- a/experiments/module-summarization/main.py
+++ b/experiments/module-summarization/main.py
--- a/experiments/module-summarization/run.sh
+++ b/experiments/module-summarization/run.sh
--- a/experiments/module-summarization/utils.py
+++ b/experiments/module-summarization/utils.py
--- a/experiments/repo-qa/__init__.py
+++ b/experiments/repo-qa/__init__.py
--- a/experiments/repo-qa/code_compressor.py
+++ b/experiments/repo-qa/code_compressor.py
--- a/experiments/repo-qa/code_segment_extractor.py
+++ b/experiments/repo-qa/code_segment_extractor.py
--- a/experiments/repo-qa/compute_score.py
+++ b/experiments/repo-qa/compute_score.py
--- a/experiments/repo-qa/data.py
+++ b/experiments/repo-qa/data.py
--- a/experiments/repo-qa/main.py
+++ b/experiments/repo-qa/main.py
--- a/experiments/repo-qa/metric.py
+++ b/experiments/repo-qa/metric.py
--- a/experiments/repo-qa/provider/__init__.py
+++ b/experiments/repo-qa/provider/__init__.py
--- a/experiments/repo-qa/provider/anthropic.py
+++ b/experiments/repo-qa/provider/anthropic.py
--- a/experiments/repo-qa/provider/base.py
+++ b/experiments/repo-qa/provider/base.py
--- a/experiments/repo-qa/provider/google.py
+++ b/experiments/repo-qa/provider/google.py
--- a/experiments/repo-qa/provider/hf.py
+++ b/experiments/repo-qa/provider/hf.py
--- a/experiments/repo-qa/provider/openai.py
+++ b/experiments/repo-qa/provider/openai.py
--- a/experiments/repo-qa/provider/request/__init__.py
+++ b/experiments/repo-qa/provider/request/__init__.py
--- a/experiments/repo-qa/provider/request/anthropic.py
+++ b/experiments/repo-qa/provider/request/anthropic.py
--- a/experiments/repo-qa/provider/request/google.py
+++ b/experiments/repo-qa/provider/request/google.py
--- a/experiments/repo-qa/provider/request/openai.py
+++ b/experiments/repo-qa/provider/request/openai.py
--- a/experiments/repo-qa/provider/vllm.py
+++ b/experiments/repo-qa/provider/vllm.py
--- a/experiments/repo-qa/run.sh
+++ b/experiments/repo-qa/run.sh
--- a/experiments/repo-qa/utility.py
+++ b/experiments/repo-qa/utility.py
--- a/longcodezip/__init__.py
+++ b/longcodezip/__init__.py
@@ -11,6 +11,10 @@ import copy
 import bisect
 import json
 from loguru import logger
+import sys
+# set the level to info only, no need to show the debug messages
+logger.remove()
+logger.add(sys.stderr, level="INFO")

 class EntropyChunking:
    def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct"):
@@ -192,7 +196,7 @@ class EntropyChunking:
        
        return chunks, sentences, ppls, spike_indices

-class CodeCompressor:
+class LongCodeZip:
    def __init__(
        self,
        model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4",
@@ -200,7 +204,7 @@ class CodeCompressor:
        model_config: dict = {},
    ):
        """
-        Initialize the CodeCompressor with a language model for compression.
+        Initialize the LongCodeZip with a language model for compression.
        
        Args:
            model_name: The name of the model to load from HuggingFace
@@ -1803,4 +1807,93 @@ class CodeCompressor:
                selected.add(idx)
                current_weight += weight
        
-        return selected
+        return selected
+
+if __name__ == "__main__":  # Demo run: loads the model, compresses a sample snippet, then generates a completion
+
+    context = """
+    def add(a, b):
+        return a + b
+
+    def quick_sort(arr):
+        if len(arr) <= 1:
+            return arr
+        pivot = arr[len(arr) // 2]
+        left = [x for x in arr if x < pivot]
+        middle = [x for x in arr if x == pivot]
+        right = [x for x in arr if x > pivot]
+        return quick_sort(left) + middle + quick_sort(right)
+
+    def search_with_binary_search(arr, target):
+        left, right = 0, len(arr) - 1
+        while left <= right:
+            mid = (left + right) // 2
+            if arr[mid] == target:
+                return mid
+            elif arr[mid] < target:
+                left = mid + 1
+            else:
+                right = mid - 1
+        return -1
+    """
+
+    question = "How to write a quick sort algorithm?"
+   
+    # Initialize compressor
+    logger.info("Initializing compressor...")
+    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
+    compressor = LongCodeZip(model_name=model_name)
+    
+    # Test function-based code file compression with query
+    logger.info("\nTesting function-based code file compression with query...")
+
+    original_tokens = len(compressor.tokenizer.encode(context))
+    target_token = 64
+    target_ratio = min(1.0, max(0.0, target_token / original_tokens))
+    logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
+
+    result = compressor.compress_code_file(
+        code=context,
+        query=question, # Natural-language question used as the query
+        instruction="Complete the following code function given the context.",
+        rate=target_ratio,
+        rank_only=True, # Only use coarse-grained compression
+        fine_grained_importance_method="conditional_ppl", # Passed explicitly; presumably the default - TODO confirm against compress_code_file
+        min_lines_for_fine_grained=5, # Min number of lines for fine-grained compression
+        importance_beta=0.5, # Sensitivity to importance score
+        use_knapsack=True,
+    )
+
+    # Show the compressed code
+    logger.info(f"Compressed code (using {result['fine_grained_method_used']}): \n{result['compressed_code']}")
+    logger.info(f"Current function context: \n{question}")  # NOTE(review): label says "function context" but the value logged is the question
+    # Final prompt, as returned under the 'compressed_prompt' key
+    final_prompt = result['compressed_prompt']
+    # Tokenize the prompt and move it to the compressor's device before generating
+    tokenized_prompt = compressor.tokenizer(final_prompt, return_tensors="pt").to(compressor.device)
+    # Increase max_new_tokens for potentially longer completions
+    completion_ids = compressor.model.generate(**tokenized_prompt, max_new_tokens=128, pad_token_id=compressor.tokenizer.eos_token_id)
+    # Decode only the generated part, skipping special tokens
+    completion = compressor.tokenizer.decode(completion_ids[0][len(tokenized_prompt.input_ids[0]):], skip_special_tokens=True)
+
+    # Basic cleanup: strip leading/trailing whitespace
+    completion = completion.strip()
+    # Keep only non-blank lines that do not start with "#" or "//"
+    completion_lines = [line for line in completion.split("\n") if line.strip() and not line.strip().startswith(("#", "//"))] # Simple comment removal
+    cleaned_completion = completion_lines[0] if completion_lines else completion # Take first non-comment line or original if none found
+
+    logger.info(f"Cleaned Completion: {cleaned_completion}")
+
+    # Second run: same inputs with rank_only=False and the conditional_ppl importance method
+    logger.info("\nTesting fine-grained compression with conditional_ppl...")
+    result_cond = compressor.compress_code_file(
+        code=context,
+        query=question,
+        instruction="Complete the following code function given the context.",
+        rate=target_ratio,
+        rank_only=False,
+        fine_grained_importance_method="conditional_ppl",
+        min_lines_for_fine_grained=5,
+        importance_beta=0.5
+    )
+    logger.info(f"Compressed code (using {result_cond['fine_grained_method_used']}): \n{result_cond['compressed_code']}")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,72 @@
+[build-system]
+requires = ["setuptools>=61", "wheel", "setuptools_scm[toml]>=6.2"]  # the [project] table (PEP 621) is only read by setuptools >= 61
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "longcodezip"
+version = "0.1.0"
+description = "A novel two-stage long code compression method for code language models"
+readme = "README.md"
+requires-python = ">=3.9"
+license = {text = "MIT"}
+authors = [
+    {name = "Yuling Shi"},
+]
+keywords = [
+    "code compression",
+    "language models",
+    "llm",
+    "code intelligence",
+    "nlp",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "appdirs",
+    "datasets",
+    "editdistance",
+    "fire",
+    "loguru",
+    "matplotlib",
+    "nltk",
+    "numpy",
+    "openai>=1.0.0",
+    "rich",
+    "torch",
+    "transformers",
+    "tqdm",
+    "tree-sitter-languages",
+    "tempdir",
+    "wget",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "black",
+    "flake8",
+]
+
+[project.urls]
+Homepage = "https://github.com/YerbaPage/LongCodeZip"
+Repository = "https://github.com/YerbaPage/LongCodeZip"
+Documentation = "https://github.com/YerbaPage/LongCodeZip/blob/main/README.md"
+"Bug Tracker" = "https://github.com/YerbaPage/LongCodeZip/issues"
+"Paper" = "https://arxiv.org/abs/2510.00446"
+
+[tool.setuptools]
+packages = ["longcodezip"]
+
+[tool.setuptools.package-data]
+longcodezip = ["py.typed"]
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,7 @@
-anthropic
 appdirs
 datasets
 editdistance
 fire
-google-api-core
-google-generativeai
-llmlingua
 loguru
 matplotlib
 nltk
@@ -13,9 +9,8 @@ numpy
 openai>=1.0.0
 rich
 torch
-transformers==4.37.1
+transformers
 tqdm
 tree-sitter-languages
 tempdir
-vllm
 wget
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,45 @@
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()  # README doubles as the long description
+
+with open("requirements.txt", "r", encoding="utf-8") as fh:  # strip each line first so indented "#" comments are skipped too
+    requirements = [req for req in map(str.strip, fh) if req and not req.startswith("#")]
+
+setup(  # NOTE: pyproject.toml declares the same metadata; keep the two in sync
+    name="longcodezip",
+    version="0.1.0",
+    author="Yuling Shi",
+    author_email="",
+    description="A novel two-stage long code compression method for code language models",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/YerbaPage/LongCodeZip",
+    packages=find_packages(exclude=["experiments", "experiments.*", "repo-qa", "long-code-completion", "module-summarization", "assets"]),  # task code now lives under experiments/
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+    ],
+    python_requires=">=3.9",
+    install_requires=requirements,
+    extras_require={
+        "dev": [
+            "pytest",
+            "black",
+            "flake8",
+        ],
+    },
+    include_package_data=True,
+    package_data={
+        "longcodezip": ["py.typed"],
+    },
+)
+