packaging

This commit is contained in:
YerbaPage
2025-10-11 21:33:12 +08:00
parent 60201d365f
commit a391badfe1
37 changed files with 282 additions and 98 deletions

7
MANIFEST.in Normal file
View File

@@ -0,0 +1,7 @@
include README.md
include LICENSE
include requirements.txt
recursive-include longcodezip *.py
recursive-exclude * __pycache__
recursive-exclude * *.py[co]

103
README.md
View File

@@ -21,37 +21,20 @@ LongCodeZip introduces a two-stage code compression framework specifically desig
The method is plug-and-play and can be integrated with existing code LLMs to achieve significant compression ratios while maintaining or improving task performance.
## Repository Structure
This repository contains implementations and experiments for three code-related tasks:
```
LongCodeZip/
├── repo-qa/ # Code Retrieval Task
│ ├── main.py # Main evaluation script
│ ├── run.sh # Experiment runner
│ ├── code_compressor.py # Core compression implementation
│ ├── compute_score.py # Evaluation metrics
│ └── ...
├── long-code-completion/ # Code Completion Task
│ ├── main.py # Main evaluation script
│ ├── run.sh # Experiment runner
│ ├── code_compressor.py # Core compression implementation
│ ├── utils.py # Utility functions
│ └── ...
├── module-summarization/ # Code Summarization Task
│ ├── main.py # Main evaluation script
│ ├── run.sh # Experiment runner
│ ├── code_compressor.py # Core compression implementation
│ ├── utils.py # Utility functions
│ └── ...
└── README.md
```
## Installation
You can install directly from the GitHub repository:
```bash
pip install -r requirements.txt
pip install git+https://github.com/YerbaPage/LongCodeZip.git
```
Or clone and install in development mode:
```bash
git clone https://github.com/YerbaPage/LongCodeZip.git
cd LongCodeZip
pip install -e .
```
## Quick Demo
@@ -62,36 +45,21 @@ We provide a simple demo (`demo.py`) to help you get started with LongCodeZip:
python demo.py
```
This demo showcases the core compression functionality by compressing a simple code snippet containing multiple functions (add, quick_sort, search_with_binary_search) based on a query about quick sort. The compressor will:
1. Rank functions by relevance to the query
2. Apply fine-grained compression to maximize information density
3. Generate a compressed prompt suitable for code LLMs
**Example output:**
```python
# Original: ~150 tokens
# Compressed: ~64 tokens (target)
# Selected: quick_sort function (most relevant to query)
```
## Core API Usage
LongCodeZip provides a simple and powerful API for compressing long code contexts. Here's how to use it:
### Basic Example
## Basic Example
```python
from longcodezip import CodeCompressor
from longcodezip import LongCodeZip
# Initialize the compressor
compressor = CodeCompressor(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")
compressor = LongCodeZip(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")
# Compress code with a query
result = compressor.compress_code_file(
code=your_code_string,
query="What does this function do?",
instruction="Answer the question based on the code.",
code=<your_code_string>,
query=<your_query>,
instruction=<your_instruction>,
rate=0.5, # Keep 50% of tokens
rank_only=False, # Set to True to only rank and select contexts without fine-grained compression
)
# Access compressed results
@@ -99,41 +67,6 @@ compressed_code = result['compressed_code']
compressed_prompt = result['compressed_prompt'] # Full prompt with instruction
compression_ratio = result['compression_ratio']
```
## Usage
### Quick Start
Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
```bash
cd <task_directory>
bash run.sh
```
### Code Retrieval (RepoQA)
Navigate to the `repo-qa` directory and run experiments with different compression ratios:
```bash
cd repo-qa
bash run.sh
```
The script will evaluate LongCodeZip on the RepoQA dataset with compression ratios, running experiments in parallel on multiple GPUs.
**Key Parameters:**
- `--compression-ratio`: Controls the compression level
- `--model`: Specifies the base LLM model
- `--backend`: Backend for model inference (vllm)
### Code Completion
Navigate to the `long-code-completion` directory:
```bash
cd long-code-completion
bash run.sh
```
## References

View File

@@ -1,4 +1,4 @@
from longcodezip import CodeCompressor
from longcodezip import LongCodeZip
from loguru import logger
if __name__ == "__main__":
@@ -16,7 +16,7 @@ if __name__ == "__main__":
# Initialize compressor
logger.info("Initializing compressor...")
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
compressor = CodeCompressor(model_name=model_name)
compressor = LongCodeZip(model_name=model_name)
# Test function-based code file compression with query
logger.info("\nTesting function-based code file compression with query...")
@@ -24,7 +24,7 @@ if __name__ == "__main__":
original_tokens = len(compressor.tokenizer.encode(context))
target_token = 64
target_ratio = min(1.0, max(0.0, target_token / original_tokens))
logger.info(f"CodeCompressor: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
logger.info("\nTesting compression with Coarse-grained compression only...")
result_cond = compressor.compress_code_file(
@@ -35,6 +35,7 @@ if __name__ == "__main__":
rank_only=True # Coarse-grained compression
)
logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.3856
logger.info("\nTesting compression with Coarse-grained and Fine-grained compression...")
result_cond = compressor.compress_code_file(
@@ -45,3 +46,4 @@ if __name__ == "__main__":
rank_only=False # Coarse-grained and Fine-grained compression
)
logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.1468

37
experiments/README.md Normal file
View File

@@ -0,0 +1,37 @@
# Experiments
This folder contains the earlier experiments for the three code-related tasks. Some of the code may be outdated after the refactoring.
### Quick Start
Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
```bash
cd <task_directory>
bash run.sh
```
### Code Retrieval (RepoQA)
Navigate to the `repo-qa` directory and run experiments with different compression ratios:
```bash
cd repo-qa
bash run.sh
```
The script will evaluate LongCodeZip on the RepoQA dataset at different compression ratios, running experiments in parallel on multiple GPUs.
**Key Parameters:**
- `--compression-ratio`: Controls the compression level
- `--model`: Specifies the base LLM model
- `--backend`: Backend for model inference (vllm)
### Code Completion
Navigate to the `long-code-completion` directory:
```bash
cd long-code-completion
bash run.sh
```

View File

@@ -11,6 +11,10 @@ import copy
import bisect
import json
from loguru import logger
import sys
# set the level to info only, no need to show the debug messages
logger.remove()
logger.add(sys.stderr, level="INFO")
class EntropyChunking:
def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct"):
@@ -192,7 +196,7 @@ class EntropyChunking:
return chunks, sentences, ppls, spike_indices
class CodeCompressor:
class LongCodeZip:
def __init__(
self,
model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4",
@@ -200,7 +204,7 @@ class CodeCompressor:
model_config: dict = {},
):
"""
Initialize the CodeCompressor with a language model for compression.
Initialize the LongCodeZip with a language model for compression.
Args:
model_name: The name of the model to load from HuggingFace
@@ -1804,3 +1808,92 @@ class CodeCompressor:
current_weight += weight
return selected
if __name__ == "__main__":
    # Manual demo: compress a small three-function sample with respect to a
    # question, generate a completion from the compressed prompt, then repeat
    # the compression with fine-grained compression enabled.
    # NOTE(review): the indentation inside the sample string below is not
    # visible in this view — confirm the literal against the actual file.
    context = """
def add(a, b):
return a + b
def quick_sort(arr):
if len(arr) <= 1:
return arr
pivot = arr[len(arr) // 2]
left = [x for x in arr if x < pivot]
middle = [x for x in arr if x == pivot]
right = [x for x in arr if x > pivot]
return quick_sort(left) + middle + quick_sort(right)
def search_with_binary_search(arr, target):
left, right = 0, len(arr) - 1
while left <= right:
mid = (left + right) // 2
if arr[mid] == target:
return mid
elif arr[mid] < target:
left = mid + 1
else:
right = mid - 1
return -1
"""
    question = "How to write a quick sort algorithm?"
    # Initialize the compressor with the model named below
    logger.info("Initializing compressor...")
    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
    compressor = LongCodeZip(model_name=model_name)
    # Test function-based code file compression with query
    logger.info("\nTesting function-based code file compression with query...")
    # Turn the fixed 64-token budget into a keep-rate, clamped to [0.0, 1.0]
    original_tokens = len(compressor.tokenizer.encode(context))
    target_token = 64
    target_ratio = min(1.0, max(0.0, target_token / original_tokens))
    logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
    result = compressor.compress_code_file(
        code=context,
        query=question,  # The natural-language question defined above is the query
        instruction="Complete the following code function given the context.",
        rate=target_ratio,
        rank_only=True,  # Only use coarse-grained compression
        fine_grained_importance_method="conditional_ppl",  # Passed explicitly; NOTE(review): said to be the default — confirm
        min_lines_for_fine_grained=5,  # Min number of lines for fine-grained compression
        importance_beta=0.5,  # Sensitivity to importance score
        use_knapsack=True,
    )
    # show the compressed code
    logger.info(f"Compressed code (using {result['fine_grained_method_used']}): \n{result['compressed_code']}")
    # The value logged here is the question string, despite the message label
    logger.info(f"Current function context: \n{question}")
    # final prompt
    final_prompt = result['compressed_prompt']
    # Tokenize the prompt and move the tensors to the compressor's device
    tokenized_prompt = compressor.tokenizer(final_prompt, return_tensors="pt").to(compressor.device)
    # Generate at most 128 new tokens; the EOS token id doubles as the pad token id
    completion_ids = compressor.model.generate(**tokenized_prompt, max_new_tokens=128, pad_token_id=compressor.tokenizer.eos_token_id)
    # Decode only the generated part (slice off the prompt tokens), skipping special tokens
    completion = compressor.tokenizer.decode(completion_ids[0][len(tokenized_prompt.input_ids[0]):], skip_special_tokens=True)
    # Basic cleanup: remove leading/trailing whitespace
    completion = completion.strip()
    # Keep non-empty lines that do not start with "#" or "//"
    completion_lines = [line for line in completion.split("\n") if line.strip() and not line.strip().startswith(("#", "//"))]  # Simple comment removal
    cleaned_completion = completion_lines[0] if completion_lines else completion  # Take first non-comment line or original if none found
    logger.info(f"Cleaned Completion: {cleaned_completion}")
    # Second run: same inputs, but with rank_only=False so fine-grained
    # compression is applied as well (use_knapsack is left at its default here)
    logger.info("\nTesting fine-grained compression with conditional_ppl...")
    result_cond = compressor.compress_code_file(
        code=context,
        query=question,
        instruction="Complete the following code function given the context.",
        rate=target_ratio,
        rank_only=False,
        fine_grained_importance_method="conditional_ppl",
        min_lines_for_fine_grained=5,
        importance_beta=0.5
    )
    logger.info(f"Compressed code (using {result_cond['fine_grained_method_used']}): \n{result_cond['compressed_code']}")

72
pyproject.toml Normal file
View File

@@ -0,0 +1,72 @@
[build-system]
# The [project] table in this file (PEP 621 metadata) is only read by
# setuptools >= 61.0, so the lower bound must not be older than that.
# NOTE(review): setuptools_scm is requested although `version` is declared
# statically in [project] — confirm it is still needed.
requires = ["setuptools>=61.0", "wheel", "setuptools_scm[toml]>=6.2"]
build-backend = "setuptools.build_meta"
[project]
name = "longcodezip"
version = "0.1.0"
description = "A novel two-stage long code compression method for code language models"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
authors = [
{name = "Yuling Shi"},
]
keywords = [
"code compression",
"language models",
"llm",
"code intelligence",
"nlp",
]
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries :: Python Modules",
]
dependencies = [
"appdirs",
"datasets",
"editdistance",
"fire",
"loguru",
"matplotlib",
"nltk",
"numpy",
"openai>=1.0.0",
"rich",
"torch",
"transformers",
"tqdm",
"tree-sitter-languages",
"tempdir",
"wget",
]
[project.optional-dependencies]
dev = [
"pytest",
"black",
"flake8",
]
[project.urls]
Homepage = "https://github.com/YerbaPage/LongCodeZip"
Repository = "https://github.com/YerbaPage/LongCodeZip"
Documentation = "https://github.com/YerbaPage/LongCodeZip/blob/main/README.md"
"Bug Tracker" = "https://github.com/YerbaPage/LongCodeZip/issues"
"Paper" = "https://arxiv.org/abs/2510.00446"
[tool.setuptools]
packages = ["longcodezip"]
[tool.setuptools.package-data]
longcodezip = ["py.typed"]

View File

@@ -1,11 +1,7 @@
anthropic
appdirs
datasets
editdistance
fire
google-api-core
google-generativeai
llmlingua
loguru
matplotlib
nltk
@@ -13,9 +9,8 @@ numpy
openai>=1.0.0
rich
torch
transformers==4.37.1
transformers
tqdm
tree-sitter-languages
tempdir
vllm
wget

45
setup.py Normal file
View File

@@ -0,0 +1,45 @@
"""Setuptools build script for the ``longcodezip`` package."""
from setuptools import setup, find_packages


def _read_requirements(path="requirements.txt"):
    """Return the requirement specifiers listed in *path*.

    Blank lines and comment lines are skipped. The comment check runs on the
    stripped text, so a comment preceded by whitespace is dropped too instead
    of being passed to ``install_requires``.
    """
    with open(path, "r", encoding="utf-8") as fh:
        candidates = (raw.strip() for raw in fh)
        return [req for req in candidates if req and not req.startswith("#")]


# The README doubles as the long description of the distribution.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

requirements = _read_requirements()

# NOTE(review): pyproject.toml declares the same metadata in its [project]
# table; keep both in sync or drop one of them.
setup(
    name="longcodezip",
    version="0.1.0",
    author="Yuling Shi",
    author_email="",
    description="A novel two-stage long code compression method for code language models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/YerbaPage/LongCodeZip",
    packages=find_packages(exclude=["repo-qa", "long-code-completion", "module-summarization", "assets"]),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    python_requires=">=3.9",
    install_requires=requirements,
    extras_require={
        "dev": [
            "pytest",
            "black",
            "flake8",
        ],
    },
    include_package_data=True,
    package_data={
        "longcodezip": ["py.typed"],
    },
)