mirror of
https://github.com/YerbaPage/LongCodeZip.git
synced 2025-10-22 23:19:46 +03:00
packaging
This commit is contained in:
7
MANIFEST.in
Normal file
7
MANIFEST.in
Normal file
@@ -0,0 +1,7 @@
|
||||
include README.md
|
||||
include LICENSE
|
||||
include requirements.txt
|
||||
recursive-include longcodezip *.py
|
||||
recursive-exclude * __pycache__
|
||||
recursive-exclude * *.py[co]
|
||||
|
||||
103
README.md
103
README.md
@@ -21,37 +21,20 @@ LongCodeZip introduces a two-stage code compression framework specifically desig
|
||||
|
||||
The method is plug-and-play and can be integrated with existing code LLMs to achieve significant compression ratios while maintaining or improving task performance.
|
||||
|
||||
## Repository Structure
|
||||
|
||||
This repository contains implementations and experiments for three code-related tasks:
|
||||
|
||||
```
|
||||
LongCodeZip/
|
||||
├── repo-qa/ # Code Retrieval Task
|
||||
│ ├── main.py # Main evaluation script
|
||||
│ ├── run.sh # Experiment runner
|
||||
│ ├── code_compressor.py # Core compression implementation
|
||||
│ ├── compute_score.py # Evaluation metrics
|
||||
│ └── ...
|
||||
├── long-code-completion/ # Code Completion Task
|
||||
│ ├── main.py # Main evaluation script
|
||||
│ ├── run.sh # Experiment runner
|
||||
│ ├── code_compressor.py # Core compression implementation
|
||||
│ ├── utils.py # Utility functions
|
||||
│ └── ...
|
||||
├── module-summarization/ # Code Summarization Task
|
||||
│ ├── main.py # Main evaluation script
|
||||
│ ├── run.sh # Experiment runner
|
||||
│ ├── code_compressor.py # Core compression implementation
|
||||
│ ├── utils.py # Utility functions
|
||||
│ └── ...
|
||||
└── README.md
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
You can install directly from the GitHub repository:
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
pip install git+https://github.com/YerbaPage/LongCodeZip.git
|
||||
```
|
||||
|
||||
Or clone and install in development mode:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/YerbaPage/LongCodeZip.git
|
||||
cd LongCodeZip
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## Quick Demo
|
||||
@@ -62,36 +45,21 @@ We provide a simple demo (`demo.py`) to help you get started with LongCodeZip:
|
||||
python demo.py
|
||||
```
|
||||
|
||||
This demo showcases the core compression functionality by compressing a simple code snippet containing multiple functions (add, quick_sort, search_with_binary_search) based on a query about quick sort. The compressor will:
|
||||
1. Rank functions by relevance to the query
|
||||
2. Apply fine-grained compression to maximize information density
|
||||
3. Generate a compressed prompt suitable for code LLMs
|
||||
|
||||
**Example output:**
|
||||
```python
|
||||
# Original: ~150 tokens
|
||||
# Compressed: ~64 tokens (target)
|
||||
# Selected: quick_sort function (most relevant to query)
|
||||
```
|
||||
|
||||
## Core API Usage
|
||||
|
||||
LongCodeZip provides a simple and powerful API for compressing long code contexts. Here's how to use it:
|
||||
|
||||
### Basic Example
|
||||
## Basic Example
|
||||
|
||||
```python
|
||||
from longcodezip import CodeCompressor
|
||||
from longcodezip import LongCodeZip
|
||||
|
||||
# Initialize the compressor
|
||||
compressor = CodeCompressor(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")
|
||||
compressor = LongCodeZip(model_name="Qwen/Qwen2.5-Coder-7B-Instruct")
|
||||
|
||||
# Compress code with a query
|
||||
result = compressor.compress_code_file(
|
||||
code=your_code_string,
|
||||
query="What does this function do?",
|
||||
instruction="Answer the question based on the code.",
|
||||
code=<your_code_string>,
|
||||
query=<your_query>,
|
||||
instruction=<your_instruction>,
|
||||
rate=0.5, # Keep 50% of tokens
|
||||
rank_only=False, # Set to True to only rank and select contexts without fine-grained compression
|
||||
)
|
||||
|
||||
# Access compressed results
|
||||
@@ -99,41 +67,6 @@ compressed_code = result['compressed_code']
|
||||
compressed_prompt = result['compressed_prompt'] # Full prompt with instruction
|
||||
compression_ratio = result['compression_ratio']
|
||||
```
|
||||
## Usage
|
||||
|
||||
### Quick Start
|
||||
|
||||
Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
|
||||
|
||||
```bash
|
||||
cd <task_directory>
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
### Code Retrieval (RepoQA)
|
||||
|
||||
Navigate to the `repo-qa` directory and run experiments with different compression ratios:
|
||||
|
||||
```bash
|
||||
cd repo-qa
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
The script will evaluate LongCodeZip on the RepoQA dataset with compression ratios, running experiments in parallel on multiple GPUs.
|
||||
|
||||
**Key Parameters:**
|
||||
- `--compression-ratio`: Controls the compression level
|
||||
- `--model`: Specifies the base LLM model
|
||||
- `--backend`: Backend for model inference (vllm)
|
||||
|
||||
### Code Completion
|
||||
|
||||
Navigate to the `long-code-completion` directory:
|
||||
|
||||
```bash
|
||||
cd long-code-completion
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
## References
|
||||
|
||||
|
||||
10
demo.py
10
demo.py
@@ -1,4 +1,4 @@
|
||||
from longcodezip import CodeCompressor
|
||||
from longcodezip import LongCodeZip
|
||||
from loguru import logger
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -16,7 +16,7 @@ if __name__ == "__main__":
|
||||
# Initialize compressor
|
||||
logger.info("Initializing compressor...")
|
||||
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
||||
compressor = CodeCompressor(model_name=model_name)
|
||||
compressor = LongCodeZip(model_name=model_name)
|
||||
|
||||
# Test function-based code file compression with query
|
||||
logger.info("\nTesting function-based code file compression with query...")
|
||||
@@ -24,7 +24,7 @@ if __name__ == "__main__":
|
||||
original_tokens = len(compressor.tokenizer.encode(context))
|
||||
target_token = 64
|
||||
target_ratio = min(1.0, max(0.0, target_token / original_tokens))
|
||||
logger.info(f"CodeCompressor: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
|
||||
logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
|
||||
|
||||
logger.info("\nTesting compression with Coarse-grained compression only...")
|
||||
result_cond = compressor.compress_code_file(
|
||||
@@ -35,6 +35,7 @@ if __name__ == "__main__":
|
||||
rank_only=True # Coarse-grained compression
|
||||
)
|
||||
logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
|
||||
logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.3856
|
||||
|
||||
logger.info("\nTesting compression with Coarse-grained and Fine-grained compression...")
|
||||
result_cond = compressor.compress_code_file(
|
||||
@@ -44,4 +45,5 @@ if __name__ == "__main__":
|
||||
rate=target_ratio,
|
||||
rank_only=False # Coarse-grained and Fine-grained compression
|
||||
)
|
||||
logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
|
||||
logger.info(f"Compressed prompt: \n{result_cond['compressed_prompt']}")
|
||||
logger.info(f"Compression ratio: {result_cond['compression_ratio']:.4f}") # Compression ratio: 0.1468
|
||||
37
experiments/README.md
Normal file
37
experiments/README.md
Normal file
@@ -0,0 +1,37 @@
|
||||
# Experiments
|
||||
|
||||
This folder contains the old experiments for the three code-related tasks. Some of the code may be outdated after the refactoring.
|
||||
|
||||
### Quick Start
|
||||
|
||||
Each task directory contains a `run.sh` script for easy experimentation. Simply navigate to the desired task directory and run:
|
||||
|
||||
```bash
|
||||
cd <task_directory>
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
### Code Retrieval (RepoQA)
|
||||
|
||||
Navigate to the `repo-qa` directory and run experiments with different compression ratios:
|
||||
|
||||
```bash
|
||||
cd repo-qa
|
||||
bash run.sh
|
||||
```
|
||||
|
||||
The script will evaluate LongCodeZip on the RepoQA dataset with different compression ratios, running the experiments in parallel on multiple GPUs.
|
||||
|
||||
**Key Parameters:**
|
||||
- `--compression-ratio`: Controls the compression level
|
||||
- `--model`: Specifies the base LLM model
|
||||
- `--backend`: Backend for model inference (vllm)
|
||||
|
||||
### Code Completion
|
||||
|
||||
Navigate to the `long-code-completion` directory:
|
||||
|
||||
```bash
|
||||
cd long-code-completion
|
||||
bash run.sh
|
||||
```
|
||||
@@ -11,6 +11,10 @@ import copy
|
||||
import bisect
|
||||
import json
|
||||
from loguru import logger
|
||||
import sys
|
||||
# set the level to info only, no need to show the debug messages
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, level="INFO")
|
||||
|
||||
class EntropyChunking:
|
||||
def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct"):
|
||||
@@ -192,7 +196,7 @@ class EntropyChunking:
|
||||
|
||||
return chunks, sentences, ppls, spike_indices
|
||||
|
||||
class CodeCompressor:
|
||||
class LongCodeZip:
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int4",
|
||||
@@ -200,7 +204,7 @@ class CodeCompressor:
|
||||
model_config: dict = {},
|
||||
):
|
||||
"""
|
||||
Initialize the CodeCompressor with a language model for compression.
|
||||
Initialize the LongCodeZip with a language model for compression.
|
||||
|
||||
Args:
|
||||
model_name: The name of the model to load from HuggingFace
|
||||
@@ -1803,4 +1807,93 @@ class CodeCompressor:
|
||||
selected.add(idx)
|
||||
current_weight += weight
|
||||
|
||||
return selected
|
||||
return selected
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
context = """
|
||||
def add(a, b):
|
||||
return a + b
|
||||
|
||||
def quick_sort(arr):
|
||||
if len(arr) <= 1:
|
||||
return arr
|
||||
pivot = arr[len(arr) // 2]
|
||||
left = [x for x in arr if x < pivot]
|
||||
middle = [x for x in arr if x == pivot]
|
||||
right = [x for x in arr if x > pivot]
|
||||
return quick_sort(left) + middle + quick_sort(right)
|
||||
|
||||
def search_with_binary_search(arr, target):
|
||||
left, right = 0, len(arr) - 1
|
||||
while left <= right:
|
||||
mid = (left + right) // 2
|
||||
if arr[mid] == target:
|
||||
return mid
|
||||
elif arr[mid] < target:
|
||||
left = mid + 1
|
||||
else:
|
||||
right = mid - 1
|
||||
return -1
|
||||
"""
|
||||
|
||||
question = "How to write a quick sort algorithm?"
|
||||
|
||||
# Initialize compressor
|
||||
logger.info("Initializing compressor...")
|
||||
model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
||||
compressor = LongCodeZip(model_name=model_name)
|
||||
|
||||
# Test function-based code file compression with query
|
||||
logger.info("\nTesting function-based code file compression with query...")
|
||||
|
||||
original_tokens = len(compressor.tokenizer.encode(context))
|
||||
target_token = 64
|
||||
target_ratio = min(1.0, max(0.0, target_token / original_tokens))
|
||||
logger.info(f"LongCodeZip: Original tokens={original_tokens}, Target tokens={target_token}, Calculated ratio={target_ratio:.4f}")
|
||||
|
||||
result = compressor.compress_code_file(
|
||||
code=context,
|
||||
query=question, # Using current function context as query focus
|
||||
instruction="Complete the following code function given the context.",
|
||||
rate=target_ratio,
|
||||
rank_only=True, # Only use coarse-grained compression
|
||||
fine_grained_importance_method="conditional_ppl", # Explicitly test default
|
||||
min_lines_for_fine_grained=5, # Min number of lines for fine-grained compression
|
||||
importance_beta=0.5, # Sensitivity to importance score
|
||||
use_knapsack=True,
|
||||
)
|
||||
|
||||
# show the compressed code
|
||||
logger.info(f"Compressed code (using {result['fine_grained_method_used']}): \n{result['compressed_code']}")
|
||||
logger.info(f"Current function context: \n{question}")
|
||||
# final prompt
|
||||
final_prompt = result['compressed_prompt']
|
||||
# get the completion
|
||||
tokenized_prompt = compressor.tokenizer(final_prompt, return_tensors="pt").to(compressor.device)
|
||||
# Increase max_new_tokens for potentially longer completions
|
||||
completion_ids = compressor.model.generate(**tokenized_prompt, max_new_tokens=128, pad_token_id=compressor.tokenizer.eos_token_id)
|
||||
# Decode only the generated part, skipping special tokens
|
||||
completion = compressor.tokenizer.decode(completion_ids[0][len(tokenized_prompt.input_ids[0]):], skip_special_tokens=True)
|
||||
|
||||
# Basic cleanup: remove leading/trailing whitespace and potentially stop words if needed
|
||||
completion = completion.strip()
|
||||
# More robust cleanup: Find the first meaningful line if generation includes noise
|
||||
completion_lines = [line for line in completion.split("\n") if line.strip() and not line.strip().startswith(("#", "//"))] # Simple comment removal
|
||||
cleaned_completion = completion_lines[0] if completion_lines else completion # Take first non-comment line or original if none found
|
||||
|
||||
logger.info(f"Cleaned Completion: {cleaned_completion}")
|
||||
|
||||
# Optional: Test with conditional_ppl method
|
||||
logger.info("\nTesting fine-grained compression with conditional_ppl...")
|
||||
result_cond = compressor.compress_code_file(
|
||||
code=context,
|
||||
query=question,
|
||||
instruction="Complete the following code function given the context.",
|
||||
rate=target_ratio,
|
||||
rank_only=False,
|
||||
fine_grained_importance_method="conditional_ppl",
|
||||
min_lines_for_fine_grained=5,
|
||||
importance_beta=0.5
|
||||
)
|
||||
logger.info(f"Compressed code (using {result_cond['fine_grained_method_used']}): \n{result_cond['compressed_code']}")
|
||||
72
pyproject.toml
Normal file
72
pyproject.toml
Normal file
@@ -0,0 +1,72 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "longcodezip"
|
||||
version = "0.1.0"
|
||||
description = "A novel two-stage long code compression method for code language models"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.9"
|
||||
license = {text = "MIT"}
|
||||
authors = [
|
||||
{name = "Yuling Shi"},
|
||||
]
|
||||
keywords = [
|
||||
"code compression",
|
||||
"language models",
|
||||
"llm",
|
||||
"code intelligence",
|
||||
"nlp",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
]
|
||||
dependencies = [
|
||||
"appdirs",
|
||||
"datasets",
|
||||
"editdistance",
|
||||
"fire",
|
||||
"loguru",
|
||||
"matplotlib",
|
||||
"nltk",
|
||||
"numpy",
|
||||
"openai>=1.0.0",
|
||||
"rich",
|
||||
"torch",
|
||||
"transformers",
|
||||
"tqdm",
|
||||
"tree-sitter-languages",
|
||||
"tempdir",
|
||||
"wget",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest",
|
||||
"black",
|
||||
"flake8",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/YerbaPage/LongCodeZip"
|
||||
Repository = "https://github.com/YerbaPage/LongCodeZip"
|
||||
Documentation = "https://github.com/YerbaPage/LongCodeZip/blob/main/README.md"
|
||||
"Bug Tracker" = "https://github.com/YerbaPage/LongCodeZip/issues"
|
||||
"Paper" = "https://arxiv.org/abs/2510.00446"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["longcodezip"]
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
longcodezip = ["py.typed"]
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
anthropic
|
||||
appdirs
|
||||
datasets
|
||||
editdistance
|
||||
fire
|
||||
google-api-core
|
||||
google-generativeai
|
||||
llmlingua
|
||||
loguru
|
||||
matplotlib
|
||||
nltk
|
||||
@@ -13,9 +9,8 @@ numpy
|
||||
openai>=1.0.0
|
||||
rich
|
||||
torch
|
||||
transformers==4.37.1
|
||||
transformers
|
||||
tqdm
|
||||
tree-sitter-languages
|
||||
tempdir
|
||||
vllm
|
||||
wget
|
||||
|
||||
45
setup.py
Normal file
45
setup.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
with open("README.md", "r", encoding="utf-8") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
with open("requirements.txt", "r", encoding="utf-8") as fh:
|
||||
requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]
|
||||
|
||||
setup(
|
||||
name="longcodezip",
|
||||
version="0.1.0",
|
||||
author="Yuling Shi",
|
||||
author_email="",
|
||||
description="A novel two-stage long code compression method for code language models",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/YerbaPage/LongCodeZip",
|
||||
packages=find_packages(exclude=["repo-qa", "long-code-completion", "module-summarization", "assets"]),
|
||||
classifiers=[
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Developers",
|
||||
"Intended Audience :: Science/Research",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Topic :: Software Development :: Libraries :: Python Modules",
|
||||
],
|
||||
python_requires=">=3.9",
|
||||
install_requires=requirements,
|
||||
extras_require={
|
||||
"dev": [
|
||||
"pytest",
|
||||
"black",
|
||||
"flake8",
|
||||
],
|
||||
},
|
||||
include_package_data=True,
|
||||
package_data={
|
||||
"longcodezip": ["py.typed"],
|
||||
},
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user