[Refactor]: Change search method to BM25 & Dense vector Hybrid search (#119)

* [Refactor]: Change search method to BM25 & Dense vector Hybrid search

* [Restructure] 1.Refactor codebase to use Context class 2.Add hybrid mode environment variable

Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com>

---------

Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com>
This commit is contained in:
Shawn Zheng
2025-08-05 16:58:20 +08:00
committed by GitHub
parent 51822f5470
commit 419d40e3aa
11 changed files with 845 additions and 134 deletions

View File

@@ -11,7 +11,10 @@ import {
import {
VectorDatabase,
VectorDocument,
VectorSearchResult
VectorSearchResult,
HybridSearchRequest,
HybridSearchOptions,
HybridSearchResult
} from './vectordb';
import { SemanticSearchResult } from './types';
import { envManager } from './utils/env-manager';
@@ -152,17 +155,30 @@ export class Context {
}
/**
* Generate collection name based on codebase path
* Get isHybrid setting from environment variable with default true
*/
private getCollectionName(codebasePath: string): string {
const normalizedPath = path.resolve(codebasePath);
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
return `code_chunks_${hash.substring(0, 8)}`;
private getIsHybrid(): boolean {
    // HYBRID_MODE is looked up through envManager on each call.
    const rawFlag = envManager.get('HYBRID_MODE');
    if (rawFlag !== undefined && rawFlag !== null) {
        // Only a case-insensitive "true" keeps hybrid mode on; any other value turns it off.
        return rawFlag.toLowerCase() === 'true';
    }
    // Variable not set: hybrid mode is the default.
    return true;
}
/**
* Index entire codebase
* @param codebasePath Codebase path
* Generate collection name based on codebase path and hybrid mode
*/
private getCollectionName(codebasePath: string): string {
    // Hybrid and dense-only indexes use different prefixes, so each mode gets its own collection.
    const collectionPrefix = this.getIsHybrid() ? 'hybrid_code_chunks' : 'code_chunks';

    // The name suffix is the first 8 hex characters of the MD5 of the resolved absolute path.
    const absolutePath = path.resolve(codebasePath);
    const pathDigest = crypto
        .createHash('md5')
        .update(absolutePath)
        .digest('hex');

    return `${collectionPrefix}_${pathDigest.substring(0, 8)}`;
}
/**
* Index a codebase for semantic search
* @param codebasePath Codebase root path
* @param progressCallback Optional progress callback function
* @returns Indexing statistics
*/
@@ -170,7 +186,9 @@ export class Context {
codebasePath: string,
progressCallback?: (progress: { phase: string; current: number; total: number; percentage: number }) => void
): Promise<{ indexedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> {
console.log(`🚀 Starting to index codebase: ${codebasePath}`);
const isHybrid = this.getIsHybrid();
const searchType = isHybrid === true ? 'hybrid search' : 'semantic search';
console.log(`🚀 Starting to index codebase with ${searchType}: ${codebasePath}`);
// 1. Load ignore patterns from various ignore files
await this.loadGitignorePatterns(codebasePath);
@@ -239,7 +257,7 @@ export class Context {
if (!synchronizer) {
// Load project-specific ignore patterns before creating FileSynchronizer
await this.loadGitignorePatterns(codebasePath);
// To be safe, let's initialize if it's not there.
const newSynchronizer = new FileSynchronizer(codebasePath, this.ignorePatterns);
await newSynchronizer.initialize();
@@ -317,37 +335,118 @@ export class Context {
}
/**
* Semantic search
* Semantic search with unified implementation
* @param codebasePath Codebase path to search in
* @param query Search query
* @param topK Number of results to return
* @param threshold Similarity threshold
*/
async semanticSearch(codebasePath: string, query: string, topK: number = 5, threshold: number = 0.5): Promise<SemanticSearchResult[]> {
console.log(`🔍 Executing semantic search: "${query}" in ${codebasePath}`);
const isHybrid = this.getIsHybrid();
const searchType = isHybrid === true ? 'hybrid search' : 'semantic search';
console.log(`🔍 Executing ${searchType}: "${query}" in ${codebasePath}`);
// 1. Generate query vector
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
const collectionName = this.getCollectionName(codebasePath);
console.log(`🔍 Using collection: ${collectionName}`);
// 2. Search in vector database
const searchResults: VectorSearchResult[] = await this.vectorDatabase.search(
this.getCollectionName(codebasePath),
queryEmbedding.vector,
{ topK, threshold }
);
// Check if collection exists and has data
const hasCollection = await this.vectorDatabase.hasCollection(collectionName);
if (!hasCollection) {
console.log(`⚠️ Collection '${collectionName}' does not exist. Please index the codebase first.`);
return [];
}
// 3. Convert to semantic search result format
const results: SemanticSearchResult[] = searchResults.map(result => ({
content: result.document.content,
relativePath: result.document.relativePath,
startLine: result.document.startLine,
endLine: result.document.endLine,
language: result.document.metadata.language || 'unknown',
score: result.score
}));
if (isHybrid === true) {
try {
// Check collection stats to see if it has data
const stats = await this.vectorDatabase.query(collectionName, '', ['id'], 1);
console.log(`🔍 Collection '${collectionName}' exists and appears to have data`);
} catch (error) {
console.log(`⚠️ Collection '${collectionName}' exists but may be empty or not properly indexed:`, error);
}
console.log(`✅ Found ${results.length} relevant results`);
return results;
// 1. Generate query vector
console.log(`🔍 Generating embeddings for query: "${query}"`);
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
console.log(`✅ Generated embedding vector with dimension: ${queryEmbedding.vector.length}`);
console.log(`🔍 First 5 embedding values: [${queryEmbedding.vector.slice(0, 5).join(', ')}]`);
// 2. Prepare hybrid search requests
const searchRequests: HybridSearchRequest[] = [
{
data: queryEmbedding.vector,
anns_field: "vector",
param: { "nprobe": 10 },
limit: topK
},
{
data: query,
anns_field: "sparse_vector",
param: { "drop_ratio_search": 0.2 },
limit: topK
}
];
console.log(`🔍 Search request 1 (dense): anns_field="${searchRequests[0].anns_field}", vector_dim=${queryEmbedding.vector.length}, limit=${searchRequests[0].limit}`);
console.log(`🔍 Search request 2 (sparse): anns_field="${searchRequests[1].anns_field}", query_text="${query}", limit=${searchRequests[1].limit}`);
// 3. Execute hybrid search
console.log(`🔍 Executing hybrid search with RRF reranking...`);
const searchResults: HybridSearchResult[] = await this.vectorDatabase.hybridSearch(
collectionName,
searchRequests,
{
rerank: {
strategy: 'rrf',
params: { k: 100 }
},
limit: topK
}
);
console.log(`🔍 Raw search results count: ${searchResults.length}`);
// 4. Convert to semantic search result format
const results: SemanticSearchResult[] = searchResults.map(result => ({
content: result.document.content,
relativePath: result.document.relativePath,
startLine: result.document.startLine,
endLine: result.document.endLine,
language: result.document.metadata.language || 'unknown',
score: result.score
}));
console.log(`✅ Found ${results.length} relevant hybrid results`);
if (results.length > 0) {
console.log(`🔍 Top result score: ${results[0].score}, path: ${results[0].relativePath}`);
}
return results;
} else {
// Regular semantic search
// 1. Generate query vector
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
// 2. Search in vector database
const searchResults: VectorSearchResult[] = await this.vectorDatabase.search(
collectionName,
queryEmbedding.vector,
{ topK, threshold }
);
// 3. Convert to semantic search result format
const results: SemanticSearchResult[] = searchResults.map(result => ({
content: result.document.content,
relativePath: result.document.relativePath,
startLine: result.document.startLine,
endLine: result.document.endLine,
language: result.document.metadata.language || 'unknown',
score: result.score
}));
console.log(`✅ Found ${results.length} relevant results`);
return results;
}
}
/**
@@ -458,10 +557,18 @@ export class Context {
* Prepare vector collection
*/
private async prepareCollection(codebasePath: string): Promise<void> {
// Create new collection
console.log(`🔧 Preparing vector collection for codebase: ${codebasePath}`);
const isHybrid = this.getIsHybrid();
const collectionType = isHybrid === true ? 'hybrid vector' : 'vector';
console.log(`🔧 Preparing ${collectionType} collection for codebase: ${codebasePath}`);
const collectionName = this.getCollectionName(codebasePath);
// Check if collection already exists
const collectionExists = await this.vectorDatabase.hasCollection(collectionName);
if (collectionExists) {
console.log(`📋 Collection ${collectionName} already exists, skipping creation`);
return;
}
// For Ollama embeddings, ensure dimension is detected before creating collection
if (this.embedding.getProvider() === 'Ollama' && typeof (this.embedding as any).initializeDimension === 'function') {
await (this.embedding as any).initializeDimension();
@@ -469,7 +576,13 @@ export class Context {
const dimension = this.embedding.getDimension();
const dirName = path.basename(codebasePath);
await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`);
if (isHybrid === true) {
await this.vectorDatabase.createHybridCollection(collectionName, dimension, `Hybrid Index for ${dirName}`);
} else {
await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`);
}
console.log(`✅ Collection ${collectionName} created successfully (dimension: ${dimension})`);
}
@@ -517,6 +630,7 @@ export class Context {
codebasePath: string,
onFileProcessed?: (filePath: string, fileIndex: number, totalFiles: number) => void
): Promise<{ processedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> {
const isHybrid = this.getIsHybrid();
const EMBEDDING_BATCH_SIZE = Math.max(1, parseInt(envManager.get('EMBEDDING_BATCH_SIZE') || '100', 10));
const CHUNK_LIMIT = 450000;
console.log(`🔧 Using EMBEDDING_BATCH_SIZE: ${EMBEDDING_BATCH_SIZE}`);
@@ -551,8 +665,8 @@ export class Context {
try {
await this.processChunkBuffer(chunkBuffer);
} catch (error) {
// TODO:
console.error(`❌ Failed to process chunk batch: ${error}`);
const searchType = isHybrid === true ? 'hybrid' : 'regular';
console.error(`❌ Failed to process chunk batch for ${searchType}: ${error}`);
} finally {
chunkBuffer = []; // Always clear buffer, even on failure
}
@@ -580,11 +694,12 @@ export class Context {
// Process any remaining chunks in the buffer
if (chunkBuffer.length > 0) {
console.log(`📝 Processing final batch of ${chunkBuffer.length} chunks`);
const searchType = isHybrid === true ? 'hybrid' : 'regular';
console.log(`📝 Processing final batch of ${chunkBuffer.length} chunks for ${searchType}`);
try {
await this.processChunkBuffer(chunkBuffer);
} catch (error) {
console.error(`❌ Failed to process final chunk batch: ${error}`);
console.error(`❌ Failed to process final chunk batch for ${searchType}: ${error}`);
}
}
@@ -608,7 +723,9 @@ export class Context {
// Estimate tokens (rough estimation: 1 token ≈ 4 characters)
const estimatedTokens = chunks.reduce((sum, chunk) => sum + Math.ceil(chunk.content.length / 4), 0);
console.log(`🔄 Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens)`);
const isHybrid = this.getIsHybrid();
const searchType = isHybrid === true ? 'hybrid' : 'regular';
console.log(`🔄 Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens) for ${searchType}`);
await this.processChunkBatch(chunks, codebasePath);
}
@@ -616,45 +733,75 @@ export class Context {
* Process a batch of chunks
*/
private async processChunkBatch(chunks: CodeChunk[], codebasePath: string): Promise<void> {
const isHybrid = this.getIsHybrid();
// Generate embedding vectors
const chunkContents = chunks.map(chunk => chunk.content);
const embeddings: EmbeddingVector[] = await this.embedding.embedBatch(chunkContents);
const embeddings = await this.embedding.embedBatch(chunkContents);
// Prepare vector documents
const documents: VectorDocument[] = chunks.map((chunk, index) => {
if (!chunk.metadata.filePath) {
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
}
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
const fileExtension = path.extname(chunk.metadata.filePath);
// Extract metadata that should be stored separately
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
return {
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
vector: embeddings[index].vector,
content: chunk.content,
relativePath,
startLine: chunk.metadata.startLine || 0,
endLine: chunk.metadata.endLine || 0,
fileExtension,
metadata: {
...restMetadata,
codebasePath,
language: chunk.metadata.language || 'unknown',
chunkIndex: index
if (isHybrid === true) {
// Create hybrid vector documents
const documents: VectorDocument[] = chunks.map((chunk, index) => {
if (!chunk.metadata.filePath) {
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
}
};
});
// Store to vector database
await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents);
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
const fileExtension = path.extname(chunk.metadata.filePath);
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
return {
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
content: chunk.content, // Full text content for BM25 and storage
vector: embeddings[index].vector, // Dense vector
relativePath,
startLine: chunk.metadata.startLine || 0,
endLine: chunk.metadata.endLine || 0,
fileExtension,
metadata: {
...restMetadata,
codebasePath,
language: chunk.metadata.language || 'unknown',
chunkIndex: index
}
};
});
// Store to vector database
await this.vectorDatabase.insertHybrid(this.getCollectionName(codebasePath), documents);
} else {
// Create regular vector documents
const documents: VectorDocument[] = chunks.map((chunk, index) => {
if (!chunk.metadata.filePath) {
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
}
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
const fileExtension = path.extname(chunk.metadata.filePath);
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
return {
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
vector: embeddings[index].vector,
content: chunk.content,
relativePath,
startLine: chunk.metadata.startLine || 0,
endLine: chunk.metadata.endLine || 0,
fileExtension,
metadata: {
...restMetadata,
codebasePath,
language: chunk.metadata.language || 'unknown',
chunkIndex: index
}
};
});
// Store to vector database
await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents);
}
}
/**
* Get programming language based on file extension
*/

View File

@@ -4,6 +4,10 @@ export {
SearchOptions,
VectorSearchResult,
VectorDatabase,
HybridSearchRequest,
HybridSearchOptions,
HybridSearchResult,
RerankStrategy,
COLLECTION_LIMIT_MESSAGE
} from './types';

View File

@@ -3,6 +3,9 @@ import {
SearchOptions,
VectorSearchResult,
VectorDatabase,
HybridSearchRequest,
HybridSearchOptions,
HybridSearchResult,
COLLECTION_LIMIT_MESSAGE
} from './types';
import { ClusterManager } from './zilliz-utils';
@@ -467,4 +470,277 @@ export class MilvusRestfulVectorDatabase implements VectorDatabase {
throw error;
}
}
/**
 * Create a collection that supports hybrid (dense vector + BM25 full-text) search
 * through the REST API.
 *
 * The schema declares a BM25 function with `content` as its input field and
 * `sparse_vector` as its output field. After the collection is created, indexes are
 * built for both vector fields and the collection is loaded.
 *
 * @param collectionName Name of the collection to create.
 * @param dimension Dimension of the dense `vector` field.
 * @param description Accepted for interface compatibility; it is not part of the
 *                    request payload built here.
 * @throws Rethrows any error raised during creation, indexing or loading after logging it.
 */
async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void> {
    // Same guard that insertHybrid and hybridSearch run before issuing any request.
    await this.ensureInitialized();
    try {
        const restfulConfig = this.config as MilvusRestfulConfig;
        const collectionSchema = {
            collectionName,
            dbName: restfulConfig.database,
            schema: {
                enableDynamicField: false,
                functions: [
                    {
                        name: "content_bm25_emb",
                        description: "content bm25 function",
                        type: "BM25",
                        inputFieldNames: ["content"],
                        outputFieldNames: ["sparse_vector"],
                        params: {},
                    },
                ],
                fields: [
                    {
                        fieldName: "id",
                        dataType: "VarChar",
                        isPrimary: true,
                        elementTypeParams: {
                            max_length: 512
                        }
                    },
                    {
                        // Analyzer is enabled on the field that feeds the BM25 function.
                        fieldName: "content",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 65535,
                            enable_analyzer: true
                        }
                    },
                    {
                        fieldName: "vector",
                        dataType: "FloatVector",
                        elementTypeParams: {
                            dim: dimension
                        }
                    },
                    {
                        // Output field of the BM25 function declared above.
                        fieldName: "sparse_vector",
                        dataType: "SparseFloatVector"
                    },
                    {
                        fieldName: "relativePath",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 1024
                        }
                    },
                    {
                        fieldName: "startLine",
                        dataType: "Int64"
                    },
                    {
                        fieldName: "endLine",
                        dataType: "Int64"
                    },
                    {
                        fieldName: "fileExtension",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 32
                        }
                    },
                    {
                        // Stored as a string; insertHybrid writes JSON.stringify(doc.metadata) here.
                        fieldName: "metadata",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 65535
                        }
                    }
                ]
            }
        };
        // Step 1: Create collection with schema and functions
        await createCollectionWithLimitCheck(this.makeRequest.bind(this), collectionSchema);
        // Step 2: Create indexes for both vector fields
        await this.createHybridIndexes(collectionName);
        // Step 3: Load collection to memory for searching
        await this.loadCollection(collectionName);
    } catch (error) {
        console.error(`❌ Failed to create hybrid collection '${collectionName}':`, error);
        throw error;
    }
}
/**
 * Create the two indexes a hybrid collection needs: one on the dense `vector`
 * field and one on the BM25 `sparse_vector` field.
 *
 * @param collectionName Collection whose vector fields are indexed.
 * @throws Rethrows any request failure after logging it.
 */
private async createHybridIndexes(collectionName: string): Promise<void> {
    try {
        const { database } = this.config as MilvusRestfulConfig;

        // Dense index first, then the sparse one; each is sent as its own request.
        const indexDefinitions = [
            {
                fieldName: "vector",
                indexName: "vector_index",
                metricType: "COSINE",
                index_type: "AUTOINDEX"
            },
            {
                fieldName: "sparse_vector",
                indexName: "sparse_vector_index",
                metricType: "BM25",
                index_type: "SPARSE_INVERTED_INDEX"
            }
        ];

        for (const indexDefinition of indexDefinitions) {
            await this.makeRequest('/indexes/create', 'POST', {
                collectionName,
                dbName: database,
                indexParams: [indexDefinition]
            });
        }
    } catch (failure) {
        console.error(`❌ Failed to create hybrid indexes for collection '${collectionName}':`, failure);
        throw failure;
    }
}
/**
 * Insert documents into a hybrid collection through the REST API.
 *
 * Rows carry the text content and the dense vector only; no `sparse_vector`
 * value is sent. Document metadata is serialized to a JSON string.
 *
 * @param collectionName Target collection.
 * @param documents Documents to insert.
 * @throws Error when the response code is non-zero; any failure is logged and rethrown.
 */
async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void> {
    await this.ensureInitialized();
    try {
        const restConfig = this.config as MilvusRestfulConfig;

        const rows = documents.map(
            ({ id, content, vector, relativePath, startLine, endLine, fileExtension, metadata }) => ({
                id,
                content,
                vector,
                relativePath,
                startLine,
                endLine,
                fileExtension,
                metadata: JSON.stringify(metadata),
            })
        );

        const insertResponse = await this.makeRequest('/entities/insert', 'POST', {
            collectionName,
            dbName: restConfig.database,
            data: rows
        });

        // A zero code means success; anything else is surfaced as an error.
        const succeeded = insertResponse.code === 0;
        if (!succeeded) {
            throw new Error(`Insert failed: ${insertResponse.message || 'Unknown error'}`);
        }
    } catch (failure) {
        console.error(`❌ Failed to insert hybrid documents to collection '${collectionName}':`, failure);
        throw failure;
    }
}
/**
 * Run a hybrid (dense vector + BM25 text) search through the REST API.
 *
 * Only the first two entries of `searchRequests` are read: index 0 is sent as the
 * dense request and index 1 as the sparse/text request. The rerank strategy sent to
 * the server is fixed to RRF with k = 100; `options.rerank` is not read here.
 *
 * @param collectionName Collection to search.
 * @param searchRequests Dense request at index 0, sparse/text request at index 1.
 * @param options Optional overall result limit (`options.limit`).
 * @returns Hits mapped to HybridSearchResult; vectors are returned as empty arrays.
 * @throws Error when the response code is non-zero; any failure is logged and rethrown.
 */
async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]> {
    await this.ensureInitialized();
    try {
        const restfulConfig = this.config as MilvusRestfulConfig;
        console.log(`🔍 Preparing hybrid search for collection: ${collectionName}`);
        // Prepare search requests according to Milvus REST API hybrid search specification
        // For dense vector search - data must be array of vectors: [[0.1, 0.2, 0.3, ...]]
        const search_param_1 = {
            // An array input is wrapped once; a non-array input is wrapped twice.
            data: Array.isArray(searchRequests[0].data) ? [searchRequests[0].data] : [[searchRequests[0].data]],
            annsField: searchRequests[0].anns_field, // "vector"
            limit: searchRequests[0].limit,
            outputFields: ["*"],
            searchParams: {
                metricType: "COSINE",
                // Falls back to nprobe = 10 when the request carries no param.
                params: searchRequests[0].param || { "nprobe": 10 }
            }
        };
        // For sparse vector search - data must be array of queries: ["query text"]
        const search_param_2 = {
            data: Array.isArray(searchRequests[1].data) ? searchRequests[1].data : [searchRequests[1].data],
            annsField: searchRequests[1].anns_field, // "sparse_vector"
            limit: searchRequests[1].limit,
            outputFields: ["*"],
            searchParams: {
                metricType: "BM25",
                // Falls back to drop_ratio_search = 0.2 when the request carries no param.
                params: searchRequests[1].param || { "drop_ratio_search": 0.2 }
            }
        };
        // Rerank is hard-coded; the caller's options.rerank is not consulted.
        const rerank_strategy = {
            strategy: "rrf",
            params: {
                k: 100
            }
        };
        // Debug output: a summary of each sub-request (vector length / truncated query text).
        console.log(`🔍 Dense search params:`, JSON.stringify({
            annsField: search_param_1.annsField,
            limit: search_param_1.limit,
            data_length: Array.isArray(search_param_1.data[0]) ? search_param_1.data[0].length : 'N/A',
            searchParams: search_param_1.searchParams
        }, null, 2));
        console.log(`🔍 Sparse search params:`, JSON.stringify({
            annsField: search_param_2.annsField,
            limit: search_param_2.limit,
            // The first 50 characters are logged and '...' is always appended.
            query_text: typeof search_param_2.data[0] === 'string' ? search_param_2.data[0].substring(0, 50) + '...' : 'N/A',
            searchParams: search_param_2.searchParams
        }, null, 2));
        const hybridSearchRequest = {
            collectionName,
            dbName: restfulConfig.database,
            search: [search_param_1, search_param_2],
            rerank: rerank_strategy,
            // Overall limit: options.limit, else the dense request's limit, else 10.
            limit: options?.limit || searchRequests[0]?.limit || 10,
            outputFields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'],
        };
        console.log(`🔍 Complete REST API request:`, JSON.stringify({
            collectionName: hybridSearchRequest.collectionName,
            dbName: hybridSearchRequest.dbName,
            search_count: hybridSearchRequest.search.length,
            rerank: hybridSearchRequest.rerank,
            limit: hybridSearchRequest.limit,
            outputFields: hybridSearchRequest.outputFields
        }, null, 2));
        console.log(`🔍 Executing REST API hybrid search...`);
        const response = await this.makeRequest('/entities/hybrid_search', 'POST', hybridSearchRequest);
        // A non-zero response code is treated as a failure.
        if (response.code !== 0) {
            throw new Error(`Hybrid search failed: ${response.message || 'Unknown error'}`);
        }
        const results = response.data || [];
        console.log(`✅ Found ${results.length} results from hybrid search`);
        // Transform response to HybridSearchResult format
        return results.map((result: any) => ({
            document: {
                id: result.id,
                content: result.content,
                vector: [], // Vector not returned in search results
                sparse_vector: [], // Vector not returned in search results
                relativePath: result.relativePath,
                startLine: result.startLine,
                endLine: result.endLine,
                fileExtension: result.fileExtension,
                // metadata is stored as a JSON string; a missing value parses to {}.
                metadata: JSON.parse(result.metadata || '{}'),
            },
            // Prefer score, then distance, then 0 (falsy values fall through).
            score: result.score || result.distance || 0,
        }));
    } catch (error) {
        console.error(`❌ Failed to perform hybrid search on collection '${collectionName}':`, error);
        throw error;
    }
}
}

View File

@@ -1,9 +1,12 @@
import { MilvusClient, DataType, MetricType } from '@zilliz/milvus2-sdk-node';
import { MilvusClient, DataType, MetricType, FunctionType } from '@zilliz/milvus2-sdk-node';
import {
VectorDocument,
SearchOptions,
VectorSearchResult,
VectorDatabase,
HybridSearchRequest,
HybridSearchOptions,
HybridSearchResult,
COLLECTION_LIMIT_MESSAGE
} from './types';
import { ClusterManager } from './zilliz-utils';
@@ -298,4 +301,231 @@ export class MilvusVectorDatabase implements VectorDatabase {
throw error;
}
}
/**
 * Create a hybrid-search collection through the Milvus SDK client.
 *
 * The collection stores a dense `vector` field and a `sparse_vector` field; a BM25
 * function is declared with `content` as input and `sparse_vector` as output.
 * Both vector fields are indexed, then the collection is loaded and described.
 *
 * @param collectionName Name of the collection to create.
 * @param dimension Dimension of the dense `vector` field.
 * @param description Optional description; a generated one is used when omitted.
 */
async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void> {
    await this.ensureInitialized();
    console.log('Beginning hybrid collection creation:', collectionName);
    console.log('Collection dimension:', dimension);

    const fieldDefinitions = [
        {
            name: 'id',
            description: 'Document ID',
            data_type: DataType.VarChar,
            max_length: 512,
            is_primary_key: true,
        },
        {
            // Analyzer is enabled on the field that feeds the BM25 function.
            name: 'content',
            description: 'Full text content for BM25 and storage',
            data_type: DataType.VarChar,
            max_length: 65535,
            enable_analyzer: true,
        },
        {
            name: 'vector',
            description: 'Dense vector embedding',
            data_type: DataType.FloatVector,
            dim: dimension,
        },
        {
            name: 'sparse_vector',
            description: 'Sparse vector embedding from BM25',
            data_type: DataType.SparseFloatVector,
        },
        {
            name: 'relativePath',
            description: 'Relative path to the codebase',
            data_type: DataType.VarChar,
            max_length: 1024,
        },
        {
            name: 'startLine',
            description: 'Start line number of the chunk',
            data_type: DataType.Int64,
        },
        {
            name: 'endLine',
            description: 'End line number of the chunk',
            data_type: DataType.Int64,
        },
        {
            name: 'fileExtension',
            description: 'File extension',
            data_type: DataType.VarChar,
            max_length: 32,
        },
        {
            name: 'metadata',
            description: 'Additional document metadata as JSON string',
            data_type: DataType.VarChar,
            max_length: 65535,
        },
    ];

    // BM25 function wiring: content -> sparse_vector.
    const bm25Functions = [
        {
            name: "content_bm25_emb",
            description: "content bm25 function",
            type: FunctionType.BM25,
            input_field_names: ["content"],
            output_field_names: ["sparse_vector"],
            params: {},
        },
    ];

    const creationRequest = {
        collection_name: collectionName,
        description: description || `Hybrid code context collection: ${collectionName}`,
        fields: fieldDefinitions,
        functions: bm25Functions,
    };
    await createCollectionWithLimitCheck(this.client!, creationRequest);

    // One index per vector field: dense first, then sparse.
    const vectorIndexRequests = [
        {
            collection_name: collectionName,
            field_name: 'vector',
            index_type: 'AUTOINDEX',
            metric_type: MetricType.COSINE,
        },
        {
            collection_name: collectionName,
            field_name: 'sparse_vector',
            index_type: 'SPARSE_INVERTED_INDEX',
            metric_type: MetricType.BM25,
        },
    ];
    for (const indexRequest of vectorIndexRequests) {
        await this.client!.createIndex(indexRequest);
    }

    // Load the collection, then describe it; the describe result itself is not used.
    const target = { collection_name: collectionName };
    await this.client!.loadCollection({ ...target });
    await this.client!.describeCollection({ ...target });
}
/**
 * Insert documents into a hybrid collection through the Milvus SDK client.
 *
 * Rows carry the text content and the dense vector only; no `sparse_vector`
 * value is sent. Document metadata is serialized to a JSON string.
 *
 * @param collectionName Target collection.
 * @param documents Documents to insert.
 */
async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void> {
    await this.ensureInitialized();

    const rows = documents.map(
        ({ id, content, vector, relativePath, startLine, endLine, fileExtension, metadata }) => ({
            id,
            content,
            vector,
            relativePath,
            startLine,
            endLine,
            fileExtension,
            metadata: JSON.stringify(metadata),
        })
    );

    await this.client!.insert({
        collection_name: collectionName,
        data: rows,
    });
}
/**
 * Run a hybrid (dense vector + BM25 text) search through the Milvus SDK client.
 *
 * Only the first two entries of `searchRequests` are used: index 0 is the dense
 * request and index 1 is the sparse/text request.
 *
 * @param collectionName Collection to search.
 * @param searchRequests Dense request at index 0, sparse/text request at index 1.
 * @param options Optional overall `limit` and `rerank` strategy. When `rerank` is
 *                omitted, RRF with k = 100 is used.
 * @returns Hits mapped to HybridSearchResult; vectors are returned as empty arrays.
 * @throws Error when fewer than two search requests are supplied; any failure is
 *         logged and rethrown.
 */
async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]> {
    await this.ensureInitialized();
    try {
        console.log(`🔍 Preparing hybrid search for collection: ${collectionName}`);

        // Fail with a clear message instead of a TypeError on searchRequests[1].
        if (searchRequests.length < 2) {
            throw new Error(`Hybrid search requires a dense and a sparse search request, but received ${searchRequests.length}`);
        }
        const [denseRequest, sparseRequest] = searchRequests;

        // Prepare search requests in the correct Milvus format
        const search_param_1 = {
            data: Array.isArray(denseRequest.data) ? denseRequest.data : [denseRequest.data],
            anns_field: denseRequest.anns_field, // "vector"
            param: denseRequest.param, // {"nprobe": 10}
            limit: denseRequest.limit
        };
        const search_param_2 = {
            data: sparseRequest.data, // query text for sparse search
            anns_field: sparseRequest.anns_field, // "sparse_vector"
            param: sparseRequest.param, // {"drop_ratio_search": 0.2}
            limit: sparseRequest.limit
        };

        // Honor the caller's rerank strategy; fall back to RRF (k = 100) when none is given.
        const requestedRerank = options?.rerank;
        const rerank_strategy = requestedRerank
            ? { strategy: requestedRerank.strategy, params: requestedRerank.params ?? {} }
            : { strategy: "rrf", params: { k: 100 } };

        console.log(`🔍 Dense search params:`, JSON.stringify({
            anns_field: search_param_1.anns_field,
            param: search_param_1.param,
            limit: search_param_1.limit,
            // data is the flat query vector here, so its own length is the value to report.
            data_length: Array.isArray(search_param_1.data) ? search_param_1.data.length : 'N/A'
        }, null, 2));
        console.log(`🔍 Sparse search params:`, JSON.stringify({
            anns_field: search_param_2.anns_field,
            param: search_param_2.param,
            limit: search_param_2.limit,
            query_text: typeof search_param_2.data === 'string' ? search_param_2.data.substring(0, 50) + '...' : 'N/A'
        }, null, 2));
        console.log(`🔍 Rerank strategy:`, JSON.stringify(rerank_strategy, null, 2));

        // Execute hybrid search using the correct client.search format
        const searchParams = {
            collection_name: collectionName,
            data: [search_param_1, search_param_2],
            // Overall limit: options.limit, else the dense request's limit, else 10.
            limit: options?.limit || denseRequest.limit || 10,
            rerank: rerank_strategy,
            output_fields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'],
        };
        console.log(`🔍 Complete search request:`, JSON.stringify({
            collection_name: searchParams.collection_name,
            data_count: searchParams.data.length,
            limit: searchParams.limit,
            rerank: searchParams.rerank,
            output_fields: searchParams.output_fields
        }, null, 2));

        const searchResult = await this.client!.search(searchParams);
        console.log(`🔍 Search executed, processing results...`);
        if (!searchResult.results || searchResult.results.length === 0) {
            console.log(`⚠️ No results returned from Milvus search`);
            return [];
        }
        console.log(`✅ Found ${searchResult.results.length} results from hybrid search`);

        // Transform results to HybridSearchResult format
        return searchResult.results.map((result: any) => ({
            document: {
                id: result.id,
                content: result.content,
                vector: [],
                sparse_vector: [],
                relativePath: result.relativePath,
                startLine: result.startLine,
                endLine: result.endLine,
                fileExtension: result.fileExtension,
                // metadata is stored as a JSON string; a missing value parses to {}.
                metadata: JSON.parse(result.metadata || '{}'),
            },
            score: result.score,
        }));
    } catch (error) {
        console.error(`❌ Failed to perform hybrid search on collection '${collectionName}':`, error);
        throw error;
    }
}
}

View File

@@ -16,11 +16,34 @@ export interface SearchOptions {
threshold?: number;
}
// New interfaces for hybrid search
/**
 * One sub-request of a hybrid search, targeting a single vector field.
 */
export interface HybridSearchRequest {
    /** Query payload: a dense query vector, or the raw query text for the sparse (BM25) field. */
    data: number[] | string; // Query vector or text
    /** Name of the vector field to search. */
    anns_field: string; // Vector field name (vector or sparse_vector)
    /** Field-specific search parameters, passed through to the vector database. */
    param: Record<string, any>; // Search parameters
    /** Maximum number of hits requested for this sub-request. */
    limit: number;
}
/**
 * Options that apply to a hybrid search as a whole rather than to one sub-request.
 */
export interface HybridSearchOptions {
    /** Strategy used to merge the hits of the individual sub-requests. */
    rerank?: RerankStrategy;
    /** Maximum number of merged results to return. */
    limit?: number;
}
/**
 * Describes how hits from several search requests are combined into one ranking.
 */
export interface RerankStrategy {
    /** Name of the reranking strategy. */
    strategy: 'rrf' | 'weighted';
    /** Strategy-specific parameters, e.g. `{ k: 100 }` for 'rrf'. */
    params?: Record<string, any>;
}
/**
 * A single hit returned by a dense vector search.
 */
export interface VectorSearchResult {
    /** The matched document. */
    document: VectorDocument;
    /** Score reported for this match. */
    score: number;
}
/**
 * A single hit returned by a hybrid search; same shape as VectorSearchResult.
 */
export interface HybridSearchResult {
    /** The matched document. */
    document: VectorDocument;
    /** Score reported for this match after reranking. */
    score: number;
}
export interface VectorDatabase {
/**
* Create collection
@@ -30,6 +53,14 @@ export interface VectorDatabase {
*/
createCollection(collectionName: string, dimension: number, description?: string): Promise<void>;
/**
* Create collection with hybrid search support
* @param collectionName Collection name
* @param dimension Dense vector dimension
* @param description Collection description
*/
createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void>;
/**
* Drop collection
* @param collectionName Collection name
@@ -54,6 +85,13 @@ export interface VectorDatabase {
*/
insert(collectionName: string, documents: VectorDocument[]): Promise<void>;
/**
* Insert hybrid vector documents
* @param collectionName Collection name
* @param documents Document array
*/
insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void>;
/**
* Search similar vectors
* @param collectionName Collection name
@@ -62,6 +100,14 @@ export interface VectorDatabase {
*/
search(collectionName: string, queryVector: number[], options?: SearchOptions): Promise<VectorSearchResult[]>;
/**
* Hybrid search with multiple vector fields
* @param collectionName Collection name
* @param searchRequests Array of search requests for different fields
* @param options Hybrid search options including reranking
*/
hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]>;
/**
* Delete documents
* @param collectionName Collection name
@@ -70,11 +116,11 @@ export interface VectorDatabase {
delete(collectionName: string, ids: string[]): Promise<void>;
/**
* Query documents by filter
* Query documents with filter conditions
* @param collectionName Collection name
* @param filter Filter expression string
* @param filter Filter expression
* @param outputFields Fields to return
* @param limit Maximum number of results to return (optional)
* @param limit Maximum number of results
*/
query(collectionName: string, filter: string, outputFields: string[], limit?: number): Promise<Record<string, any>[]>;
}

View File

@@ -61,8 +61,8 @@ export class ToolHandlers {
// Check each collection for codebase path
for (const collectionName of collections) {
try {
// Skip collections that don't match the code_chunks pattern
if (!collectionName.startsWith('code_chunks_')) {
// Skip collections that don't match the code_chunks pattern (support both legacy and new collections)
if (!collectionName.startsWith('code_chunks_') && !collectionName.startsWith('hybrid_code_chunks_')) {
console.log(`[SYNC-CLOUD] ⏭️ Skipping non-code collection: ${collectionName}`);
continue;
}
@@ -218,38 +218,19 @@ export class ToolHandlers {
// CRITICAL: Pre-index collection creation validation
try {
const normalizedPath = path.resolve(absolutePath);
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
console.log(`[INDEX-VALIDATION] 🔍 Validating collection creation capability`);
console.log(`[INDEX-VALIDATION] 🔍 Validating collection creation for: ${collectionName}`);
// Get embedding dimension for collection creation
const embeddingProvider = this.context['embedding'];
const dimension = embeddingProvider.getDimension();
// If force reindex, clear existing collection first
if (forceReindex) {
console.log(`[INDEX-VALIDATION] 🧹 Force reindex enabled, clearing existing collection: ${collectionName}`);
try {
await this.context['vectorDatabase'].dropCollection(collectionName);
console.log(`[INDEX-VALIDATION] ✅ Existing collection cleared: ${collectionName}`);
} catch (dropError: any) {
// Collection might not exist, which is fine
console.log(`[INDEX-VALIDATION] Collection ${collectionName} does not exist or already cleared`);
}
// Check if collection can be created (this will be handled entirely by context.ts)
const hasExistingIndex = await this.context.hasIndex(absolutePath);
if (hasExistingIndex && forceReindex) {
console.log(`[INDEX-VALIDATION] Force reindex enabled, existing index will be cleared`);
await this.context.clearIndex(absolutePath);
console.log(`[INDEX-VALIDATION] ✅ Existing index cleared for re-indexing`);
} else if (hasExistingIndex) {
console.log(`[INDEX-VALIDATION] Index already exists for this codebase`);
}
// Attempt to create collection - this will throw COLLECTION_LIMIT_MESSAGE if limit reached
await this.context['vectorDatabase'].createCollection(
collectionName,
dimension,
`Claude Context collection: ${collectionName}`
);
// If creation succeeds, immediately drop the test collection
await this.context['vectorDatabase'].dropCollection(collectionName);
console.log(`[INDEX-VALIDATION] ✅ Collection creation validated successfully`);
console.log(`[INDEX-VALIDATION] ✅ Collection creation validation completed`);
} catch (validationError: any) {
const errorMessage = typeof validationError === 'string' ? validationError :
@@ -352,14 +333,9 @@ export class ToolHandlers {
console.warn(`[BACKGROUND-INDEX] Non-AST splitter '${splitterType}' requested; falling back to AST splitter`);
}
// Generate collection name
const normalizedPath = path.resolve(absolutePath);
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
// Load ignore patterns from files first (including .ignore, .gitignore, etc.)
await this.context['loadGitignorePatterns'](absolutePath);
// Initialize file synchronizer with proper ignore patterns (including project-specific patterns)
const { FileSynchronizer } = await import("@zilliz/claude-context-core");
const ignorePatterns = this.context['ignorePatterns'] || [];
@@ -367,7 +343,9 @@ export class ToolHandlers {
const synchronizer = new FileSynchronizer(absolutePath, ignorePatterns);
await synchronizer.initialize();
// Store synchronizer in the context's internal map
// Store synchronizer in the context (let context manage collection names)
await this.context['prepareCollection'](absolutePath);
const collectionName = this.context['getCollectionName'](absolutePath);
this.context['synchronizers'].set(collectionName, synchronizer);
if (contextForThisTask !== this.context) {
contextForThisTask['synchronizers'].set(collectionName, synchronizer);
@@ -471,7 +449,7 @@ export class ToolHandlers {
// Log embedding provider information before search
const embeddingProvider = this.context['embedding'];
console.log(`[SEARCH] 🧠 Using embedding provider: ${embeddingProvider.getProvider()} for semantic search`);
console.log(`[SEARCH] 🧠 Using embedding provider: ${embeddingProvider.getProvider()} for search`);
console.log(`[SEARCH] 🔍 Generating embeddings for query using ${embeddingProvider.getProvider()}...`);
// Search in the specified codebase
@@ -505,7 +483,7 @@ export class ToolHandlers {
return `${index + 1}. Code snippet (${result.language}) [${codebaseInfo}]\n` +
` Location: ${location}\n` +
` Score: ${result.score.toFixed(3)}\n` +
` Rank: ${index + 1}\n` +
` Context: \n\`\`\`${result.language}\n${context}\n\`\`\`\n`;
}).join('\n');

View File

@@ -1,5 +1,5 @@
import * as fs from "fs";
import { Context } from "@zilliz/claude-context-core";
import { Context, FileSynchronizer } from "@zilliz/claude-context-core";
import { SnapshotManager } from "./snapshot.js";
export class SyncManager {
@@ -79,6 +79,11 @@ export class SyncManager {
console.error(`[SYNC-DEBUG] Error syncing codebase '${codebasePath}' after ${codebaseElapsed}ms:`, error);
console.error(`[SYNC-DEBUG] Error stack:`, error.stack);
if (error.message.includes('Failed to query Milvus')) {
// The collection may have been deleted manually, so delete the snapshot file
await FileSynchronizer.deleteSnapshot(codebasePath);
}
// Log additional error details
if (error.code) {
console.error(`[SYNC-DEBUG] Error code: ${error.code}`);

View File

@@ -1,7 +1,6 @@
import * as vscode from 'vscode';
import { Context } from '@zilliz/claude-context-core';
import * as path from 'path';
import * as crypto from 'crypto';
export class IndexCommand {
private context: Context;
@@ -78,10 +77,9 @@ export class IndexCommand {
const { FileSynchronizer } = await import("@zilliz/claude-context-core");
const synchronizer = new FileSynchronizer(selectedFolder.uri.fsPath, this.context['ignorePatterns'] || []);
await synchronizer.initialize();
// Store synchronizer in the context's internal map using the same collection name generation logic
const normalizedPath = path.resolve(selectedFolder.uri.fsPath);
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
// Store synchronizer in the context's internal map using the collection name from context
await this.context['prepareCollection'](selectedFolder.uri.fsPath);
const collectionName = this.context['getCollectionName'](selectedFolder.uri.fsPath);
this.context['synchronizers'].set(collectionName, synchronizer);
// Start indexing with progress callback

View File

@@ -52,13 +52,25 @@ export class SearchCommand {
}
const codebasePath = workspaceFolders[0].uri.fsPath;
// Use the new semantic search service
// Check if index exists
progress.report({ increment: 20, message: 'Checking index...' });
const hasIndex = await this.context.hasIndex(codebasePath);
if (!hasIndex) {
vscode.window.showErrorMessage('Index not found. Please index the codebase first.');
return;
}
// Use semantic search
const query: SearchQuery = {
term: searchTerm,
includeContent: true,
limit: 20
};
console.log('🔍 Using semantic search...');
progress.report({ increment: 50, message: 'Executing semantic search...' });
const results = await this.context.semanticSearch(
codebasePath,
query.term,
@@ -66,7 +78,7 @@ export class SearchCommand {
0.3 // similarity threshold
);
progress.report({ increment: 100, message: 'Semantic search complete!' });
progress.report({ increment: 100, message: 'Search complete!' });
if (results.length === 0) {
vscode.window.showInformationMessage(`No results found for "${searchTerm}"`);
@@ -77,7 +89,7 @@ export class SearchCommand {
const quickPickItems = this.generateQuickPickItems(results, searchTerm, codebasePath);
const selected = await vscode.window.showQuickPick(quickPickItems, {
placeHolder: `Found ${results.length} results for "${searchTerm}"`,
placeHolder: `Found ${results.length} results for "${searchTerm}" using semantic search`,
matchOnDescription: true,
matchOnDetail: true
});
@@ -88,8 +100,8 @@ export class SearchCommand {
});
} catch (error) {
console.error('Semantic search failed:', error);
vscode.window.showErrorMessage(`Semantic search failed: ${error}`);
console.error('Search failed:', error);
vscode.window.showErrorMessage(`Search failed: ${error}. Please ensure the codebase is indexed.`);
}
}
@@ -135,7 +147,13 @@ export class SearchCommand {
}
const codebasePath = workspaceFolders[0].uri.fsPath;
// Use the semantic search service
// Check if index exists
const hasIndex = await this.context.hasIndex(codebasePath);
if (!hasIndex) {
throw new Error('Index not found. Please index the codebase first.');
}
console.log('🔍 Using semantic search for webview...');
return await this.context.semanticSearch(
codebasePath,
searchTerm,
@@ -148,23 +166,31 @@ export class SearchCommand {
* Check if index exists for the given codebase path
*/
async hasIndex(codebasePath: string): Promise<boolean> {
return await this.context.hasIndex(codebasePath);
try {
return await this.context.hasIndex(codebasePath);
} catch (error) {
console.error('Error checking index existence:', error);
return false;
}
}
/**
* Generate quick pick items for VS Code
*/
private generateQuickPickItems(results: SemanticSearchResult[], searchTerm: string, workspaceRoot?: string) {
return results.slice(0, 20).map(result => {
return results.slice(0, 20).map((result, index) => {
let displayPath = result.relativePath;
// Truncate content for display
const truncatedContent = result.content.length <= 150
? result.content
: result.content.substring(0, 150) + '...';
// Add rank info to description
const rankText = ` (rank: ${index + 1})`;
return {
label: `$(file-code) ${displayPath}`,
description: `1 match in ${displayPath}`,
description: `$(search) semantic search${rankText}`,
detail: truncatedContent,
result: result
};

View File

@@ -197,7 +197,7 @@ class SemanticSearchController {
this.resultsList.innerHTML = '<div class="no-results">No matches found</div>';
} else {
this.resultsHeader.textContent = `${results.length} result${results.length === 1 ? '' : 's'} for "${query}"`;
this.resultsList.innerHTML = results.map(result => this.createResultHTML(result)).join('');
this.resultsList.innerHTML = results.map((result, index) => this.createResultHTML(result, index + 1)).join('');
}
this.resultsContainer.style.display = 'block';
}
@@ -205,9 +205,10 @@ class SemanticSearchController {
/**
* Create HTML for a single result item
* @param {Object} result - Result object
* @param {number} rank - Result rank (1-indexed)
* @returns {string} HTML string
*/
createResultHTML(result) {
createResultHTML(result, rank) {
return `
<div class="result-item" onclick="searchController.openFile('${result.relativePath}', ${result.line}, ${result.startLine}, ${result.endLine})">
<div class="result-file">
@@ -216,7 +217,7 @@ class SemanticSearchController {
</div>
<div class="result-preview">${result.preview}</div>
<div class="result-context">${result.context}</div>
${result.score ? `<div class="result-score" style="margin-top: 8px; text-align: right;">Similarity: ${(result.score * 100).toFixed(1)}%</div>` : ''}
<div class="result-rank" style="margin-top: 8px; text-align: right;">Rank: ${rank}</div>
</div>
`;
}

View File

@@ -184,7 +184,7 @@ body {
white-space: nowrap;
}
.result-score {
.result-rank {
font-size: 10px;
color: var(--vscode-descriptionForeground);
background-color: var(--vscode-badge-background);