From 419d40e3aa4a1a47170dd51062aba4a44a3a4c6a Mon Sep 17 00:00:00 2001 From: Shawn Zheng <88486803+Shawnzheng011019@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:58:20 +0800 Subject: [PATCH] [Refactor]: Change search method to BM25 & Dense vector Hybrid search (#119) * [Refactor]: Change search method to BM25 & Dense vector Hybrid search * [Restructure] 1.Refactor codebase to use Context class 2.Add hybrid mode environment variable Signed-off-by: ShawnZheng --------- Signed-off-by: ShawnZheng --- packages/core/src/context.ts | 289 +++++++++++++----- packages/core/src/vectordb/index.ts | 4 + .../src/vectordb/milvus-restful-vectordb.ts | 276 +++++++++++++++++ packages/core/src/vectordb/milvus-vectordb.ts | 232 +++++++++++++- packages/core/src/vectordb/types.ts | 52 +++- packages/mcp/src/handlers.ts | 58 ++-- packages/mcp/src/sync.ts | 7 +- .../src/commands/indexCommand.ts | 8 +- .../src/commands/searchCommand.ts | 44 ++- .../src/webview/scripts/semanticSearch.js | 7 +- .../src/webview/styles/semanticSearch.css | 2 +- 11 files changed, 845 insertions(+), 134 deletions(-) diff --git a/packages/core/src/context.ts b/packages/core/src/context.ts index e07ec85..f3492dd 100644 --- a/packages/core/src/context.ts +++ b/packages/core/src/context.ts @@ -11,7 +11,10 @@ import { import { VectorDatabase, VectorDocument, - VectorSearchResult + VectorSearchResult, + HybridSearchRequest, + HybridSearchOptions, + HybridSearchResult } from './vectordb'; import { SemanticSearchResult } from './types'; import { envManager } from './utils/env-manager'; @@ -152,17 +155,30 @@ export class Context { } /** - * Generate collection name based on codebase path + * Get isHybrid setting from environment variable with default true */ - private getCollectionName(codebasePath: string): string { - const normalizedPath = path.resolve(codebasePath); - const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); - return `code_chunks_${hash.substring(0, 8)}`; + private getIsHybrid(): boolean { + const isHybridEnv = envManager.get('HYBRID_MODE'); + if (isHybridEnv === undefined || isHybridEnv === null) { + return true; // Default to true + } + return isHybridEnv.toLowerCase() === 'true'; } /** - * Index entire codebase - * @param codebasePath Codebase path + * Generate collection name based on codebase path and hybrid mode + */ + private getCollectionName(codebasePath: string): string { + const isHybrid = this.getIsHybrid(); + const normalizedPath = path.resolve(codebasePath); + const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); + const prefix = isHybrid === true ? 'hybrid_code_chunks' : 'code_chunks'; + return `${prefix}_${hash.substring(0, 8)}`; + } + + /** + * Index a codebase for semantic search + * @param codebasePath Codebase root path * @param progressCallback Optional progress callback function * @returns Indexing statistics */ @@ -170,7 +186,9 @@ export class Context { codebasePath: string, progressCallback?: (progress: { phase: string; current: number; total: number; percentage: number }) => void ): Promise<{ indexedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> { - console.log(`๐Ÿš€ Starting to index codebase: ${codebasePath}`); + const isHybrid = this.getIsHybrid(); + const searchType = isHybrid === true ? 'hybrid search' : 'semantic search'; + console.log(`๐Ÿš€ Starting to index codebase with ${searchType}: ${codebasePath}`); // 1. Load ignore patterns from various ignore files await this.loadGitignorePatterns(codebasePath); @@ -239,7 +257,7 @@ export class Context { if (!synchronizer) { // Load project-specific ignore patterns before creating FileSynchronizer await this.loadGitignorePatterns(codebasePath); - + // To be safe, let's initialize if it's not there. const newSynchronizer = new FileSynchronizer(codebasePath, this.ignorePatterns); await newSynchronizer.initialize(); @@ -317,37 +335,118 @@ export class Context { } /** - * Semantic search + * Semantic search with unified implementation * @param codebasePath Codebase path to search in * @param query Search query * @param topK Number of results to return * @param threshold Similarity threshold */ async semanticSearch(codebasePath: string, query: string, topK: number = 5, threshold: number = 0.5): Promise { - console.log(`๐Ÿ” Executing semantic search: "${query}" in ${codebasePath}`); + const isHybrid = this.getIsHybrid(); + const searchType = isHybrid === true ? 'hybrid search' : 'semantic search'; + console.log(`๐Ÿ” Executing ${searchType}: "${query}" in ${codebasePath}`); - // 1. Generate query vector - const queryEmbedding: EmbeddingVector = await this.embedding.embed(query); + const collectionName = this.getCollectionName(codebasePath); + console.log(`๐Ÿ” Using collection: ${collectionName}`); - // 2. Search in vector database - const searchResults: VectorSearchResult[] = await this.vectorDatabase.search( - this.getCollectionName(codebasePath), - queryEmbedding.vector, - { topK, threshold } - ); + // Check if collection exists and has data + const hasCollection = await this.vectorDatabase.hasCollection(collectionName); + if (!hasCollection) { + console.log(`โš ๏ธ Collection '${collectionName}' does not exist. Please index the codebase first.`); + return []; + } - // 3. Convert to semantic search result format - const results: SemanticSearchResult[] = searchResults.map(result => ({ - content: result.document.content, - relativePath: result.document.relativePath, - startLine: result.document.startLine, - endLine: result.document.endLine, - language: result.document.metadata.language || 'unknown', - score: result.score - })); + if (isHybrid === true) { + try { + // Check collection stats to see if it has data + const stats = await this.vectorDatabase.query(collectionName, '', ['id'], 1); + console.log(`๐Ÿ” Collection '${collectionName}' exists and appears to have data`); + } catch (error) { + console.log(`โš ๏ธ Collection '${collectionName}' exists but may be empty or not properly indexed:`, error); + } - console.log(`โœ… Found ${results.length} relevant results`); - return results; + // 1. Generate query vector + console.log(`๐Ÿ” Generating embeddings for query: "${query}"`); + const queryEmbedding: EmbeddingVector = await this.embedding.embed(query); + console.log(`โœ… Generated embedding vector with dimension: ${queryEmbedding.vector.length}`); + console.log(`๐Ÿ” First 5 embedding values: [${queryEmbedding.vector.slice(0, 5).join(', ')}]`); + + // 2. Prepare hybrid search requests + const searchRequests: HybridSearchRequest[] = [ + { + data: queryEmbedding.vector, + anns_field: "vector", + param: { "nprobe": 10 }, + limit: topK + }, + { + data: query, + anns_field: "sparse_vector", + param: { "drop_ratio_search": 0.2 }, + limit: topK + } + ]; + + console.log(`๐Ÿ” Search request 1 (dense): anns_field="${searchRequests[0].anns_field}", vector_dim=${queryEmbedding.vector.length}, limit=${searchRequests[0].limit}`); + console.log(`๐Ÿ” Search request 2 (sparse): anns_field="${searchRequests[1].anns_field}", query_text="${query}", limit=${searchRequests[1].limit}`); + + // 3. Execute hybrid search + console.log(`๐Ÿ” Executing hybrid search with RRF reranking...`); + const searchResults: HybridSearchResult[] = await this.vectorDatabase.hybridSearch( + collectionName, + searchRequests, + { + rerank: { + strategy: 'rrf', + params: { k: 100 } + }, + limit: topK + } + ); + + console.log(`๐Ÿ” Raw search results count: ${searchResults.length}`); + + // 4. Convert to semantic search result format + const results: SemanticSearchResult[] = searchResults.map(result => ({ + content: result.document.content, + relativePath: result.document.relativePath, + startLine: result.document.startLine, + endLine: result.document.endLine, + language: result.document.metadata.language || 'unknown', + score: result.score + })); + + console.log(`โœ… Found ${results.length} relevant hybrid results`); + if (results.length > 0) { + console.log(`๐Ÿ” Top result score: ${results[0].score}, path: ${results[0].relativePath}`); + } + + return results; + } else { + // Regular semantic search + // 1. Generate query vector + const queryEmbedding: EmbeddingVector = await this.embedding.embed(query); + + // 2. Search in vector database + const searchResults: VectorSearchResult[] = await this.vectorDatabase.search( + collectionName, + queryEmbedding.vector, + { topK, threshold } + ); + + // 3. Convert to semantic search result format + const results: SemanticSearchResult[] = searchResults.map(result => ({ + content: result.document.content, + relativePath: result.document.relativePath, + startLine: result.document.startLine, + endLine: result.document.endLine, + language: result.document.metadata.language || 'unknown', + score: result.score + })); + + console.log(`โœ… Found ${results.length} relevant results`); + return results; + } } /** @@ -458,10 +557,18 @@ export class Context { * Prepare vector collection */ private async prepareCollection(codebasePath: string): Promise { - // Create new collection - console.log(`๐Ÿ”ง Preparing vector collection for codebase: ${codebasePath}`); + const isHybrid = this.getIsHybrid(); + const collectionType = isHybrid === true ? 'hybrid vector' : 'vector'; + console.log(`๐Ÿ”ง Preparing ${collectionType} collection for codebase: ${codebasePath}`); const collectionName = this.getCollectionName(codebasePath); + // Check if collection already exists + const collectionExists = await this.vectorDatabase.hasCollection(collectionName); + if (collectionExists) { + console.log(`๐Ÿ“‹ Collection ${collectionName} already exists, skipping creation`); + return; + } + // For Ollama embeddings, ensure dimension is detected before creating collection if (this.embedding.getProvider() === 'Ollama' && typeof (this.embedding as any).initializeDimension === 'function') { await (this.embedding as any).initializeDimension(); @@ -469,7 +576,13 @@ export class Context { const dimension = this.embedding.getDimension(); const dirName = path.basename(codebasePath); - await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`); + + if (isHybrid === true) { + await this.vectorDatabase.createHybridCollection(collectionName, dimension, `Hybrid Index for ${dirName}`); + } else { + await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`); + } + console.log(`โœ… Collection ${collectionName} created successfully (dimension: ${dimension})`); } @@ -517,6 +630,7 @@ export class Context { codebasePath: string, onFileProcessed?: (filePath: string, fileIndex: number, totalFiles: number) => void ): Promise<{ processedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> { + const isHybrid = this.getIsHybrid(); const EMBEDDING_BATCH_SIZE = Math.max(1, parseInt(envManager.get('EMBEDDING_BATCH_SIZE') || '100', 10)); const CHUNK_LIMIT = 450000; console.log(`๐Ÿ”ง Using EMBEDDING_BATCH_SIZE: ${EMBEDDING_BATCH_SIZE}`); @@ -551,8 +665,8 @@ export class Context { try { await this.processChunkBuffer(chunkBuffer); } catch (error) { - // TODO: - console.error(`โŒ Failed to process chunk batch: ${error}`); + const searchType = isHybrid === true ? 'hybrid' : 'regular'; + console.error(`โŒ Failed to process chunk batch for ${searchType}: ${error}`); } finally { chunkBuffer = []; // Always clear buffer, even on failure } @@ -580,11 +694,12 @@ export class Context { // Process any remaining chunks in the buffer if (chunkBuffer.length > 0) { - console.log(`๐Ÿ“ Processing final batch of ${chunkBuffer.length} chunks`); + const searchType = isHybrid === true ? 'hybrid' : 'regular'; + console.log(`๐Ÿ“ Processing final batch of ${chunkBuffer.length} chunks for ${searchType}`); try { await this.processChunkBuffer(chunkBuffer); } catch (error) { - console.error(`โŒ Failed to process final chunk batch: ${error}`); + console.error(`โŒ Failed to process final chunk batch for ${searchType}: ${error}`); } } @@ -608,7 +723,9 @@ export class Context { // Estimate tokens (rough estimation: 1 token โ‰ˆ 4 characters) const estimatedTokens = chunks.reduce((sum, chunk) => sum + Math.ceil(chunk.content.length / 4), 0); - console.log(`๐Ÿ”„ Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens)`); + const isHybrid = this.getIsHybrid(); + const searchType = isHybrid === true ? 'hybrid' : 'regular'; + console.log(`๐Ÿ”„ Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens) for ${searchType}`); await this.processChunkBatch(chunks, codebasePath); } @@ -616,45 +733,75 @@ export class Context { * Process a batch of chunks */ private async processChunkBatch(chunks: CodeChunk[], codebasePath: string): Promise { + const isHybrid = this.getIsHybrid(); + // Generate embedding vectors const chunkContents = chunks.map(chunk => chunk.content); - const embeddings: EmbeddingVector[] = await this.embedding.embedBatch(chunkContents); + const embeddings = await this.embedding.embedBatch(chunkContents); - // Prepare vector documents - const documents: VectorDocument[] = chunks.map((chunk, index) => { - if (!chunk.metadata.filePath) { - throw new Error(`Missing filePath in chunk metadata at index ${index}`); - } - - const relativePath = path.relative(codebasePath, chunk.metadata.filePath); - const fileExtension = path.extname(chunk.metadata.filePath); - - // Extract metadata that should be stored separately - const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata; - - return { - id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content), - vector: embeddings[index].vector, - content: chunk.content, - relativePath, - startLine: chunk.metadata.startLine || 0, - endLine: chunk.metadata.endLine || 0, - fileExtension, - metadata: { - ...restMetadata, - codebasePath, - language: chunk.metadata.language || 'unknown', - chunkIndex: index + if (isHybrid === true) { + // Create hybrid vector documents + const documents: VectorDocument[] = chunks.map((chunk, index) => { + if (!chunk.metadata.filePath) { + throw new Error(`Missing filePath in chunk metadata at index ${index}`); } - }; - }); - // Store to vector database - await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents); + const relativePath = path.relative(codebasePath, chunk.metadata.filePath); + const fileExtension = path.extname(chunk.metadata.filePath); + const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata; + + return { + id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content), + content: chunk.content, // Full text content for BM25 and storage + vector: embeddings[index].vector, // Dense vector + relativePath, + startLine: chunk.metadata.startLine || 0, + endLine: chunk.metadata.endLine || 0, + fileExtension, + metadata: { + ...restMetadata, + codebasePath, + language: chunk.metadata.language || 'unknown', + chunkIndex: index + } + }; + }); + + // Store to vector database + await this.vectorDatabase.insertHybrid(this.getCollectionName(codebasePath), documents); + } else { + // Create regular vector documents + const documents: VectorDocument[] = chunks.map((chunk, index) => { + if (!chunk.metadata.filePath) { + throw new Error(`Missing filePath in chunk metadata at index ${index}`); + } + + const relativePath = path.relative(codebasePath, chunk.metadata.filePath); + const fileExtension = path.extname(chunk.metadata.filePath); + const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata; + + return { + id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content), + vector: embeddings[index].vector, + content: chunk.content, + relativePath, + startLine: chunk.metadata.startLine || 0, + endLine: chunk.metadata.endLine || 0, + fileExtension, + metadata: { + ...restMetadata, + codebasePath, + language: chunk.metadata.language || 'unknown', + chunkIndex: index + } + }; + }); + + // Store to vector database + await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents); + } } - - /** * Get programming language based on file extension */ diff --git a/packages/core/src/vectordb/index.ts b/packages/core/src/vectordb/index.ts index eb4530d..b4231c0 100644 --- a/packages/core/src/vectordb/index.ts +++ b/packages/core/src/vectordb/index.ts @@ -4,6 +4,10 @@ export { SearchOptions, VectorSearchResult, VectorDatabase, + HybridSearchRequest, + HybridSearchOptions, + HybridSearchResult, + RerankStrategy, COLLECTION_LIMIT_MESSAGE } from './types'; diff --git a/packages/core/src/vectordb/milvus-restful-vectordb.ts b/packages/core/src/vectordb/milvus-restful-vectordb.ts index f1c61d3..c50dd05 100644 --- a/packages/core/src/vectordb/milvus-restful-vectordb.ts +++ b/packages/core/src/vectordb/milvus-restful-vectordb.ts @@ -3,6 +3,9 @@ import { SearchOptions, VectorSearchResult, VectorDatabase, + HybridSearchRequest, + HybridSearchOptions, + HybridSearchResult, COLLECTION_LIMIT_MESSAGE } from './types'; import { ClusterManager } from './zilliz-utils'; @@ -467,4 +470,277 @@ export class MilvusRestfulVectorDatabase implements VectorDatabase { throw error; } } + + async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise { + try { + const restfulConfig = this.config as MilvusRestfulConfig; + + const collectionSchema = { + collectionName, + dbName: restfulConfig.database, + schema: { + enableDynamicField: false, + functions: [ + { + name: "content_bm25_emb", + description: "content bm25 function", + type: "BM25", + inputFieldNames: ["content"], + outputFieldNames: ["sparse_vector"], + params: {}, + }, + ], + fields: [ + { + fieldName: "id", + dataType: "VarChar", + isPrimary: true, + elementTypeParams: { + max_length: 512 + } + }, + { + fieldName: "content", + dataType: "VarChar", + elementTypeParams: { + max_length: 65535, + enable_analyzer: true + } + }, + { + fieldName: "vector", + dataType: "FloatVector", + elementTypeParams: { + dim: dimension + } + }, + { + fieldName: "sparse_vector", + dataType: "SparseFloatVector" + }, + { + fieldName: "relativePath", + dataType: "VarChar", + elementTypeParams: { + max_length: 1024 + } + }, + { + fieldName: "startLine", + dataType: "Int64" + }, + { + fieldName: "endLine", + dataType: "Int64" + }, + { + fieldName: "fileExtension", + dataType: "VarChar", + elementTypeParams: { + max_length: 32 + } + }, + { + fieldName: "metadata", + dataType: "VarChar", + elementTypeParams: { + max_length: 65535 + } + } + ] + } + }; + + // Step 1: Create collection with schema and functions + await createCollectionWithLimitCheck(this.makeRequest.bind(this), collectionSchema); + + // Step 2: Create indexes for both vector fields + await this.createHybridIndexes(collectionName); + + // Step 3: Load collection to memory for searching + await this.loadCollection(collectionName); + + } catch (error) { + console.error(`โŒ Failed to create hybrid collection '${collectionName}':`, error); + throw error; + } + } + + private async createHybridIndexes(collectionName: string): Promise { + try { + const restfulConfig = this.config as MilvusRestfulConfig; + + // Create index for dense vector + const denseIndexParams = { + collectionName, + dbName: restfulConfig.database, + indexParams: [ + { + fieldName: "vector", + indexName: "vector_index", + metricType: "COSINE", + index_type: "AUTOINDEX" + } + ] + }; + await this.makeRequest('/indexes/create', 'POST', denseIndexParams); + + // Create index for sparse vector + const sparseIndexParams = { + collectionName, + dbName: restfulConfig.database, + indexParams: [ + { + fieldName: "sparse_vector", + indexName: "sparse_vector_index", + metricType: "BM25", + index_type: "SPARSE_INVERTED_INDEX" + } + ] + }; + await this.makeRequest('/indexes/create', 'POST', sparseIndexParams); + + } catch (error) { + console.error(`โŒ Failed to create hybrid indexes for collection '${collectionName}':`, error); + throw error; + } + } + + async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise { + await this.ensureInitialized(); + + try { + const restfulConfig = this.config as MilvusRestfulConfig; + + const data = documents.map(doc => ({ + id: doc.id, + content: doc.content, + vector: doc.vector, + relativePath: doc.relativePath, + startLine: doc.startLine, + endLine: doc.endLine, + fileExtension: doc.fileExtension, + metadata: JSON.stringify(doc.metadata), + })); + + const insertRequest = { + collectionName, + dbName: restfulConfig.database, + data: data + }; + + const response = await this.makeRequest('/entities/insert', 'POST', insertRequest); + + if (response.code !== 0) { + throw new Error(`Insert failed: ${response.message || 'Unknown error'}`); + } + + } catch (error) { + console.error(`โŒ Failed to insert hybrid documents to collection '${collectionName}':`, error); + throw error; + } + } + + async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise { + await this.ensureInitialized(); + + try { + const restfulConfig = this.config as MilvusRestfulConfig; + + console.log(`๐Ÿ” Preparing hybrid search for collection: ${collectionName}`); + + // Prepare search requests according to Milvus REST API hybrid search specification + // For dense vector search - data must be array of vectors: [[0.1, 0.2, 0.3, ...]] + const search_param_1 = { + data: Array.isArray(searchRequests[0].data) ? [searchRequests[0].data] : [[searchRequests[0].data]], + annsField: searchRequests[0].anns_field, // "vector" + limit: searchRequests[0].limit, + outputFields: ["*"], + searchParams: { + metricType: "COSINE", + params: searchRequests[0].param || { "nprobe": 10 } + } + }; + + // For sparse vector search - data must be array of queries: ["query text"] + const search_param_2 = { + data: Array.isArray(searchRequests[1].data) ? searchRequests[1].data : [searchRequests[1].data], + annsField: searchRequests[1].anns_field, // "sparse_vector" + limit: searchRequests[1].limit, + outputFields: ["*"], + searchParams: { + metricType: "BM25", + params: searchRequests[1].param || { "drop_ratio_search": 0.2 } + } + }; + + const rerank_strategy = { + strategy: "rrf", + params: { + k: 100 + } + }; + + console.log(`๐Ÿ” Dense search params:`, JSON.stringify({ + annsField: search_param_1.annsField, + limit: search_param_1.limit, + data_length: Array.isArray(search_param_1.data[0]) ? search_param_1.data[0].length : 'N/A', + searchParams: search_param_1.searchParams + }, null, 2)); + console.log(`๐Ÿ” Sparse search params:`, JSON.stringify({ + annsField: search_param_2.annsField, + limit: search_param_2.limit, + query_text: typeof search_param_2.data[0] === 'string' ? search_param_2.data[0].substring(0, 50) + '...' : 'N/A', + searchParams: search_param_2.searchParams + }, null, 2)); + + const hybridSearchRequest = { + collectionName, + dbName: restfulConfig.database, + search: [search_param_1, search_param_2], + rerank: rerank_strategy, + limit: options?.limit || searchRequests[0]?.limit || 10, + outputFields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'], + }; + + console.log(`๐Ÿ” Complete REST API request:`, JSON.stringify({ + collectionName: hybridSearchRequest.collectionName, + dbName: hybridSearchRequest.dbName, + search_count: hybridSearchRequest.search.length, + rerank: hybridSearchRequest.rerank, + limit: hybridSearchRequest.limit, + outputFields: hybridSearchRequest.outputFields + }, null, 2)); + + console.log(`๐Ÿ” Executing REST API hybrid search...`); + const response = await this.makeRequest('/entities/hybrid_search', 'POST', hybridSearchRequest); + + if (response.code !== 0) { + throw new Error(`Hybrid search failed: ${response.message || 'Unknown error'}`); + } + + const results = response.data || []; + console.log(`โœ… Found ${results.length} results from hybrid search`); + + // Transform response to HybridSearchResult format + return results.map((result: any) => ({ + document: { + id: result.id, + content: result.content, + vector: [], // Vector not returned in search results + sparse_vector: [], // Vector not returned in search results + relativePath: result.relativePath, + startLine: result.startLine, + endLine: result.endLine, + fileExtension: result.fileExtension, + metadata: JSON.parse(result.metadata || '{}'), + }, + score: result.score || result.distance || 0, + })); + + } catch (error) { + console.error(`โŒ Failed to perform hybrid search on collection '${collectionName}':`, error); + throw error; + } + } } \ No newline at end of file diff --git a/packages/core/src/vectordb/milvus-vectordb.ts b/packages/core/src/vectordb/milvus-vectordb.ts index 2a88493..1b35c30 100644 --- a/packages/core/src/vectordb/milvus-vectordb.ts +++ b/packages/core/src/vectordb/milvus-vectordb.ts @@ -1,9 +1,12 @@ -import { MilvusClient, DataType, MetricType } from '@zilliz/milvus2-sdk-node'; +import { MilvusClient, DataType, MetricType, FunctionType } from '@zilliz/milvus2-sdk-node'; import { VectorDocument, SearchOptions, VectorSearchResult, VectorDatabase, + HybridSearchRequest, + HybridSearchOptions, + HybridSearchResult, COLLECTION_LIMIT_MESSAGE } from './types'; import { ClusterManager } from './zilliz-utils'; @@ -298,4 +301,231 @@ export class MilvusVectorDatabase implements VectorDatabase { throw error; } } + + async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise { + await this.ensureInitialized(); + + console.log('Beginning hybrid collection creation:', collectionName); + console.log('Collection dimension:', dimension); + + const schema = [ + { + name: 'id', + description: 'Document ID', + data_type: DataType.VarChar, + max_length: 512, + is_primary_key: true, + }, + { + name: 'content', + description: 'Full text content for BM25 and storage', + data_type: DataType.VarChar, + max_length: 65535, + enable_analyzer: true, + }, + { + name: 'vector', + description: 'Dense vector embedding', + data_type: DataType.FloatVector, + dim: dimension, + }, + { + name: 'sparse_vector', + description: 'Sparse vector embedding from BM25', + data_type: DataType.SparseFloatVector, + }, + { + name: 'relativePath', + description: 'Relative path to the codebase', + data_type: DataType.VarChar, + max_length: 1024, + }, + { + name: 'startLine', + description: 'Start line number of the chunk', + data_type: DataType.Int64, + }, + { + name: 'endLine', + description: 'End line number of the chunk', + data_type: DataType.Int64, + }, + { + name: 'fileExtension', + description: 'File extension', + data_type: DataType.VarChar, + max_length: 32, + }, + { + name: 'metadata', + description: 'Additional document metadata as JSON string', + data_type: DataType.VarChar, + max_length: 65535, + }, + ]; + + // Add BM25 function + const functions = [ + { + name: "content_bm25_emb", + description: "content bm25 function", + type: FunctionType.BM25, + input_field_names: ["content"], + output_field_names: ["sparse_vector"], + params: {}, + }, + ]; + + const createCollectionParams = { + collection_name: collectionName, + description: description || `Hybrid code context collection: ${collectionName}`, + fields: schema, + functions: functions, + }; + + await createCollectionWithLimitCheck(this.client!, createCollectionParams); + + // Create indexes for both vector fields + // Index for dense vector + const denseIndexParams = { + collection_name: collectionName, + field_name: 'vector', + index_type: 'AUTOINDEX', + metric_type: MetricType.COSINE, + }; + await this.client!.createIndex(denseIndexParams); + + // Index for sparse vector + const sparseIndexParams = { + collection_name: collectionName, + field_name: 'sparse_vector', + index_type: 'SPARSE_INVERTED_INDEX', + metric_type: MetricType.BM25, + }; + await this.client!.createIndex(sparseIndexParams); + + // Load collection to memory + await this.client!.loadCollection({ + collection_name: collectionName, + }); + + // Verify collection is created correctly + await this.client!.describeCollection({ + collection_name: collectionName, + }); + } + + async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise { + await this.ensureInitialized(); + + const data = documents.map(doc => ({ + id: doc.id, + content: doc.content, + vector: doc.vector, + relativePath: doc.relativePath, + startLine: doc.startLine, + endLine: doc.endLine, + fileExtension: doc.fileExtension, + metadata: JSON.stringify(doc.metadata), + })); + + await this.client!.insert({ + collection_name: collectionName, + data: data, + }); + } + + async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise { + await this.ensureInitialized(); + + try { + // Generate OpenAI embedding for the first search request (dense) + console.log(`๐Ÿ” Preparing hybrid search for collection: ${collectionName}`); + + // Prepare search requests in the correct Milvus format + const search_param_1 = { + data: Array.isArray(searchRequests[0].data) ? searchRequests[0].data : [searchRequests[0].data], + anns_field: searchRequests[0].anns_field, // "vector" + param: searchRequests[0].param, // {"nprobe": 10} + limit: searchRequests[0].limit + }; + + const search_param_2 = { + data: searchRequests[1].data, // query text for sparse search + anns_field: searchRequests[1].anns_field, // "sparse_vector" + param: searchRequests[1].param, // {"drop_ratio_search": 0.2} + limit: searchRequests[1].limit + }; + + // Set rerank strategy to RRF (100) by default + const rerank_strategy = { + strategy: "rrf", + params: { + k: 100 + } + }; + + console.log(`๐Ÿ” Dense search params:`, JSON.stringify({ + anns_field: search_param_1.anns_field, + param: search_param_1.param, + limit: search_param_1.limit, + data_length: Array.isArray(search_param_1.data[0]) ? search_param_1.data[0].length : 'N/A' + }, null, 2)); + console.log(`๐Ÿ” Sparse search params:`, JSON.stringify({ + anns_field: search_param_2.anns_field, + param: search_param_2.param, + limit: search_param_2.limit, + query_text: typeof search_param_2.data === 'string' ? search_param_2.data.substring(0, 50) + '...' : 'N/A' + }, null, 2)); + console.log(`๐Ÿ” Rerank strategy:`, JSON.stringify(rerank_strategy, null, 2)); + + // Execute hybrid search using the correct client.search format + const searchParams = { + collection_name: collectionName, + data: [search_param_1, search_param_2], + limit: options?.limit || searchRequests[0]?.limit || 10, + rerank: rerank_strategy, + output_fields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'], + }; + + console.log(`๐Ÿ” Complete search request:`, JSON.stringify({ + collection_name: searchParams.collection_name, + data_count: searchParams.data.length, + limit: searchParams.limit, + rerank: searchParams.rerank, + output_fields: searchParams.output_fields + }, null, 2)); + + const searchResult = await this.client!.search(searchParams); + + console.log(`๐Ÿ” Search executed, processing results...`); + + if (!searchResult.results || searchResult.results.length === 0) { + console.log(`โš ๏ธ No results returned from Milvus search`); + return []; + } + + console.log(`โœ… Found ${searchResult.results.length} results from hybrid search`); + + // Transform results to HybridSearchResult format + return searchResult.results.map((result: any) => ({ + document: { + id: result.id, + content: result.content, + vector: [], + sparse_vector: [], + relativePath: result.relativePath, + startLine: result.startLine, + endLine: result.endLine, + fileExtension: result.fileExtension, + metadata: JSON.parse(result.metadata || '{}'), + }, + score: result.score, + })); + + } catch (error) { + console.error(`โŒ Failed to perform hybrid search on collection '${collectionName}':`, error); + throw error; + } + } } \ No newline at end of file diff --git a/packages/core/src/vectordb/types.ts b/packages/core/src/vectordb/types.ts index 84e9227..7f0e9ac 100644 --- a/packages/core/src/vectordb/types.ts +++ b/packages/core/src/vectordb/types.ts @@ -16,11 +16,34 @@ export interface SearchOptions { threshold?: number; } +// New interfaces for hybrid search +export interface HybridSearchRequest { + data: number[] | string; // Query vector or text + anns_field: string; // Vector field name (vector or sparse_vector) + param: Record; // Search parameters + limit: number; +} + +export interface HybridSearchOptions { + rerank?: RerankStrategy; + limit?: number; +} + +export interface RerankStrategy { + strategy: 'rrf' | 'weighted'; + params?: Record; +} + export interface VectorSearchResult { document: VectorDocument; score: number; } +export interface HybridSearchResult { + document: VectorDocument; + score: number; +} + export interface VectorDatabase { /** * Create collection @@ -30,6 +53,14 @@ export interface VectorDatabase { */ createCollection(collectionName: string, dimension: number, description?: string): Promise; + /** + * Create collection with hybrid search support + * @param collectionName Collection name + * @param dimension Dense vector dimension + * @param description Collection description + */ + createHybridCollection(collectionName: string, dimension: number, description?: string): Promise; + /** * Drop collection * @param collectionName Collection name @@ -54,6 +85,13 @@ export interface VectorDatabase { */ insert(collectionName: string, documents: VectorDocument[]): Promise; + /** + * Insert hybrid vector documents + * @param collectionName Collection name + * @param documents Document array + */ + insertHybrid(collectionName: string, documents: VectorDocument[]): Promise; + /** * Search similar vectors * @param collectionName Collection name @@ -62,6 +100,14 @@ export interface VectorDatabase { */ search(collectionName: string, queryVector: number[], options?: SearchOptions): Promise; + /** + * Hybrid search with multiple vector fields + * @param collectionName Collection name + * @param searchRequests Array of search requests for different fields + * @param options Hybrid search options including reranking + */ + hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise; + /** * Delete documents * @param collectionName Collection name @@ -70,11 +116,11 @@ export interface VectorDatabase { delete(collectionName: string, ids: string[]): Promise; /** - * Query documents by filter + * Query documents with filter conditions * @param collectionName Collection name - * @param filter Filter expression string + * @param filter Filter expression * @param outputFields Fields to return - * @param limit Maximum number of results to return (optional) + * @param limit Maximum number of results */ query(collectionName: string, filter: string, outputFields: string[], limit?: number): Promise[]>; } diff --git a/packages/mcp/src/handlers.ts b/packages/mcp/src/handlers.ts index ea63223..11df5b7 100644 --- a/packages/mcp/src/handlers.ts +++ b/packages/mcp/src/handlers.ts @@ -61,8 +61,8 @@ export class ToolHandlers { // Check each collection for codebase path for (const collectionName of collections) { try { - // Skip collections that don't match the code_chunks pattern - if (!collectionName.startsWith('code_chunks_')) { + // Skip collections that don't match the code_chunks pattern (support both legacy and new collections) + if (!collectionName.startsWith('code_chunks_') && !collectionName.startsWith('hybrid_code_chunks_')) { console.log(`[SYNC-CLOUD] โญ๏ธ Skipping non-code collection: ${collectionName}`); continue; } @@ -218,38 +218,19 @@ export class ToolHandlers { // CRITICAL: Pre-index collection creation validation try { - const normalizedPath = path.resolve(absolutePath); - const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); - const collectionName = `code_chunks_${hash.substring(0, 8)}`; + console.log(`[INDEX-VALIDATION] ๐Ÿ” Validating collection creation capability`); - console.log(`[INDEX-VALIDATION] ๐Ÿ” Validating collection creation for: ${collectionName}`); - - // Get embedding dimension for collection creation - const embeddingProvider = this.context['embedding']; - const dimension = embeddingProvider.getDimension(); - - // If force reindex, clear existing collection first - if (forceReindex) { - console.log(`[INDEX-VALIDATION] ๐Ÿงน Force reindex enabled, clearing existing collection: ${collectionName}`); - try { - await this.context['vectorDatabase'].dropCollection(collectionName); - console.log(`[INDEX-VALIDATION] โœ… Existing collection cleared: ${collectionName}`); - } catch (dropError: any) { - // Collection might not exist, which is fine - console.log(`[INDEX-VALIDATION] โ„น๏ธ Collection ${collectionName} does not exist or already cleared`); - } + // Check if collection can be created (this will be handled entirely by context.ts) + const hasExistingIndex = await this.context.hasIndex(absolutePath); + if (hasExistingIndex && forceReindex) { + console.log(`[INDEX-VALIDATION] โ„น๏ธ Force reindex enabled, existing index will be cleared`); + await this.context.clearIndex(absolutePath); + console.log(`[INDEX-VALIDATION] โœ… Existing index cleared for re-indexing`); + } else if (hasExistingIndex) { + console.log(`[INDEX-VALIDATION] โ„น๏ธ Index already exists for this codebase`); } - // Attempt to create collection - this will throw COLLECTION_LIMIT_MESSAGE if limit reached - await this.context['vectorDatabase'].createCollection( - collectionName, - dimension, - `Claude Context collection: ${collectionName}` - ); - - // If creation succeeds, immediately drop the test collection - await this.context['vectorDatabase'].dropCollection(collectionName); - console.log(`[INDEX-VALIDATION] โœ… Collection creation validated successfully`); + console.log(`[INDEX-VALIDATION] โœ… Collection creation validation completed`); } catch (validationError: any) { const errorMessage = typeof validationError === 'string' ? validationError : @@ -352,14 +333,9 @@ export class ToolHandlers { console.warn(`[BACKGROUND-INDEX] Non-AST splitter '${splitterType}' requested; falling back to AST splitter`); } - // Generate collection name - const normalizedPath = path.resolve(absolutePath); - const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); - const collectionName = `code_chunks_${hash.substring(0, 8)}`; - // Load ignore patterns from files first (including .ignore, .gitignore, etc.) await this.context['loadGitignorePatterns'](absolutePath); - + // Initialize file synchronizer with proper ignore patterns (including project-specific patterns) const { FileSynchronizer } = await import("@zilliz/claude-context-core"); const ignorePatterns = this.context['ignorePatterns'] || []; @@ -367,7 +343,9 @@ export class ToolHandlers { const synchronizer = new FileSynchronizer(absolutePath, ignorePatterns); await synchronizer.initialize(); - // Store synchronizer in the context's internal map + // Store synchronizer in the context (let context manage collection names) + await this.context['prepareCollection'](absolutePath); + const collectionName = this.context['getCollectionName'](absolutePath); this.context['synchronizers'].set(collectionName, synchronizer); if (contextForThisTask !== this.context) { contextForThisTask['synchronizers'].set(collectionName, synchronizer); @@ -471,7 +449,7 @@ export class ToolHandlers { // Log embedding provider information before search const embeddingProvider = this.context['embedding']; - console.log(`[SEARCH] ๐Ÿง  Using embedding provider: ${embeddingProvider.getProvider()} for semantic search`); + console.log(`[SEARCH] ๐Ÿง  Using embedding provider: ${embeddingProvider.getProvider()} for search`); console.log(`[SEARCH] ๐Ÿ” Generating embeddings for query using ${embeddingProvider.getProvider()}...`); // Search in the specified codebase @@ -505,7 +483,7 @@ export class ToolHandlers { return `${index + 1}. Code snippet (${result.language}) [${codebaseInfo}]\n` + ` Location: ${location}\n` + - ` Score: ${result.score.toFixed(3)}\n` + + ` Rank: ${index + 1}\n` + ` Context: \n\`\`\`${result.language}\n${context}\n\`\`\`\n`; }).join('\n'); diff --git a/packages/mcp/src/sync.ts b/packages/mcp/src/sync.ts index 6243c5b..0f53bab 100644 --- a/packages/mcp/src/sync.ts +++ b/packages/mcp/src/sync.ts @@ -1,5 +1,5 @@ import * as fs from "fs"; -import { Context } from "@zilliz/claude-context-core"; +import { Context, FileSynchronizer } from "@zilliz/claude-context-core"; import { SnapshotManager } from "./snapshot.js"; export class SyncManager { @@ -79,6 +79,11 @@ export class SyncManager { console.error(`[SYNC-DEBUG] Error syncing codebase '${codebasePath}' after ${codebaseElapsed}ms:`, error); console.error(`[SYNC-DEBUG] Error stack:`, error.stack); + if (error.message.includes('Failed to query Milvus')) { + // Collection maybe deleted manually, delete the snapshot file + await FileSynchronizer.deleteSnapshot(codebasePath); + } + // Log additional error details if (error.code) { console.error(`[SYNC-DEBUG] Error code: ${error.code}`); diff --git a/packages/vscode-extension/src/commands/indexCommand.ts b/packages/vscode-extension/src/commands/indexCommand.ts index f2554cf..fd2d957 100644 --- a/packages/vscode-extension/src/commands/indexCommand.ts +++ b/packages/vscode-extension/src/commands/indexCommand.ts @@ -1,7 +1,6 @@ import * as vscode from 'vscode'; import { Context } from '@zilliz/claude-context-core'; import * as path from 'path'; -import * as crypto from 'crypto'; export class IndexCommand { private context: Context; @@ -78,10 +77,9 @@ export class IndexCommand { const { FileSynchronizer } = await import("@zilliz/claude-context-core"); const synchronizer = new FileSynchronizer(selectedFolder.uri.fsPath, this.context['ignorePatterns'] || []); await synchronizer.initialize(); - // Store synchronizer in the context's internal map using the same collection name generation logic - const normalizedPath = path.resolve(selectedFolder.uri.fsPath); - const hash = crypto.createHash('md5').update(normalizedPath).digest('hex'); - const collectionName = `code_chunks_${hash.substring(0, 8)}`; + // Store synchronizer in the context's internal map using the collection name from context + await this.context['prepareCollection'](selectedFolder.uri.fsPath); + const collectionName = this.context['getCollectionName'](selectedFolder.uri.fsPath); this.context['synchronizers'].set(collectionName, synchronizer); // Start indexing with progress callback diff --git a/packages/vscode-extension/src/commands/searchCommand.ts b/packages/vscode-extension/src/commands/searchCommand.ts index e44878a..9fb739d 100644 --- a/packages/vscode-extension/src/commands/searchCommand.ts +++ b/packages/vscode-extension/src/commands/searchCommand.ts @@ -52,13 +52,25 @@ export class SearchCommand { } const codebasePath = workspaceFolders[0].uri.fsPath; - // Use the new semantic search service + // Check if index exists + progress.report({ increment: 20, message: 'Checking index...' }); + const hasIndex = await this.context.hasIndex(codebasePath); + + if (!hasIndex) { + vscode.window.showErrorMessage('Index not found. Please index the codebase first.'); + return; + } + + // Use semantic search const query: SearchQuery = { term: searchTerm, includeContent: true, limit: 20 }; + console.log('๐Ÿ” Using semantic search...'); + progress.report({ increment: 50, message: 'Executing semantic search...' }); + const results = await this.context.semanticSearch( codebasePath, query.term, @@ -66,7 +78,7 @@ export class SearchCommand { 0.3 // similarity threshold ); - progress.report({ increment: 100, message: 'Semantic search complete!' }); + progress.report({ increment: 100, message: 'Search complete!' }); if (results.length === 0) { vscode.window.showInformationMessage(`No results found for "${searchTerm}"`); @@ -77,7 +89,7 @@ export class SearchCommand { const quickPickItems = this.generateQuickPickItems(results, searchTerm, codebasePath); const selected = await vscode.window.showQuickPick(quickPickItems, { - placeHolder: `Found ${results.length} results for "${searchTerm}"`, + placeHolder: `Found ${results.length} results for "${searchTerm}" using semantic search`, matchOnDescription: true, matchOnDetail: true }); @@ -88,8 +100,8 @@ export class SearchCommand { }); } catch (error) { - console.error('Semantic search failed:', error); - vscode.window.showErrorMessage(`Semantic search failed: ${error}`); + console.error('Search failed:', error); + vscode.window.showErrorMessage(`Search failed: ${error}. Please ensure the codebase is indexed.`); } } @@ -135,7 +147,13 @@ export class SearchCommand { } const codebasePath = workspaceFolders[0].uri.fsPath; - // Use the semantic search service + // Check if index exists + const hasIndex = await this.context.hasIndex(codebasePath); + if (!hasIndex) { + throw new Error('Index not found. Please index the codebase first.'); + } + + console.log('๐Ÿ” Using semantic search for webview...'); return await this.context.semanticSearch( codebasePath, searchTerm, @@ -148,23 +166,31 @@ export class SearchCommand { * Check if index exists for the given codebase path */ async hasIndex(codebasePath: string): Promise { - return await this.context.hasIndex(codebasePath); + try { + return await this.context.hasIndex(codebasePath); + } catch (error) { + console.error('Error checking index existence:', error); + return false; + } } /** * Generate quick pick items for VS Code */ private generateQuickPickItems(results: SemanticSearchResult[], searchTerm: string, workspaceRoot?: string) { - return results.slice(0, 20).map(result => { + return results.slice(0, 20).map((result, index) => { let displayPath = result.relativePath; // Truncate content for display const truncatedContent = result.content.length <= 150 ? result.content : result.content.substring(0, 150) + '...'; + // Add rank info to description + const rankText = ` (rank: ${index + 1})`; + return { label: `$(file-code) ${displayPath}`, - description: `1 match in ${displayPath}`, + description: `$(search) semantic search${rankText}`, detail: truncatedContent, result: result }; diff --git a/packages/vscode-extension/src/webview/scripts/semanticSearch.js b/packages/vscode-extension/src/webview/scripts/semanticSearch.js index 079767f..2adf03a 100644 --- a/packages/vscode-extension/src/webview/scripts/semanticSearch.js +++ b/packages/vscode-extension/src/webview/scripts/semanticSearch.js @@ -197,7 +197,7 @@ class SemanticSearchController { this.resultsList.innerHTML = '
No matches found
'; } else { this.resultsHeader.textContent = `${results.length} result${results.length === 1 ? '' : 's'} for "${query}"`; - this.resultsList.innerHTML = results.map(result => this.createResultHTML(result)).join(''); + this.resultsList.innerHTML = results.map((result, index) => this.createResultHTML(result, index + 1)).join(''); } this.resultsContainer.style.display = 'block'; } @@ -205,9 +205,10 @@ class SemanticSearchController { /** * Create HTML for a single result item * @param {Object} result - Result object + * @param {number} rank - Result rank (1-indexed) * @returns {string} HTML string */ - createResultHTML(result) { + createResultHTML(result, rank) { return `
@@ -216,7 +217,7 @@ class SemanticSearchController {
${result.preview}
${result.context}
- ${result.score ? `
Similarity: ${(result.score * 100).toFixed(1)}%
` : ''} +
Rank: ${rank}
`; } diff --git a/packages/vscode-extension/src/webview/styles/semanticSearch.css b/packages/vscode-extension/src/webview/styles/semanticSearch.css index 24b9791..77070c8 100644 --- a/packages/vscode-extension/src/webview/styles/semanticSearch.css +++ b/packages/vscode-extension/src/webview/styles/semanticSearch.css @@ -184,7 +184,7 @@ body { white-space: nowrap; } -.result-score { +.result-rank { font-size: 10px; color: var(--vscode-descriptionForeground); background-color: var(--vscode-badge-background);