mirror of
https://github.com/zilliztech/claude-context.git
synced 2025-10-06 01:10:02 +03:00
[Refactor]: Change search method to BM25 & Dense vector Hybrid search (#119)
* [Refactor]: Change search method to BM25 & Dense vector Hybrid search * [Restructure] 1.Refactor codebase to use Context class 2.Add hybrid mode environment variable Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com> --------- Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com>
This commit is contained in:
@@ -11,7 +11,10 @@ import {
|
||||
import {
|
||||
VectorDatabase,
|
||||
VectorDocument,
|
||||
VectorSearchResult
|
||||
VectorSearchResult,
|
||||
HybridSearchRequest,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult
|
||||
} from './vectordb';
|
||||
import { SemanticSearchResult } from './types';
|
||||
import { envManager } from './utils/env-manager';
|
||||
@@ -152,17 +155,30 @@ export class Context {
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate collection name based on codebase path
|
||||
* Get isHybrid setting from environment variable with default true
|
||||
*/
|
||||
private getCollectionName(codebasePath: string): string {
|
||||
const normalizedPath = path.resolve(codebasePath);
|
||||
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
|
||||
return `code_chunks_${hash.substring(0, 8)}`;
|
||||
private getIsHybrid(): boolean {
    // Hybrid (BM25 + dense vector) mode is controlled by the HYBRID_MODE
    // environment variable, read through the shared env manager.
    const isHybridEnv = envManager.get('HYBRID_MODE');
    if (isHybridEnv === undefined || isHybridEnv === null) {
        return true; // Default to true when the variable is not set
    }
    // Trim before comparing so that values carrying stray whitespace
    // (e.g. "true " or "TRUE\n" from a .env file) are not silently treated
    // as "hybrid disabled". Any value other than "true" still disables it.
    return isHybridEnv.trim().toLowerCase() === 'true';
}
|
||||
|
||||
/**
|
||||
* Index entire codebase
|
||||
* @param codebasePath Codebase path
|
||||
* Generate collection name based on codebase path and hybrid mode
|
||||
*/
|
||||
private getCollectionName(codebasePath: string): string {
    // Hybrid and dense-only indexes live in differently prefixed collections,
    // so the mode is part of the collection name.
    const hybridEnabled = this.getIsHybrid();

    // The name suffix is the first 8 hex characters of the MD5 digest of the
    // resolved (absolute) codebase path.
    const absolutePath = path.resolve(codebasePath);
    const digest = crypto.createHash('md5').update(absolutePath).digest('hex');
    const shortHash = digest.substring(0, 8);

    if (hybridEnabled === true) {
        return `hybrid_code_chunks_${shortHash}`;
    }
    return `code_chunks_${shortHash}`;
}
|
||||
|
||||
/**
|
||||
* Index a codebase for semantic search
|
||||
* @param codebasePath Codebase root path
|
||||
* @param progressCallback Optional progress callback function
|
||||
* @returns Indexing statistics
|
||||
*/
|
||||
@@ -170,7 +186,9 @@ export class Context {
|
||||
codebasePath: string,
|
||||
progressCallback?: (progress: { phase: string; current: number; total: number; percentage: number }) => void
|
||||
): Promise<{ indexedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> {
|
||||
console.log(`🚀 Starting to index codebase: ${codebasePath}`);
|
||||
const isHybrid = this.getIsHybrid();
|
||||
const searchType = isHybrid === true ? 'hybrid search' : 'semantic search';
|
||||
console.log(`🚀 Starting to index codebase with ${searchType}: ${codebasePath}`);
|
||||
|
||||
// 1. Load ignore patterns from various ignore files
|
||||
await this.loadGitignorePatterns(codebasePath);
|
||||
@@ -239,7 +257,7 @@ export class Context {
|
||||
if (!synchronizer) {
|
||||
// Load project-specific ignore patterns before creating FileSynchronizer
|
||||
await this.loadGitignorePatterns(codebasePath);
|
||||
|
||||
|
||||
// To be safe, let's initialize if it's not there.
|
||||
const newSynchronizer = new FileSynchronizer(codebasePath, this.ignorePatterns);
|
||||
await newSynchronizer.initialize();
|
||||
@@ -317,37 +335,118 @@ export class Context {
|
||||
}
|
||||
|
||||
/**
|
||||
* Semantic search
|
||||
* Semantic search with unified implementation
|
||||
* @param codebasePath Codebase path to search in
|
||||
* @param query Search query
|
||||
* @param topK Number of results to return
|
||||
* @param threshold Similarity threshold
|
||||
*/
|
||||
async semanticSearch(codebasePath: string, query: string, topK: number = 5, threshold: number = 0.5): Promise<SemanticSearchResult[]> {
|
||||
console.log(`🔍 Executing semantic search: "${query}" in ${codebasePath}`);
|
||||
const isHybrid = this.getIsHybrid();
|
||||
const searchType = isHybrid === true ? 'hybrid search' : 'semantic search';
|
||||
console.log(`🔍 Executing ${searchType}: "${query}" in ${codebasePath}`);
|
||||
|
||||
// 1. Generate query vector
|
||||
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
|
||||
const collectionName = this.getCollectionName(codebasePath);
|
||||
console.log(`🔍 Using collection: ${collectionName}`);
|
||||
|
||||
// 2. Search in vector database
|
||||
const searchResults: VectorSearchResult[] = await this.vectorDatabase.search(
|
||||
this.getCollectionName(codebasePath),
|
||||
queryEmbedding.vector,
|
||||
{ topK, threshold }
|
||||
);
|
||||
// Check if collection exists and has data
|
||||
const hasCollection = await this.vectorDatabase.hasCollection(collectionName);
|
||||
if (!hasCollection) {
|
||||
console.log(`⚠️ Collection '${collectionName}' does not exist. Please index the codebase first.`);
|
||||
return [];
|
||||
}
|
||||
|
||||
// 3. Convert to semantic search result format
|
||||
const results: SemanticSearchResult[] = searchResults.map(result => ({
|
||||
content: result.document.content,
|
||||
relativePath: result.document.relativePath,
|
||||
startLine: result.document.startLine,
|
||||
endLine: result.document.endLine,
|
||||
language: result.document.metadata.language || 'unknown',
|
||||
score: result.score
|
||||
}));
|
||||
if (isHybrid === true) {
|
||||
try {
|
||||
// Check collection stats to see if it has data
|
||||
const stats = await this.vectorDatabase.query(collectionName, '', ['id'], 1);
|
||||
console.log(`🔍 Collection '${collectionName}' exists and appears to have data`);
|
||||
} catch (error) {
|
||||
console.log(`⚠️ Collection '${collectionName}' exists but may be empty or not properly indexed:`, error);
|
||||
}
|
||||
|
||||
console.log(`✅ Found ${results.length} relevant results`);
|
||||
return results;
|
||||
// 1. Generate query vector
|
||||
console.log(`🔍 Generating embeddings for query: "${query}"`);
|
||||
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
|
||||
console.log(`✅ Generated embedding vector with dimension: ${queryEmbedding.vector.length}`);
|
||||
console.log(`🔍 First 5 embedding values: [${queryEmbedding.vector.slice(0, 5).join(', ')}]`);
|
||||
|
||||
// 2. Prepare hybrid search requests
|
||||
const searchRequests: HybridSearchRequest[] = [
|
||||
{
|
||||
data: queryEmbedding.vector,
|
||||
anns_field: "vector",
|
||||
param: { "nprobe": 10 },
|
||||
limit: topK
|
||||
},
|
||||
{
|
||||
data: query,
|
||||
anns_field: "sparse_vector",
|
||||
param: { "drop_ratio_search": 0.2 },
|
||||
limit: topK
|
||||
}
|
||||
];
|
||||
|
||||
console.log(`🔍 Search request 1 (dense): anns_field="${searchRequests[0].anns_field}", vector_dim=${queryEmbedding.vector.length}, limit=${searchRequests[0].limit}`);
|
||||
console.log(`🔍 Search request 2 (sparse): anns_field="${searchRequests[1].anns_field}", query_text="${query}", limit=${searchRequests[1].limit}`);
|
||||
|
||||
// 3. Execute hybrid search
|
||||
console.log(`🔍 Executing hybrid search with RRF reranking...`);
|
||||
const searchResults: HybridSearchResult[] = await this.vectorDatabase.hybridSearch(
|
||||
collectionName,
|
||||
searchRequests,
|
||||
{
|
||||
rerank: {
|
||||
strategy: 'rrf',
|
||||
params: { k: 100 }
|
||||
},
|
||||
limit: topK
|
||||
}
|
||||
);
|
||||
|
||||
console.log(`🔍 Raw search results count: ${searchResults.length}`);
|
||||
|
||||
// 4. Convert to semantic search result format
|
||||
const results: SemanticSearchResult[] = searchResults.map(result => ({
|
||||
content: result.document.content,
|
||||
relativePath: result.document.relativePath,
|
||||
startLine: result.document.startLine,
|
||||
endLine: result.document.endLine,
|
||||
language: result.document.metadata.language || 'unknown',
|
||||
score: result.score
|
||||
}));
|
||||
|
||||
console.log(`✅ Found ${results.length} relevant hybrid results`);
|
||||
if (results.length > 0) {
|
||||
console.log(`🔍 Top result score: ${results[0].score}, path: ${results[0].relativePath}`);
|
||||
}
|
||||
|
||||
return results;
|
||||
} else {
|
||||
// Regular semantic search
|
||||
// 1. Generate query vector
|
||||
const queryEmbedding: EmbeddingVector = await this.embedding.embed(query);
|
||||
|
||||
// 2. Search in vector database
|
||||
const searchResults: VectorSearchResult[] = await this.vectorDatabase.search(
|
||||
collectionName,
|
||||
queryEmbedding.vector,
|
||||
{ topK, threshold }
|
||||
);
|
||||
|
||||
// 3. Convert to semantic search result format
|
||||
const results: SemanticSearchResult[] = searchResults.map(result => ({
|
||||
content: result.document.content,
|
||||
relativePath: result.document.relativePath,
|
||||
startLine: result.document.startLine,
|
||||
endLine: result.document.endLine,
|
||||
language: result.document.metadata.language || 'unknown',
|
||||
score: result.score
|
||||
}));
|
||||
|
||||
console.log(`✅ Found ${results.length} relevant results`);
|
||||
return results;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -458,10 +557,18 @@ export class Context {
|
||||
* Prepare vector collection
|
||||
*/
|
||||
private async prepareCollection(codebasePath: string): Promise<void> {
|
||||
// Create new collection
|
||||
console.log(`🔧 Preparing vector collection for codebase: ${codebasePath}`);
|
||||
const isHybrid = this.getIsHybrid();
|
||||
const collectionType = isHybrid === true ? 'hybrid vector' : 'vector';
|
||||
console.log(`🔧 Preparing ${collectionType} collection for codebase: ${codebasePath}`);
|
||||
const collectionName = this.getCollectionName(codebasePath);
|
||||
|
||||
// Check if collection already exists
|
||||
const collectionExists = await this.vectorDatabase.hasCollection(collectionName);
|
||||
if (collectionExists) {
|
||||
console.log(`📋 Collection ${collectionName} already exists, skipping creation`);
|
||||
return;
|
||||
}
|
||||
|
||||
// For Ollama embeddings, ensure dimension is detected before creating collection
|
||||
if (this.embedding.getProvider() === 'Ollama' && typeof (this.embedding as any).initializeDimension === 'function') {
|
||||
await (this.embedding as any).initializeDimension();
|
||||
@@ -469,7 +576,13 @@ export class Context {
|
||||
|
||||
const dimension = this.embedding.getDimension();
|
||||
const dirName = path.basename(codebasePath);
|
||||
await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`);
|
||||
|
||||
if (isHybrid === true) {
|
||||
await this.vectorDatabase.createHybridCollection(collectionName, dimension, `Hybrid Index for ${dirName}`);
|
||||
} else {
|
||||
await this.vectorDatabase.createCollection(collectionName, dimension, `Index for ${dirName}`);
|
||||
}
|
||||
|
||||
console.log(`✅ Collection ${collectionName} created successfully (dimension: ${dimension})`);
|
||||
}
|
||||
|
||||
@@ -517,6 +630,7 @@ export class Context {
|
||||
codebasePath: string,
|
||||
onFileProcessed?: (filePath: string, fileIndex: number, totalFiles: number) => void
|
||||
): Promise<{ processedFiles: number; totalChunks: number; status: 'completed' | 'limit_reached' }> {
|
||||
const isHybrid = this.getIsHybrid();
|
||||
const EMBEDDING_BATCH_SIZE = Math.max(1, parseInt(envManager.get('EMBEDDING_BATCH_SIZE') || '100', 10));
|
||||
const CHUNK_LIMIT = 450000;
|
||||
console.log(`🔧 Using EMBEDDING_BATCH_SIZE: ${EMBEDDING_BATCH_SIZE}`);
|
||||
@@ -551,8 +665,8 @@ export class Context {
|
||||
try {
|
||||
await this.processChunkBuffer(chunkBuffer);
|
||||
} catch (error) {
|
||||
// TODO:
|
||||
console.error(`❌ Failed to process chunk batch: ${error}`);
|
||||
const searchType = isHybrid === true ? 'hybrid' : 'regular';
|
||||
console.error(`❌ Failed to process chunk batch for ${searchType}: ${error}`);
|
||||
} finally {
|
||||
chunkBuffer = []; // Always clear buffer, even on failure
|
||||
}
|
||||
@@ -580,11 +694,12 @@ export class Context {
|
||||
|
||||
// Process any remaining chunks in the buffer
|
||||
if (chunkBuffer.length > 0) {
|
||||
console.log(`📝 Processing final batch of ${chunkBuffer.length} chunks`);
|
||||
const searchType = isHybrid === true ? 'hybrid' : 'regular';
|
||||
console.log(`📝 Processing final batch of ${chunkBuffer.length} chunks for ${searchType}`);
|
||||
try {
|
||||
await this.processChunkBuffer(chunkBuffer);
|
||||
} catch (error) {
|
||||
console.error(`❌ Failed to process final chunk batch: ${error}`);
|
||||
console.error(`❌ Failed to process final chunk batch for ${searchType}: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -608,7 +723,9 @@ export class Context {
|
||||
// Estimate tokens (rough estimation: 1 token ≈ 4 characters)
|
||||
const estimatedTokens = chunks.reduce((sum, chunk) => sum + Math.ceil(chunk.content.length / 4), 0);
|
||||
|
||||
console.log(`🔄 Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens)`);
|
||||
const isHybrid = this.getIsHybrid();
|
||||
const searchType = isHybrid === true ? 'hybrid' : 'regular';
|
||||
console.log(`🔄 Processing batch of ${chunks.length} chunks (~${estimatedTokens} tokens) for ${searchType}`);
|
||||
await this.processChunkBatch(chunks, codebasePath);
|
||||
}
|
||||
|
||||
@@ -616,45 +733,75 @@ export class Context {
|
||||
* Process a batch of chunks
|
||||
*/
|
||||
private async processChunkBatch(chunks: CodeChunk[], codebasePath: string): Promise<void> {
|
||||
const isHybrid = this.getIsHybrid();
|
||||
|
||||
// Generate embedding vectors
|
||||
const chunkContents = chunks.map(chunk => chunk.content);
|
||||
const embeddings: EmbeddingVector[] = await this.embedding.embedBatch(chunkContents);
|
||||
const embeddings = await this.embedding.embedBatch(chunkContents);
|
||||
|
||||
// Prepare vector documents
|
||||
const documents: VectorDocument[] = chunks.map((chunk, index) => {
|
||||
if (!chunk.metadata.filePath) {
|
||||
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
|
||||
}
|
||||
|
||||
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
|
||||
const fileExtension = path.extname(chunk.metadata.filePath);
|
||||
|
||||
// Extract metadata that should be stored separately
|
||||
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
|
||||
|
||||
return {
|
||||
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
|
||||
vector: embeddings[index].vector,
|
||||
content: chunk.content,
|
||||
relativePath,
|
||||
startLine: chunk.metadata.startLine || 0,
|
||||
endLine: chunk.metadata.endLine || 0,
|
||||
fileExtension,
|
||||
metadata: {
|
||||
...restMetadata,
|
||||
codebasePath,
|
||||
language: chunk.metadata.language || 'unknown',
|
||||
chunkIndex: index
|
||||
if (isHybrid === true) {
|
||||
// Create hybrid vector documents
|
||||
const documents: VectorDocument[] = chunks.map((chunk, index) => {
|
||||
if (!chunk.metadata.filePath) {
|
||||
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
// Store to vector database
|
||||
await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents);
|
||||
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
|
||||
const fileExtension = path.extname(chunk.metadata.filePath);
|
||||
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
|
||||
|
||||
return {
|
||||
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
|
||||
content: chunk.content, // Full text content for BM25 and storage
|
||||
vector: embeddings[index].vector, // Dense vector
|
||||
relativePath,
|
||||
startLine: chunk.metadata.startLine || 0,
|
||||
endLine: chunk.metadata.endLine || 0,
|
||||
fileExtension,
|
||||
metadata: {
|
||||
...restMetadata,
|
||||
codebasePath,
|
||||
language: chunk.metadata.language || 'unknown',
|
||||
chunkIndex: index
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
// Store to vector database
|
||||
await this.vectorDatabase.insertHybrid(this.getCollectionName(codebasePath), documents);
|
||||
} else {
|
||||
// Create regular vector documents
|
||||
const documents: VectorDocument[] = chunks.map((chunk, index) => {
|
||||
if (!chunk.metadata.filePath) {
|
||||
throw new Error(`Missing filePath in chunk metadata at index ${index}`);
|
||||
}
|
||||
|
||||
const relativePath = path.relative(codebasePath, chunk.metadata.filePath);
|
||||
const fileExtension = path.extname(chunk.metadata.filePath);
|
||||
const { filePath, startLine, endLine, ...restMetadata } = chunk.metadata;
|
||||
|
||||
return {
|
||||
id: this.generateId(relativePath, chunk.metadata.startLine || 0, chunk.metadata.endLine || 0, chunk.content),
|
||||
vector: embeddings[index].vector,
|
||||
content: chunk.content,
|
||||
relativePath,
|
||||
startLine: chunk.metadata.startLine || 0,
|
||||
endLine: chunk.metadata.endLine || 0,
|
||||
fileExtension,
|
||||
metadata: {
|
||||
...restMetadata,
|
||||
codebasePath,
|
||||
language: chunk.metadata.language || 'unknown',
|
||||
chunkIndex: index
|
||||
}
|
||||
};
|
||||
});
|
||||
|
||||
// Store to vector database
|
||||
await this.vectorDatabase.insert(this.getCollectionName(codebasePath), documents);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get programming language based on file extension
|
||||
*/
|
||||
|
||||
@@ -4,6 +4,10 @@ export {
|
||||
SearchOptions,
|
||||
VectorSearchResult,
|
||||
VectorDatabase,
|
||||
HybridSearchRequest,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult,
|
||||
RerankStrategy,
|
||||
COLLECTION_LIMIT_MESSAGE
|
||||
} from './types';
|
||||
|
||||
|
||||
@@ -3,6 +3,9 @@ import {
|
||||
SearchOptions,
|
||||
VectorSearchResult,
|
||||
VectorDatabase,
|
||||
HybridSearchRequest,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult,
|
||||
COLLECTION_LIMIT_MESSAGE
|
||||
} from './types';
|
||||
import { ClusterManager } from './zilliz-utils';
|
||||
@@ -467,4 +470,277 @@ export class MilvusRestfulVectorDatabase implements VectorDatabase {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Create a collection that supports hybrid (dense vector + BM25) search.
 *
 * The schema declares a BM25 function that takes the `content` field as
 * input and writes to the `sparse_vector` field. After the collection is
 * created, indexes are built for both vector fields and the collection is
 * loaded.
 *
 * @param collectionName Name of the collection to create
 * @param dimension Dimension of the dense `vector` field
 * @param description Currently not sent in the create request
 * @throws Rethrows any error raised while creating, indexing or loading
 */
async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void> {
    // Same precondition as insertHybrid / hybridSearch in this class: make
    // sure the client is initialized before issuing any request.
    await this.ensureInitialized();

    try {
        const restfulConfig = this.config as MilvusRestfulConfig;

        const collectionSchema = {
            collectionName,
            dbName: restfulConfig.database,
            schema: {
                enableDynamicField: false,
                functions: [
                    {
                        name: "content_bm25_emb",
                        description: "content bm25 function",
                        type: "BM25",
                        inputFieldNames: ["content"],
                        outputFieldNames: ["sparse_vector"],
                        params: {},
                    },
                ],
                fields: [
                    {
                        fieldName: "id",
                        dataType: "VarChar",
                        isPrimary: true,
                        elementTypeParams: {
                            max_length: 512
                        }
                    },
                    {
                        // Analyzer is enabled on the field the BM25 function reads from
                        fieldName: "content",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 65535,
                            enable_analyzer: true
                        }
                    },
                    {
                        fieldName: "vector",
                        dataType: "FloatVector",
                        elementTypeParams: {
                            dim: dimension
                        }
                    },
                    {
                        // Output field of the BM25 function declared above
                        fieldName: "sparse_vector",
                        dataType: "SparseFloatVector"
                    },
                    {
                        fieldName: "relativePath",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 1024
                        }
                    },
                    {
                        fieldName: "startLine",
                        dataType: "Int64"
                    },
                    {
                        fieldName: "endLine",
                        dataType: "Int64"
                    },
                    {
                        fieldName: "fileExtension",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 32
                        }
                    },
                    {
                        // Metadata is stored as a string (insertHybrid JSON-stringifies it)
                        fieldName: "metadata",
                        dataType: "VarChar",
                        elementTypeParams: {
                            max_length: 65535
                        }
                    }
                ]
            }
        };

        // Step 1: Create collection with schema and functions
        await createCollectionWithLimitCheck(this.makeRequest.bind(this), collectionSchema);

        // Step 2: Create indexes for both vector fields
        await this.createHybridIndexes(collectionName);

        // Step 3: Load collection to memory for searching
        await this.loadCollection(collectionName);

    } catch (error) {
        console.error(`❌ Failed to create hybrid collection '${collectionName}':`, error);
        throw error;
    }
}
|
||||
|
||||
/**
 * Create the two indexes a hybrid collection needs: one on the dense
 * `vector` field and one on the BM25 `sparse_vector` field. The requests
 * are sent one after the other, dense first.
 *
 * @param collectionName Collection to create the indexes on
 * @throws Rethrows any request error after logging it
 */
private async createHybridIndexes(collectionName: string): Promise<void> {
    try {
        const restfulConfig = this.config as MilvusRestfulConfig;

        // One entry per index; order matters (dense index is requested first).
        const indexSpecs = [
            {
                fieldName: "vector",
                indexName: "vector_index",
                metricType: "COSINE",
                index_type: "AUTOINDEX"
            },
            {
                fieldName: "sparse_vector",
                indexName: "sparse_vector_index",
                metricType: "BM25",
                index_type: "SPARSE_INVERTED_INDEX"
            }
        ];

        for (const spec of indexSpecs) {
            const indexRequest = {
                collectionName,
                dbName: restfulConfig.database,
                indexParams: [spec]
            };
            await this.makeRequest('/indexes/create', 'POST', indexRequest);
        }

    } catch (error) {
        console.error(`❌ Failed to create hybrid indexes for collection '${collectionName}':`, error);
        throw error;
    }
}
|
||||
|
||||
/**
 * Insert documents into a hybrid collection through the REST API.
 *
 * Only the dense vector and scalar fields are sent; no `sparse_vector`
 * value is included in the rows. Metadata is serialized to a JSON string.
 *
 * @param collectionName Target collection
 * @param documents Documents to insert
 * @throws Error when the response code is non-zero, or any request error
 */
async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void> {
    await this.ensureInitialized();

    try {
        const restfulConfig = this.config as MilvusRestfulConfig;

        // Flatten each document into the row shape expected by the collection schema.
        const rows = documents.map(document => {
            return {
                id: document.id,
                content: document.content,
                vector: document.vector,
                relativePath: document.relativePath,
                startLine: document.startLine,
                endLine: document.endLine,
                fileExtension: document.fileExtension,
                metadata: JSON.stringify(document.metadata),
            };
        });

        const result = await this.makeRequest('/entities/insert', 'POST', {
            collectionName,
            dbName: restfulConfig.database,
            data: rows
        });

        // A non-zero code in the response body is treated as a failed insert.
        const succeeded = result.code === 0;
        if (!succeeded) {
            throw new Error(`Insert failed: ${result.message || 'Unknown error'}`);
        }

    } catch (error) {
        console.error(`❌ Failed to insert hybrid documents to collection '${collectionName}':`, error);
        throw error;
    }
}
|
||||
|
||||
/**
 * Run a hybrid (dense vector + BM25) search through the REST API.
 *
 * @param collectionName Collection to search
 * @param searchRequests Exactly two requests are used: index 0 is the dense
 *        vector request, index 1 is the sparse (BM25 text) request
 * @param options Optional overall `limit` and `rerank` strategy; when no
 *        rerank strategy is given, RRF with k=100 is used
 * @returns Matching documents with their scores (vectors are not returned)
 * @throws Error when fewer than two requests are supplied, when the response
 *         code is non-zero, or when the request itself fails
 */
async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]> {
    await this.ensureInitialized();

    try {
        // Fail with a clear message instead of a TypeError on undefined access.
        if (!Array.isArray(searchRequests) || searchRequests.length < 2) {
            throw new Error(`Hybrid search requires a dense and a sparse search request, got ${Array.isArray(searchRequests) ? searchRequests.length : 0}`);
        }
        const denseRequest = searchRequests[0];
        const sparseRequest = searchRequests[1];

        const restfulConfig = this.config as MilvusRestfulConfig;

        console.log(`🔍 Preparing hybrid search for collection: ${collectionName}`);

        // Prepare search requests according to Milvus REST API hybrid search specification
        // For dense vector search - data must be array of vectors: [[0.1, 0.2, 0.3, ...]]
        const search_param_1 = {
            data: Array.isArray(denseRequest.data) ? [denseRequest.data] : [[denseRequest.data]],
            annsField: denseRequest.anns_field, // "vector"
            limit: denseRequest.limit,
            outputFields: ["*"],
            searchParams: {
                metricType: "COSINE",
                params: denseRequest.param || { "nprobe": 10 }
            }
        };

        // For sparse vector search - data must be array of queries: ["query text"]
        const search_param_2 = {
            data: Array.isArray(sparseRequest.data) ? sparseRequest.data : [sparseRequest.data],
            annsField: sparseRequest.anns_field, // "sparse_vector"
            limit: sparseRequest.limit,
            outputFields: ["*"],
            searchParams: {
                metricType: "BM25",
                params: sparseRequest.param || { "drop_ratio_search": 0.2 }
            }
        };

        // Honor the caller-supplied rerank strategy; fall back to RRF (k=100),
        // which was previously the hard-coded value.
        const rerank_strategy = options?.rerank ?? {
            strategy: "rrf",
            params: {
                k: 100
            }
        };

        console.log(`🔍 Dense search params:`, JSON.stringify({
            annsField: search_param_1.annsField,
            limit: search_param_1.limit,
            data_length: Array.isArray(search_param_1.data[0]) ? search_param_1.data[0].length : 'N/A',
            searchParams: search_param_1.searchParams
        }, null, 2));
        console.log(`🔍 Sparse search params:`, JSON.stringify({
            annsField: search_param_2.annsField,
            limit: search_param_2.limit,
            query_text: typeof search_param_2.data[0] === 'string' ? search_param_2.data[0].substring(0, 50) + '...' : 'N/A',
            searchParams: search_param_2.searchParams
        }, null, 2));

        const hybridSearchRequest = {
            collectionName,
            dbName: restfulConfig.database,
            search: [search_param_1, search_param_2],
            rerank: rerank_strategy,
            // Overall limit: explicit option first, then the dense request's limit, then 10
            limit: options?.limit || denseRequest.limit || 10,
            outputFields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'],
        };

        console.log(`🔍 Complete REST API request:`, JSON.stringify({
            collectionName: hybridSearchRequest.collectionName,
            dbName: hybridSearchRequest.dbName,
            search_count: hybridSearchRequest.search.length,
            rerank: hybridSearchRequest.rerank,
            limit: hybridSearchRequest.limit,
            outputFields: hybridSearchRequest.outputFields
        }, null, 2));

        console.log(`🔍 Executing REST API hybrid search...`);
        const response = await this.makeRequest('/entities/hybrid_search', 'POST', hybridSearchRequest);

        if (response.code !== 0) {
            throw new Error(`Hybrid search failed: ${response.message || 'Unknown error'}`);
        }

        const results = response.data || [];
        console.log(`✅ Found ${results.length} results from hybrid search`);

        // Transform response to HybridSearchResult format
        return results.map((result: any) => ({
            document: {
                id: result.id,
                content: result.content,
                vector: [], // Vector not returned in search results
                sparse_vector: [], // Vector not returned in search results
                relativePath: result.relativePath,
                startLine: result.startLine,
                endLine: result.endLine,
                fileExtension: result.fileExtension,
                // Metadata is stored as a JSON string (see insertHybrid)
                metadata: JSON.parse(result.metadata || '{}'),
            },
            score: result.score || result.distance || 0,
        }));

    } catch (error) {
        console.error(`❌ Failed to perform hybrid search on collection '${collectionName}':`, error);
        throw error;
    }
}
|
||||
}
|
||||
@@ -1,9 +1,12 @@
|
||||
import { MilvusClient, DataType, MetricType } from '@zilliz/milvus2-sdk-node';
|
||||
import { MilvusClient, DataType, MetricType, FunctionType } from '@zilliz/milvus2-sdk-node';
|
||||
import {
|
||||
VectorDocument,
|
||||
SearchOptions,
|
||||
VectorSearchResult,
|
||||
VectorDatabase,
|
||||
HybridSearchRequest,
|
||||
HybridSearchOptions,
|
||||
HybridSearchResult,
|
||||
COLLECTION_LIMIT_MESSAGE
|
||||
} from './types';
|
||||
import { ClusterManager } from './zilliz-utils';
|
||||
@@ -298,4 +301,231 @@ export class MilvusVectorDatabase implements VectorDatabase {
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Create a hybrid (dense vector + BM25) collection through the Milvus SDK
 * client, build an index on each vector field, load the collection and
 * finally describe it.
 *
 * @param collectionName Name of the collection to create
 * @param dimension Dimension of the dense `vector` field
 * @param description Optional collection description; a default derived
 *        from the collection name is used when omitted
 */
async createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void> {
    await this.ensureInitialized();

    console.log('Beginning hybrid collection creation:', collectionName);
    console.log('Collection dimension:', dimension);

    // Field definitions; `content` feeds the BM25 function, `sparse_vector` receives its output.
    const fields = [
        { name: 'id', description: 'Document ID', data_type: DataType.VarChar, max_length: 512, is_primary_key: true },
        { name: 'content', description: 'Full text content for BM25 and storage', data_type: DataType.VarChar, max_length: 65535, enable_analyzer: true },
        { name: 'vector', description: 'Dense vector embedding', data_type: DataType.FloatVector, dim: dimension },
        { name: 'sparse_vector', description: 'Sparse vector embedding from BM25', data_type: DataType.SparseFloatVector },
        { name: 'relativePath', description: 'Relative path to the codebase', data_type: DataType.VarChar, max_length: 1024 },
        { name: 'startLine', description: 'Start line number of the chunk', data_type: DataType.Int64 },
        { name: 'endLine', description: 'End line number of the chunk', data_type: DataType.Int64 },
        { name: 'fileExtension', description: 'File extension', data_type: DataType.VarChar, max_length: 32 },
        { name: 'metadata', description: 'Additional document metadata as JSON string', data_type: DataType.VarChar, max_length: 65535 },
    ];

    // BM25 function mapping `content` -> `sparse_vector`
    const bm25Functions = [
        {
            name: "content_bm25_emb",
            description: "content bm25 function",
            type: FunctionType.BM25,
            input_field_names: ["content"],
            output_field_names: ["sparse_vector"],
            params: {},
        },
    ];

    await createCollectionWithLimitCheck(this.client!, {
        collection_name: collectionName,
        description: description || `Hybrid code context collection: ${collectionName}`,
        fields: fields,
        functions: bm25Functions,
    });

    // Dense vector index
    await this.client!.createIndex({
        collection_name: collectionName,
        field_name: 'vector',
        index_type: 'AUTOINDEX',
        metric_type: MetricType.COSINE,
    });

    // Sparse (BM25) vector index
    await this.client!.createIndex({
        collection_name: collectionName,
        field_name: 'sparse_vector',
        index_type: 'SPARSE_INVERTED_INDEX',
        metric_type: MetricType.BM25,
    });

    // Load collection to memory
    await this.client!.loadCollection({ collection_name: collectionName });

    // Verify collection is created correctly
    await this.client!.describeCollection({ collection_name: collectionName });
}
|
||||
|
||||
/**
 * Insert documents into a hybrid collection through the Milvus SDK client.
 *
 * Rows carry the dense vector and scalar fields only; no `sparse_vector`
 * value is included. Metadata is serialized to a JSON string.
 *
 * @param collectionName Target collection
 * @param documents Documents to insert
 */
async insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void> {
    await this.ensureInitialized();

    // Flatten each document into the row shape of the hybrid collection schema.
    const rows = documents.map(document => {
        return {
            id: document.id,
            content: document.content,
            vector: document.vector,
            relativePath: document.relativePath,
            startLine: document.startLine,
            endLine: document.endLine,
            fileExtension: document.fileExtension,
            metadata: JSON.stringify(document.metadata),
        };
    });

    await this.client!.insert({ collection_name: collectionName, data: rows });
}
|
||||
|
||||
/**
 * Run a hybrid (dense + sparse) search on a collection and return the reranked hits.
 *
 * @param collectionName Collection name
 * @param searchRequests Search requests; entry 0 is used as the dense request and
 *   entry 1 as the sparse (query text) request. Further entries are ignored.
 * @param options Optional overall result limit and rerank strategy. When no rerank
 *   strategy is supplied, RRF with k = 100 is used.
 * @returns Documents with their scores; an empty array when the client returns no results
 * @throws Error if fewer than two search requests are supplied. Every failure is
 *   logged and rethrown.
 */
async hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]> {
    await this.ensureInitialized();

    try {
        console.log(`🔍 Preparing hybrid search for collection: ${collectionName}`);

        // Both sub-requests are required; fail with a clear message instead of a
        // TypeError caused by dereferencing a missing entry.
        if (!Array.isArray(searchRequests) || searchRequests.length < 2) {
            const received = Array.isArray(searchRequests) ? searchRequests.length : 0;
            throw new Error(`Hybrid search requires a dense and a sparse search request, but received ${received}`);
        }
        const [denseRequest, sparseRequest] = searchRequests;

        // Prepare search requests in the correct Milvus format
        const search_param_1 = {
            data: Array.isArray(denseRequest.data) ? denseRequest.data : [denseRequest.data],
            anns_field: denseRequest.anns_field, // "vector"
            param: denseRequest.param, // e.g. {"nprobe": 10}
            limit: denseRequest.limit
        };

        const search_param_2 = {
            data: sparseRequest.data, // query text for sparse search
            anns_field: sparseRequest.anns_field, // "sparse_vector"
            param: sparseRequest.param, // e.g. {"drop_ratio_search": 0.2}
            limit: sparseRequest.limit
        };

        // Honor the caller's rerank strategy; fall back to RRF (k = 100) by default
        const requestedRerank = options?.rerank;
        const rerank_strategy = requestedRerank
            ? { strategy: requestedRerank.strategy, params: requestedRerank.params ?? {} }
            : { strategy: "rrf", params: { k: 100 } };

        // Report the vector dimension whether the data is a flat vector or a list of vectors
        const firstDenseEntry = search_param_1.data[0];
        const denseDataLength = Array.isArray(firstDenseEntry)
            ? firstDenseEntry.length
            : typeof firstDenseEntry === 'number' ? search_param_1.data.length : 'N/A';

        console.log(`🔍 Dense search params:`, JSON.stringify({
            anns_field: search_param_1.anns_field,
            param: search_param_1.param,
            limit: search_param_1.limit,
            data_length: denseDataLength
        }, null, 2));
        console.log(`🔍 Sparse search params:`, JSON.stringify({
            anns_field: search_param_2.anns_field,
            param: search_param_2.param,
            limit: search_param_2.limit,
            query_text: typeof search_param_2.data === 'string' ? search_param_2.data.substring(0, 50) + '...' : 'N/A'
        }, null, 2));
        console.log(`🔍 Rerank strategy:`, JSON.stringify(rerank_strategy, null, 2));

        // Execute hybrid search using the correct client.search format
        const searchParams = {
            collection_name: collectionName,
            data: [search_param_1, search_param_2],
            limit: options?.limit || denseRequest.limit || 10,
            rerank: rerank_strategy,
            output_fields: ['id', 'content', 'relativePath', 'startLine', 'endLine', 'fileExtension', 'metadata'],
        };

        console.log(`🔍 Complete search request:`, JSON.stringify({
            collection_name: searchParams.collection_name,
            data_count: searchParams.data.length,
            limit: searchParams.limit,
            rerank: searchParams.rerank,
            output_fields: searchParams.output_fields
        }, null, 2));

        const searchResult = await this.client!.search(searchParams);

        console.log(`🔍 Search executed, processing results...`);

        if (!searchResult.results || searchResult.results.length === 0) {
            console.log(`⚠️ No results returned from Milvus search`);
            return [];
        }

        console.log(`✅ Found ${searchResult.results.length} results from hybrid search`);

        // Transform results to HybridSearchResult format. Vector fields are not
        // requested in output_fields, so they are returned empty.
        return searchResult.results.map((result: any) => ({
            document: {
                id: result.id,
                content: result.content,
                vector: [],
                sparse_vector: [],
                relativePath: result.relativePath,
                startLine: result.startLine,
                endLine: result.endLine,
                fileExtension: result.fileExtension,
                metadata: JSON.parse(result.metadata || '{}'),
            },
            score: result.score,
        }));

    } catch (error) {
        console.error(`❌ Failed to perform hybrid search on collection '${collectionName}':`, error);
        throw error;
    }
}
|
||||
}
|
||||
@@ -16,11 +16,34 @@ export interface SearchOptions {
|
||||
threshold?: number;
|
||||
}
|
||||
|
||||
// New interfaces for hybrid search
|
||||
/**
 * One sub-request of a hybrid search, aimed at a single vector field.
 */
export interface HybridSearchRequest {
    /** Query vector or text */
    data: number[] | string;
    /** Vector field name (vector or sparse_vector) */
    anns_field: string;
    /** Search parameters */
    param: Record<string, any>;
    /** Result limit for this sub-request */
    limit: number;
}
|
||||
|
||||
/**
 * Optional settings that apply to a hybrid search as a whole.
 */
export interface HybridSearchOptions {
    /** Reranking strategy for combining the sub-request results */
    rerank?: RerankStrategy;
    /** Result limit for the overall search */
    limit?: number;
}
|
||||
|
||||
/**
 * Describes how hybrid search results are reranked.
 */
export interface RerankStrategy {
    /** Name of the reranking strategy */
    strategy: 'rrf' | 'weighted';
    /** Strategy-specific parameters */
    params?: Record<string, any>;
}
|
||||
|
||||
/**
 * A single vector search hit: the matched document paired with its score.
 */
export interface VectorSearchResult {
    /** The matched document */
    document: VectorDocument;
    /** Score reported for this match */
    score: number;
}
|
||||
|
||||
/**
 * A single hybrid search hit: the matched document paired with its score.
 */
export interface HybridSearchResult {
    /** The matched document */
    document: VectorDocument;
    /** Score reported for this match */
    score: number;
}
|
||||
|
||||
export interface VectorDatabase {
|
||||
/**
|
||||
* Create collection
|
||||
@@ -30,6 +53,14 @@ export interface VectorDatabase {
|
||||
*/
|
||||
createCollection(collectionName: string, dimension: number, description?: string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Create collection with hybrid search support
|
||||
* @param collectionName Collection name
|
||||
* @param dimension Dense vector dimension
|
||||
* @param description Collection description
|
||||
*/
|
||||
createHybridCollection(collectionName: string, dimension: number, description?: string): Promise<void>;
|
||||
|
||||
/**
|
||||
* Drop collection
|
||||
* @param collectionName Collection name
|
||||
@@ -54,6 +85,13 @@ export interface VectorDatabase {
|
||||
*/
|
||||
insert(collectionName: string, documents: VectorDocument[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Insert hybrid vector documents
|
||||
* @param collectionName Collection name
|
||||
* @param documents Document array
|
||||
*/
|
||||
insertHybrid(collectionName: string, documents: VectorDocument[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Search similar vectors
|
||||
* @param collectionName Collection name
|
||||
@@ -62,6 +100,14 @@ export interface VectorDatabase {
|
||||
*/
|
||||
search(collectionName: string, queryVector: number[], options?: SearchOptions): Promise<VectorSearchResult[]>;
|
||||
|
||||
/**
|
||||
* Hybrid search with multiple vector fields
|
||||
* @param collectionName Collection name
|
||||
* @param searchRequests Array of search requests for different fields
|
||||
* @param options Hybrid search options including reranking
|
||||
*/
|
||||
hybridSearch(collectionName: string, searchRequests: HybridSearchRequest[], options?: HybridSearchOptions): Promise<HybridSearchResult[]>;
|
||||
|
||||
/**
|
||||
* Delete documents
|
||||
* @param collectionName Collection name
|
||||
@@ -70,11 +116,11 @@ export interface VectorDatabase {
|
||||
delete(collectionName: string, ids: string[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Query documents by filter
|
||||
* Query documents with filter conditions
|
||||
* @param collectionName Collection name
|
||||
* @param filter Filter expression string
|
||||
* @param filter Filter expression
|
||||
* @param outputFields Fields to return
|
||||
* @param limit Maximum number of results to return (optional)
|
||||
* @param limit Maximum number of results
|
||||
*/
|
||||
query(collectionName: string, filter: string, outputFields: string[], limit?: number): Promise<Record<string, any>[]>;
|
||||
}
|
||||
|
||||
@@ -61,8 +61,8 @@ export class ToolHandlers {
|
||||
// Check each collection for codebase path
|
||||
for (const collectionName of collections) {
|
||||
try {
|
||||
// Skip collections that don't match the code_chunks pattern
|
||||
if (!collectionName.startsWith('code_chunks_')) {
|
||||
// Skip collections that don't match the code_chunks pattern (support both legacy and new collections)
|
||||
if (!collectionName.startsWith('code_chunks_') && !collectionName.startsWith('hybrid_code_chunks_')) {
|
||||
console.log(`[SYNC-CLOUD] ⏭️ Skipping non-code collection: ${collectionName}`);
|
||||
continue;
|
||||
}
|
||||
@@ -218,38 +218,19 @@ export class ToolHandlers {
|
||||
|
||||
// CRITICAL: Pre-index collection creation validation
|
||||
try {
|
||||
const normalizedPath = path.resolve(absolutePath);
|
||||
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
|
||||
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
|
||||
console.log(`[INDEX-VALIDATION] 🔍 Validating collection creation capability`);
|
||||
|
||||
console.log(`[INDEX-VALIDATION] 🔍 Validating collection creation for: ${collectionName}`);
|
||||
|
||||
// Get embedding dimension for collection creation
|
||||
const embeddingProvider = this.context['embedding'];
|
||||
const dimension = embeddingProvider.getDimension();
|
||||
|
||||
// If force reindex, clear existing collection first
|
||||
if (forceReindex) {
|
||||
console.log(`[INDEX-VALIDATION] 🧹 Force reindex enabled, clearing existing collection: ${collectionName}`);
|
||||
try {
|
||||
await this.context['vectorDatabase'].dropCollection(collectionName);
|
||||
console.log(`[INDEX-VALIDATION] ✅ Existing collection cleared: ${collectionName}`);
|
||||
} catch (dropError: any) {
|
||||
// Collection might not exist, which is fine
|
||||
console.log(`[INDEX-VALIDATION] ℹ️ Collection ${collectionName} does not exist or already cleared`);
|
||||
}
|
||||
// Check if collection can be created (this will be handled entirely by context.ts)
|
||||
const hasExistingIndex = await this.context.hasIndex(absolutePath);
|
||||
if (hasExistingIndex && forceReindex) {
|
||||
console.log(`[INDEX-VALIDATION] ℹ️ Force reindex enabled, existing index will be cleared`);
|
||||
await this.context.clearIndex(absolutePath);
|
||||
console.log(`[INDEX-VALIDATION] ✅ Existing index cleared for re-indexing`);
|
||||
} else if (hasExistingIndex) {
|
||||
console.log(`[INDEX-VALIDATION] ℹ️ Index already exists for this codebase`);
|
||||
}
|
||||
|
||||
// Attempt to create collection - this will throw COLLECTION_LIMIT_MESSAGE if limit reached
|
||||
await this.context['vectorDatabase'].createCollection(
|
||||
collectionName,
|
||||
dimension,
|
||||
`Claude Context collection: ${collectionName}`
|
||||
);
|
||||
|
||||
// If creation succeeds, immediately drop the test collection
|
||||
await this.context['vectorDatabase'].dropCollection(collectionName);
|
||||
console.log(`[INDEX-VALIDATION] ✅ Collection creation validated successfully`);
|
||||
console.log(`[INDEX-VALIDATION] ✅ Collection creation validation completed`);
|
||||
|
||||
} catch (validationError: any) {
|
||||
const errorMessage = typeof validationError === 'string' ? validationError :
|
||||
@@ -352,14 +333,9 @@ export class ToolHandlers {
|
||||
console.warn(`[BACKGROUND-INDEX] Non-AST splitter '${splitterType}' requested; falling back to AST splitter`);
|
||||
}
|
||||
|
||||
// Generate collection name
|
||||
const normalizedPath = path.resolve(absolutePath);
|
||||
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
|
||||
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
|
||||
|
||||
// Load ignore patterns from files first (including .ignore, .gitignore, etc.)
|
||||
await this.context['loadGitignorePatterns'](absolutePath);
|
||||
|
||||
|
||||
// Initialize file synchronizer with proper ignore patterns (including project-specific patterns)
|
||||
const { FileSynchronizer } = await import("@zilliz/claude-context-core");
|
||||
const ignorePatterns = this.context['ignorePatterns'] || [];
|
||||
@@ -367,7 +343,9 @@ export class ToolHandlers {
|
||||
const synchronizer = new FileSynchronizer(absolutePath, ignorePatterns);
|
||||
await synchronizer.initialize();
|
||||
|
||||
// Store synchronizer in the context's internal map
|
||||
// Store synchronizer in the context (let context manage collection names)
|
||||
await this.context['prepareCollection'](absolutePath);
|
||||
const collectionName = this.context['getCollectionName'](absolutePath);
|
||||
this.context['synchronizers'].set(collectionName, synchronizer);
|
||||
if (contextForThisTask !== this.context) {
|
||||
contextForThisTask['synchronizers'].set(collectionName, synchronizer);
|
||||
@@ -471,7 +449,7 @@ export class ToolHandlers {
|
||||
|
||||
// Log embedding provider information before search
|
||||
const embeddingProvider = this.context['embedding'];
|
||||
console.log(`[SEARCH] 🧠 Using embedding provider: ${embeddingProvider.getProvider()} for semantic search`);
|
||||
console.log(`[SEARCH] 🧠 Using embedding provider: ${embeddingProvider.getProvider()} for search`);
|
||||
console.log(`[SEARCH] 🔍 Generating embeddings for query using ${embeddingProvider.getProvider()}...`);
|
||||
|
||||
// Search in the specified codebase
|
||||
@@ -505,7 +483,7 @@ export class ToolHandlers {
|
||||
|
||||
return `${index + 1}. Code snippet (${result.language}) [${codebaseInfo}]\n` +
|
||||
` Location: ${location}\n` +
|
||||
` Score: ${result.score.toFixed(3)}\n` +
|
||||
` Rank: ${index + 1}\n` +
|
||||
` Context: \n\`\`\`${result.language}\n${context}\n\`\`\`\n`;
|
||||
}).join('\n');
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
import * as fs from "fs";
|
||||
import { Context } from "@zilliz/claude-context-core";
|
||||
import { Context, FileSynchronizer } from "@zilliz/claude-context-core";
|
||||
import { SnapshotManager } from "./snapshot.js";
|
||||
|
||||
export class SyncManager {
|
||||
@@ -79,6 +79,11 @@ export class SyncManager {
|
||||
console.error(`[SYNC-DEBUG] Error syncing codebase '${codebasePath}' after ${codebaseElapsed}ms:`, error);
|
||||
console.error(`[SYNC-DEBUG] Error stack:`, error.stack);
|
||||
|
||||
if (error.message.includes('Failed to query Milvus')) {
|
||||
// Collection maybe deleted manually, delete the snapshot file
|
||||
await FileSynchronizer.deleteSnapshot(codebasePath);
|
||||
}
|
||||
|
||||
// Log additional error details
|
||||
if (error.code) {
|
||||
console.error(`[SYNC-DEBUG] Error code: ${error.code}`);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import * as vscode from 'vscode';
|
||||
import { Context } from '@zilliz/claude-context-core';
|
||||
import * as path from 'path';
|
||||
import * as crypto from 'crypto';
|
||||
|
||||
export class IndexCommand {
|
||||
private context: Context;
|
||||
@@ -78,10 +77,9 @@ export class IndexCommand {
|
||||
const { FileSynchronizer } = await import("@zilliz/claude-context-core");
|
||||
const synchronizer = new FileSynchronizer(selectedFolder.uri.fsPath, this.context['ignorePatterns'] || []);
|
||||
await synchronizer.initialize();
|
||||
// Store synchronizer in the context's internal map using the same collection name generation logic
|
||||
const normalizedPath = path.resolve(selectedFolder.uri.fsPath);
|
||||
const hash = crypto.createHash('md5').update(normalizedPath).digest('hex');
|
||||
const collectionName = `code_chunks_${hash.substring(0, 8)}`;
|
||||
// Store synchronizer in the context's internal map using the collection name from context
|
||||
await this.context['prepareCollection'](selectedFolder.uri.fsPath);
|
||||
const collectionName = this.context['getCollectionName'](selectedFolder.uri.fsPath);
|
||||
this.context['synchronizers'].set(collectionName, synchronizer);
|
||||
|
||||
// Start indexing with progress callback
|
||||
|
||||
@@ -52,13 +52,25 @@ export class SearchCommand {
|
||||
}
|
||||
const codebasePath = workspaceFolders[0].uri.fsPath;
|
||||
|
||||
// Use the new semantic search service
|
||||
// Check if index exists
|
||||
progress.report({ increment: 20, message: 'Checking index...' });
|
||||
const hasIndex = await this.context.hasIndex(codebasePath);
|
||||
|
||||
if (!hasIndex) {
|
||||
vscode.window.showErrorMessage('Index not found. Please index the codebase first.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Use semantic search
|
||||
const query: SearchQuery = {
|
||||
term: searchTerm,
|
||||
includeContent: true,
|
||||
limit: 20
|
||||
};
|
||||
|
||||
console.log('🔍 Using semantic search...');
|
||||
progress.report({ increment: 50, message: 'Executing semantic search...' });
|
||||
|
||||
const results = await this.context.semanticSearch(
|
||||
codebasePath,
|
||||
query.term,
|
||||
@@ -66,7 +78,7 @@ export class SearchCommand {
|
||||
0.3 // similarity threshold
|
||||
);
|
||||
|
||||
progress.report({ increment: 100, message: 'Semantic search complete!' });
|
||||
progress.report({ increment: 100, message: 'Search complete!' });
|
||||
|
||||
if (results.length === 0) {
|
||||
vscode.window.showInformationMessage(`No results found for "${searchTerm}"`);
|
||||
@@ -77,7 +89,7 @@ export class SearchCommand {
|
||||
const quickPickItems = this.generateQuickPickItems(results, searchTerm, codebasePath);
|
||||
|
||||
const selected = await vscode.window.showQuickPick(quickPickItems, {
|
||||
placeHolder: `Found ${results.length} results for "${searchTerm}"`,
|
||||
placeHolder: `Found ${results.length} results for "${searchTerm}" using semantic search`,
|
||||
matchOnDescription: true,
|
||||
matchOnDetail: true
|
||||
});
|
||||
@@ -88,8 +100,8 @@ export class SearchCommand {
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
console.error('Semantic search failed:', error);
|
||||
vscode.window.showErrorMessage(`Semantic search failed: ${error}`);
|
||||
console.error('Search failed:', error);
|
||||
vscode.window.showErrorMessage(`Search failed: ${error}. Please ensure the codebase is indexed.`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +147,13 @@ export class SearchCommand {
|
||||
}
|
||||
const codebasePath = workspaceFolders[0].uri.fsPath;
|
||||
|
||||
// Use the semantic search service
|
||||
// Check if index exists
|
||||
const hasIndex = await this.context.hasIndex(codebasePath);
|
||||
if (!hasIndex) {
|
||||
throw new Error('Index not found. Please index the codebase first.');
|
||||
}
|
||||
|
||||
console.log('🔍 Using semantic search for webview...');
|
||||
return await this.context.semanticSearch(
|
||||
codebasePath,
|
||||
searchTerm,
|
||||
@@ -148,23 +166,31 @@ export class SearchCommand {
|
||||
* Check if index exists for the given codebase path
|
||||
*/
|
||||
async hasIndex(codebasePath: string): Promise<boolean> {
|
||||
return await this.context.hasIndex(codebasePath);
|
||||
try {
|
||||
return await this.context.hasIndex(codebasePath);
|
||||
} catch (error) {
|
||||
console.error('Error checking index existence:', error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate quick pick items for VS Code
|
||||
*/
|
||||
private generateQuickPickItems(results: SemanticSearchResult[], searchTerm: string, workspaceRoot?: string) {
|
||||
return results.slice(0, 20).map(result => {
|
||||
return results.slice(0, 20).map((result, index) => {
|
||||
let displayPath = result.relativePath;
|
||||
// Truncate content for display
|
||||
const truncatedContent = result.content.length <= 150
|
||||
? result.content
|
||||
: result.content.substring(0, 150) + '...';
|
||||
|
||||
// Add rank info to description
|
||||
const rankText = ` (rank: ${index + 1})`;
|
||||
|
||||
return {
|
||||
label: `$(file-code) ${displayPath}`,
|
||||
description: `1 match in ${displayPath}`,
|
||||
description: `$(search) semantic search${rankText}`,
|
||||
detail: truncatedContent,
|
||||
result: result
|
||||
};
|
||||
|
||||
@@ -197,7 +197,7 @@ class SemanticSearchController {
|
||||
this.resultsList.innerHTML = '<div class="no-results">No matches found</div>';
|
||||
} else {
|
||||
this.resultsHeader.textContent = `${results.length} result${results.length === 1 ? '' : 's'} for "${query}"`;
|
||||
this.resultsList.innerHTML = results.map(result => this.createResultHTML(result)).join('');
|
||||
this.resultsList.innerHTML = results.map((result, index) => this.createResultHTML(result, index + 1)).join('');
|
||||
}
|
||||
this.resultsContainer.style.display = 'block';
|
||||
}
|
||||
@@ -205,9 +205,10 @@ class SemanticSearchController {
|
||||
/**
|
||||
* Create HTML for a single result item
|
||||
* @param {Object} result - Result object
|
||||
* @param {number} rank - Result rank (1-indexed)
|
||||
* @returns {string} HTML string
|
||||
*/
|
||||
createResultHTML(result) {
|
||||
createResultHTML(result, rank) {
|
||||
return `
|
||||
<div class="result-item" onclick="searchController.openFile('${result.relativePath}', ${result.line}, ${result.startLine}, ${result.endLine})">
|
||||
<div class="result-file">
|
||||
@@ -216,7 +217,7 @@ class SemanticSearchController {
|
||||
</div>
|
||||
<div class="result-preview">${result.preview}</div>
|
||||
<div class="result-context">${result.context}</div>
|
||||
${result.score ? `<div class="result-score" style="margin-top: 8px; text-align: right;">Similarity: ${(result.score * 100).toFixed(1)}%</div>` : ''}
|
||||
<div class="result-rank" style="margin-top: 8px; text-align: right;">Rank: ${rank}</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
@@ -184,7 +184,7 @@ body {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.result-score {
|
||||
.result-rank {
|
||||
font-size: 10px;
|
||||
color: var(--vscode-descriptionForeground);
|
||||
background-color: var(--vscode-badge-background);
|
||||
|
||||
Reference in New Issue
Block a user