[Refactor]: Refactor the file synchronizer to use Merkle DAG

Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com>

Author: ShawnZheng
Date: 2025-07-24 10:31:59 +08:00
Committed by: Cheney Zhang
Parent: 51b7b7cb09
Commit: 7d0d9dae4d
6 changed files with 152 additions and 106 deletions

View File

@@ -351,7 +351,10 @@ export class CodeContext {
updateIgnorePatterns(ignorePatterns: string[]): void {
// Merge with default patterns, avoiding duplicates
const mergedPatterns = [...DEFAULT_IGNORE_PATTERNS, ...ignorePatterns];
-this.ignorePatterns = [...new Set(mergedPatterns)]; // Remove duplicates
+const uniquePatterns: string[] = [];
+const patternSet = new Set(mergedPatterns);
+patternSet.forEach(pattern => uniquePatterns.push(pattern));
+this.ignorePatterns = uniquePatterns;
console.log(`🚫 Updated ignore patterns: ${ignorePatterns.length} from .gitignore + ${DEFAULT_IGNORE_PATTERNS.length} default = ${this.ignorePatterns.length} total patterns`);
}
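The rewritten block dedupes through an intermediate Set plus forEach instead of spreading the Set directly, presumably to avoid iterator downleveling on older compile targets (the same commit turns on downlevelIteration in both tsconfig files below). A minimal equivalent that likewise avoids spreading a Set, shown only as an illustrative sketch, would be:

    // Array.from() is a plain runtime call, so no Set spread (and no downlevelIteration
    // requirement) is involved; behavior matches the forEach-based version above.
    this.ignorePatterns = Array.from(new Set([...DEFAULT_IGNORE_PATTERNS, ...ignorePatterns]));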

View File

@@ -31,7 +31,7 @@ export class AstCodeSplitter implements Splitter {
if (chunkSize) this.chunkSize = chunkSize;
if (chunkOverlap) this.chunkOverlap = chunkOverlap;
this.parser = new Parser();
// Initialize fallback splitter
const { LangChainCodeSplitter } = require('./langchain-splitter');
this.langchainFallback = new LangChainCodeSplitter(chunkSize, chunkOverlap);
@@ -47,7 +47,7 @@ export class AstCodeSplitter implements Splitter {
try {
console.log(`🌳 Using AST splitter for ${language} file: ${filePath || 'unknown'}`);
this.parser.setLanguage(langConfig.parser);
const tree = this.parser.parse(code);
@@ -58,7 +58,7 @@ export class AstCodeSplitter implements Splitter {
// Extract chunks based on AST nodes
const chunks = this.extractChunks(tree.rootNode, code, langConfig.nodeTypes, language, filePath);
// If chunks are too large, split them further
const refinedChunks = await this.refineChunks(chunks, code);
@@ -100,8 +100,8 @@ export class AstCodeSplitter implements Splitter {
}
private extractChunks(
node: Parser.SyntaxNode,
code: string,
splittableTypes: string[],
language: string,
filePath?: string
@@ -255,7 +255,7 @@ export class AstCodeSplitter implements Splitter {
*/
static isLanguageSupported(language: string): boolean {
const supportedLanguages = [
'javascript', 'js', 'typescript', 'ts', 'python', 'py',
'java', 'cpp', 'c++', 'c', 'go', 'rust', 'rs'
];
return supportedLanguages.includes(language.toLowerCase());

View File

@@ -1,93 +1,99 @@
import * as crypto from 'crypto';
-export class MerkleNode {
+export interface MerkleDAGNode {
+id: string;
hash: string;
-left: MerkleNode | null;
-right: MerkleNode | null;
-constructor(hash: string, left: MerkleNode | null = null, right: MerkleNode | null = null) {
-this.hash = hash;
-this.left = left;
-this.right = right;
-}
-static serializeNode(node: MerkleNode | null): any {
-if (!node) return null;
-return {
-hash: node.hash,
-left: MerkleNode.serializeNode(node.left),
-right: MerkleNode.serializeNode(node.right)
-};
-}
-static deserializeNode(data: any): MerkleNode | null {
-if (!data) return null;
-return new MerkleNode(
-data.hash,
-MerkleNode.deserializeNode(data.left),
-MerkleNode.deserializeNode(data.right)
-);
-}
+data: string;
+parents: string[];
+children: string[];
}
-export class MerkleTree {
-root: MerkleNode;
-leaves: MerkleNode[];
+export class MerkleDAG {
+nodes: Map<string, MerkleDAGNode>;
+rootIds: string[];
-constructor(data: string[]) {
-const leaves = data.map(d => new MerkleNode(this.hash(d)));
-this.leaves = leaves;
-this.root = this.buildTree(leaves);
+constructor() {
+this.nodes = new Map();
+this.rootIds = [];
}
private hash(data: string): string {
return crypto.createHash('sha256').update(data).digest('hex');
}
-private buildTree(nodes: MerkleNode[]): MerkleNode {
-if (nodes.length === 0) {
-return new MerkleNode(this.hash(''));
-}
-if (nodes.length === 1) {
-return nodes[0];
+public addNode(data: string, parentId?: string): string {
+const nodeId = this.hash(data);
+const node: MerkleDAGNode = {
+id: nodeId,
+hash: nodeId,
+data,
+parents: [],
+children: []
+};
+// If there's a parent, create the relationship
+if (parentId) {
+const parentNode = this.nodes.get(parentId);
+if (parentNode) {
+node.parents.push(parentId);
+parentNode.children.push(nodeId);
+this.nodes.set(parentId, parentNode);
+}
+} else {
+// If no parent, it's a root node
+this.rootIds.push(nodeId);
}
-const parents: MerkleNode[] = [];
-for (let i = 0; i < nodes.length; i += 2) {
-const left = nodes[i];
-const right = (i + 1 < nodes.length) ? nodes[i + 1] : left;
-const parentHash = this.hash(left.hash + right.hash);
-parents.push(new MerkleNode(parentHash, left, right));
-}
-return this.buildTree(parents);
+this.nodes.set(nodeId, node);
+return nodeId;
}
-public getRootHash(): string {
-return this.root.hash;
+public getNode(nodeId: string): MerkleDAGNode | undefined {
+return this.nodes.get(nodeId);
}
-public static compare(tree1: MerkleTree, tree2: MerkleTree): { added: string[], removed: string[], modified: string[] } {
-const C1 = new Map(tree1.leaves.map(l => [l.hash, l]));
-const C2 = new Map(tree2.leaves.map(l => [l.hash, l]));
+public getAllNodes(): MerkleDAGNode[] {
+return Array.from(this.nodes.values());
}
-const added = Array.from(C2.keys()).filter(k => !C1.has(k));
-const removed = Array.from(C1.keys()).filter(k => !C2.has(k));
-return { added, removed, modified: [] };
+public getRootNodes(): MerkleDAGNode[] {
+return this.rootIds.map(id => this.nodes.get(id)!).filter(Boolean);
}
+public getLeafNodes(): MerkleDAGNode[] {
+return Array.from(this.nodes.values()).filter(node => node.children.length === 0);
}
public serialize(): any {
return {
-root: MerkleNode.serializeNode(this.root),
-leaves: this.leaves.map(l => MerkleNode.serializeNode(l))
+nodes: Array.from(this.nodes.entries()),
+rootIds: this.rootIds
};
}
-static deserialize(data: any): MerkleTree {
-const tree = Object.create(MerkleTree.prototype);
-tree.root = MerkleNode.deserializeNode(data.root);
-tree.leaves = (data.leaves || []).map((l: any) => MerkleNode.deserializeNode(l));
-return tree;
+public static deserialize(data: any): MerkleDAG {
+const dag = new MerkleDAG();
+dag.nodes = new Map(data.nodes);
+dag.rootIds = data.rootIds;
+return dag;
}
+public static compare(dag1: MerkleDAG, dag2: MerkleDAG): { added: string[], removed: string[], modified: string[] } {
+const nodes1 = new Map(Array.from(dag1.getAllNodes()).map(n => [n.id, n]));
+const nodes2 = new Map(Array.from(dag2.getAllNodes()).map(n => [n.id, n]));
+const added = Array.from(nodes2.keys()).filter(k => !nodes1.has(k));
+const removed = Array.from(nodes1.keys()).filter(k => !nodes2.has(k));
+// For modified, we'll check if the data has changed for nodes that exist in both
+const modified: string[] = [];
+for (const [id, node1] of Array.from(nodes1.entries())) {
+const node2 = nodes2.get(id);
+if (node2 && node1.data !== node2.data) {
+modified.push(id);
+}
+}
+return { added, removed, modified };
+}
}
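For orientation, a brief usage sketch of the MerkleDAG defined above (paths and hash strings are made up for illustration):

    import { MerkleDAG } from './merkle';

    const before = new MerkleDAG();
    const beforeRoot = before.addNode('root:aaa');      // no parent, so it becomes a root node
    before.addNode('src/a.ts:hash-a', beforeRoot);      // file nodes hang off the root
    before.addNode('src/b.ts:hash-b', beforeRoot);

    const after = new MerkleDAG();
    const afterRoot = after.addNode('root:aab');
    after.addNode('src/a.ts:hash-a', afterRoot);
    after.addNode('src/b.ts:hash-c', afterRoot);        // b.ts changed

    const diff = MerkleDAG.compare(before, after);
    // Node ids are sha256 hashes of the node data, so a changed file shows up as one added
    // and one removed id; `modified` can only trigger for nodes that share an id but carry
    // different data, which the content-addressed ids make practically impossible.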

View File

@@ -1,12 +1,12 @@
import * as fs from 'fs/promises';
import * as path from 'path';
import * as crypto from 'crypto';
-import { MerkleTree } from './merkle';
+import { MerkleDAG } from './merkle';
import * as os from 'os';
export class FileSynchronizer {
private fileHashes: Map<string, string>;
-private merkleTree: MerkleTree;
+private merkleDAG: MerkleDAG;
private rootDir: string;
private snapshotPath: string;
private ignorePatterns: string[];
@@ -15,7 +15,7 @@ export class FileSynchronizer {
this.rootDir = rootDir;
this.snapshotPath = this.getSnapshotPath(rootDir);
this.fileHashes = new Map();
-this.merkleTree = new MerkleTree([]);
+this.merkleDAG = new MerkleDAG();
this.ignorePatterns = ignorePatterns;
}
@@ -72,7 +72,9 @@ export class FileSynchronizer {
// Verify it's really a directory and not ignored
if (!this.shouldIgnore(relativePath, true)) {
const subHashes = await this.generateFileHashes(fullPath);
-for (const [p, h] of subHashes) {
+const entries = Array.from(subHashes.entries());
+for (let i = 0; i < entries.length; i++) {
+const [p, h] = entries[i];
fileHashes.set(p, h);
}
}
@@ -184,16 +186,32 @@ export class FileSynchronizer {
return regex.test(text);
}
-private buildMerkleTree(fileHashes: Map<string, string>): MerkleTree {
-const sortedPaths = Array.from(fileHashes.keys()).sort();
-const data = sortedPaths.map(p => p + fileHashes.get(p));
-return new MerkleTree(data);
+private buildMerkleDAG(fileHashes: Map<string, string>): MerkleDAG {
+const dag = new MerkleDAG();
+const keys = Array.from(fileHashes.keys());
+const sortedPaths = keys.slice().sort(); // Create a sorted copy
+// Create a root node for the entire directory
+let valuesString = "";
+keys.forEach(key => {
+valuesString += fileHashes.get(key);
+});
+const rootNodeData = "root:" + valuesString;
+const rootNodeId = dag.addNode(rootNodeData);
+// Add each file as a child of the root
+for (const path of sortedPaths) {
+const fileData = path + ":" + fileHashes.get(path);
+dag.addNode(fileData, rootNodeId);
+}
+return dag;
}
public async initialize() {
console.log(`Initializing file synchronizer for ${this.rootDir}`);
await this.loadSnapshot();
-this.merkleTree = this.buildMerkleTree(this.fileHashes);
+this.merkleDAG = this.buildMerkleDAG(this.fileHashes);
console.log(`File synchronizer initialized. Loaded ${this.fileHashes.size} file hashes.`);
}
@@ -201,27 +219,26 @@ export class FileSynchronizer {
console.log('Checking for file changes...');
const newFileHashes = await this.generateFileHashes(this.rootDir);
-const newMerkleTree = this.buildMerkleTree(newFileHashes);
+const newMerkleDAG = this.buildMerkleDAG(newFileHashes);
-if (this.merkleTree.getRootHash() === newMerkleTree.getRootHash()) {
-console.log('No changes detected based on Merkle root hash.');
-return { added: [], removed: [], modified: [] };
-}
-console.log('Merkle root hash has changed. Comparing file states...');
-const changes = this.compareStates(this.fileHashes, newFileHashes);
-this.fileHashes = newFileHashes;
-this.merkleTree = newMerkleTree;
-await this.saveSnapshot();
+// Compare the DAGs
+const changes = MerkleDAG.compare(this.merkleDAG, newMerkleDAG);
+// If there are any changes in the DAG, we should also do a file-level comparison
if (changes.added.length > 0 || changes.removed.length > 0 || changes.modified.length > 0) {
-console.log(`Found changes: ${changes.added.length} added, ${changes.removed.length} removed, ${changes.modified.length} modified.`);
-} else {
-console.log('No file-level changes detected after detailed comparison.');
+console.log('Merkle DAG has changed. Comparing file states...');
+const fileChanges = this.compareStates(this.fileHashes, newFileHashes);
+this.fileHashes = newFileHashes;
+this.merkleDAG = newMerkleDAG;
+await this.saveSnapshot();
+console.log(`Found changes: ${fileChanges.added.length} added, ${fileChanges.removed.length} removed, ${fileChanges.modified.length} modified.`);
+return fileChanges;
}
-return changes;
+console.log('No changes detected based on Merkle DAG comparison.');
+return { added: [], removed: [], modified: [] };
}
private compareStates(oldHashes: Map<string, string>, newHashes: Map<string, string>): { added: string[], removed: string[], modified: string[] } {
@@ -229,7 +246,9 @@ export class FileSynchronizer {
const removed: string[] = [];
const modified: string[] = [];
-for (const [file, hash] of newHashes.entries()) {
+const newEntries = Array.from(newHashes.entries());
+for (let i = 0; i < newEntries.length; i++) {
+const [file, hash] = newEntries[i];
if (!oldHashes.has(file)) {
added.push(file);
} else if (oldHashes.get(file) !== hash) {
@@ -237,7 +256,9 @@ export class FileSynchronizer {
}
}
-for (const file of oldHashes.keys()) {
+const oldKeys = Array.from(oldHashes.keys());
+for (let i = 0; i < oldKeys.length; i++) {
+const file = oldKeys[i];
if (!newHashes.has(file)) {
removed.push(file);
}
@@ -253,9 +274,17 @@ export class FileSynchronizer {
private async saveSnapshot(): Promise<void> {
const merkleDir = path.dirname(this.snapshotPath);
await fs.mkdir(merkleDir, { recursive: true });
+// Convert Map to array without using iterator
+const fileHashesArray: [string, string][] = [];
+const keys = Array.from(this.fileHashes.keys());
+keys.forEach(key => {
+fileHashesArray.push([key, this.fileHashes.get(key)!]);
+});
const data = JSON.stringify({
-fileHashes: Array.from(this.fileHashes.entries()),
-merkleTree: this.merkleTree.serialize()
+fileHashes: fileHashesArray,
+merkleDAG: this.merkleDAG.serialize()
});
await fs.writeFile(this.snapshotPath, data, 'utf-8');
console.log(`Saved snapshot to ${this.snapshotPath}`);
@@ -265,16 +294,22 @@ export class FileSynchronizer {
try {
const data = await fs.readFile(this.snapshotPath, 'utf-8');
const obj = JSON.parse(data);
-this.fileHashes = new Map(obj.fileHashes);
-if (obj.merkleTree) {
-this.merkleTree = MerkleTree.deserialize(obj.merkleTree);
+// Reconstruct Map without using constructor with iterator
+this.fileHashes = new Map();
+for (const [key, value] of obj.fileHashes) {
+this.fileHashes.set(key, value);
+}
+if (obj.merkleDAG) {
+this.merkleDAG = MerkleDAG.deserialize(obj.merkleDAG);
}
console.log(`Loaded snapshot from ${this.snapshotPath}`);
} catch (error: any) {
if (error.code === 'ENOENT') {
console.log(`Snapshot file not found at ${this.snapshotPath}. Generating new one.`);
this.fileHashes = await this.generateFileHashes(this.rootDir);
-this.merkleTree = this.buildMerkleTree(this.fileHashes);
+this.merkleDAG = this.buildMerkleDAG(this.fileHashes);
await this.saveSnapshot();
} else {
throw error;
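Putting the pieces together, the synchronizer would be driven roughly as follows. This is only a sketch: the constructor signature is inferred from the fields assigned in the diff above, the module path is assumed, and the name of the public change-detection method (checkForChanges() here) is not visible in this diff.

    import { FileSynchronizer } from './file-synchronizer'; // path assumed

    async function syncOnce(rootDir: string) {
        const synchronizer = new FileSynchronizer(rootDir, ['node_modules/**', 'dist/**']);
        await synchronizer.initialize(); // load (or create) the snapshot and build the Merkle DAG

        // Hypothetical entry point wrapping the change-detection logic shown above: rebuild the
        // DAG, compare it against the stored one, and only fall back to the per-file hash
        // comparison (and persist a new snapshot) when the DAG comparison reports differences.
        const { added, removed, modified } = await synchronizer.checkForChanges();
        return { added, removed, modified };
    }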

View File

@@ -16,7 +16,8 @@
"forceConsistentCasingInFileNames": true,
"moduleResolution": "node",
"composite": true,
"allowSyntheticDefaultImports": true
"allowSyntheticDefaultImports": true,
"downlevelIteration": true
},
"include": [
"src/**/*"

View File

@@ -17,6 +17,7 @@
"resolveJsonModule": true,
"allowSyntheticDefaultImports": true,
"baseUrl": ".",
"downlevelIteration": true,
"paths": {
"@zilliz/code-context-core": [
"./packages/core/src"