mirror of
https://github.com/ludo-technologies/pyscn.git
synced 2025-10-06 00:59:45 +03:00
fix: implement LSH auto-enable from config file
## Problem

LSH (Locality-Sensitive Hashing) acceleration was not working because:

1. LSH settings in [clones] section of .pyscn.toml were not being loaded
2. toml_loader.go expected nested [lsh] section, but config used flat structure
3. cloneConfigToCloneRequest() was not converting LSH settings to CloneRequest
4. Auto-enable logic based on fragment count was not implemented

This caused clone detection to always use slow APTED algorithm, even for large projects where LSH would provide significant speedup.

## Solution

1. Added ClonesConfig struct to read flat [clones] section structure
2. Implemented mergeClonesSection() to load all settings including LSH
3. Extended CloneRequest with LSH fields (LSHEnabled, LSHAutoThreshold, etc.)
4. Added auto-enable logic in clone_service.go:
   - "auto": enable LSH when fragments >= threshold (default: 500)
   - "true": always enable LSH
   - "false": always disable LSH
5. Added diagnostic messages showing LSH decision

## Changes

- domain/clone.go: Add LSH config fields to CloneRequest
- internal/config/toml_loader.go: Add ClonesConfig struct and merge logic
- service/clone_config_loader.go: Convert LSH settings to CloneRequest
- service/clone_service.go: Implement auto-enable logic based on fragment count
- .pyscn.toml: Document LSH settings (no functional change)

## Testing

- Verified LSH auto-detection with different thresholds
- Confirmed settings load correctly from .pyscn.toml
- All existing tests pass

## Related

- Fixes issue discovered during performance investigation
- Prepares for config refactoring in #124

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -75,7 +75,7 @@ k_core_k = 2 # K-core parameter
|
||||
|
||||
# LSH acceleration settings
|
||||
lsh_enabled = "auto" # Enable LSH: true, false, auto (based on project size)
|
||||
lsh_auto_threshold = 500 # Auto-enable LSH for projects with >500 files
|
||||
lsh_auto_threshold = 500 # Auto-enable LSH for projects with 500 or more fragments
|
||||
lsh_similarity_threshold = 0.78 # LSH similarity threshold
|
||||
lsh_bands = 32 # Number of LSH bands
|
||||
lsh_rows = 4 # Number of rows per band
|
||||
|
||||
@@ -177,7 +177,9 @@ type CloneRequest struct {
|
||||
Timeout time.Duration `json:"timeout"` // Maximum time for clone analysis (0 = no timeout)
|
||||
|
||||
// LSH acceleration (opt-in)
|
||||
UseLSH bool `json:"use_lsh"`
|
||||
UseLSH bool `json:"use_lsh"` // Deprecated: use LSHEnabled instead
|
||||
LSHEnabled string `json:"lsh_enabled"` // "auto", "true", "false"
|
||||
LSHAutoThreshold int `json:"lsh_auto_threshold"` // Auto-enable LSH for N+ fragments
|
||||
LSHSimilarityThreshold float64 `json:"lsh_similarity_threshold"`
|
||||
LSHBands int `json:"lsh_bands"`
|
||||
LSHRows int `json:"lsh_rows"`
|
||||
@@ -365,6 +367,8 @@ func DefaultCloneRequest() *CloneRequest {
|
||||
CloneTypes: []CloneType{Type1Clone, Type2Clone, Type3Clone, Type4Clone},
|
||||
// LSH defaults (opt-in)
|
||||
UseLSH: false,
|
||||
LSHEnabled: "auto", // Auto-enable based on fragment count
|
||||
LSHAutoThreshold: 500, // Enable LSH for 500+ fragments
|
||||
LSHSimilarityThreshold: 0.78,
|
||||
LSHBands: 32,
|
||||
LSHRows: 4,
|
||||
|
||||
@@ -9,14 +9,65 @@ import (
|
||||
|
||||
// PyscnTomlConfig represents the structure of .pyscn.toml
|
||||
type PyscnTomlConfig struct {
|
||||
Analysis PyscnTomlAnalysisConfig `toml:"analysis"`
|
||||
Thresholds ThresholdConfig `toml:"thresholds"`
|
||||
Filtering PyscnTomlFilteringConfig `toml:"filtering"`
|
||||
Input PyscnTomlInputConfig `toml:"input"`
|
||||
Output PyscnTomlOutputConfig `toml:"output"`
|
||||
Performance PerformanceConfig `toml:"performance"`
|
||||
Grouping GroupingConfig `toml:"grouping"`
|
||||
LSH LSHConfig `toml:"lsh"`
|
||||
Clones ClonesConfig `toml:"clones"` // Primary: [clones] section
|
||||
Analysis PyscnTomlAnalysisConfig `toml:"analysis"` // Fallback for compatibility
|
||||
Thresholds ThresholdConfig `toml:"thresholds"` // Fallback for compatibility
|
||||
Filtering PyscnTomlFilteringConfig `toml:"filtering"` // Fallback for compatibility
|
||||
Input PyscnTomlInputConfig `toml:"input"` // Fallback for compatibility
|
||||
Output PyscnTomlOutputConfig `toml:"output"` // Fallback for compatibility
|
||||
Performance PerformanceConfig `toml:"performance"` // Fallback for compatibility
|
||||
Grouping GroupingConfig `toml:"grouping"` // Fallback for compatibility
|
||||
LSH LSHConfig `toml:"lsh"` // Fallback for compatibility
|
||||
}
|
||||
|
||||
// ClonesConfig represents the [clones] section of .pyscn.toml, which keeps
// every clone-detection setting in one flat table (LSH keys carry an lsh_
// prefix instead of living in a nested table).
//
// Boolean options are pointers so that an absent key (nil) can be told apart
// from an explicit false. Numeric and string options are plain values, so an
// omitted key is indistinguishable from an explicit zero or empty string.
type ClonesConfig struct {
	// Analysis settings.
	MinLines          int     `toml:"min_lines"`
	MinNodes          int     `toml:"min_nodes"`
	MaxEditDistance   float64 `toml:"max_edit_distance"`
	IgnoreLiterals    *bool   `toml:"ignore_literals"`    // pointer to detect unset
	IgnoreIdentifiers *bool   `toml:"ignore_identifiers"` // pointer to detect unset
	CostModelType     string  `toml:"cost_model_type"`

	// Thresholds.
	Type1Threshold      float64 `toml:"type1_threshold"`
	Type2Threshold      float64 `toml:"type2_threshold"`
	Type3Threshold      float64 `toml:"type3_threshold"`
	Type4Threshold      float64 `toml:"type4_threshold"`
	SimilarityThreshold float64 `toml:"similarity_threshold"`

	// Filtering.
	MinSimilarity     float64  `toml:"min_similarity"`
	MaxSimilarity     float64  `toml:"max_similarity"`
	EnabledCloneTypes []string `toml:"enabled_clone_types"`
	MaxResults        int      `toml:"max_results"`

	// Grouping.
	GroupingMode      string  `toml:"grouping_mode"`
	GroupingThreshold float64 `toml:"grouping_threshold"`
	KCoreK            int     `toml:"k_core_k"`

	// LSH (flat structure with lsh_ prefix).
	LSHEnabled             string  `toml:"lsh_enabled"`
	LSHAutoThreshold       int     `toml:"lsh_auto_threshold"`
	LSHSimilarityThreshold float64 `toml:"lsh_similarity_threshold"`
	LSHBands               int     `toml:"lsh_bands"`
	LSHRows                int     `toml:"lsh_rows"`
	LSHHashes              int     `toml:"lsh_hashes"`

	// Performance.
	MaxMemoryMB    int   `toml:"max_memory_mb"`
	BatchSize      int   `toml:"batch_size"`
	EnableBatching *bool `toml:"enable_batching"` // pointer to detect unset
	MaxGoroutines  int   `toml:"max_goroutines"`
	TimeoutSeconds int   `toml:"timeout_seconds"`

	// Output.
	ShowDetails *bool  `toml:"show_details"` // pointer to detect unset
	ShowContent *bool  `toml:"show_content"` // pointer to detect unset
	SortBy      string `toml:"sort_by"`
	GroupClones *bool  `toml:"group_clones"` // pointer to detect unset
}
|
||||
|
||||
type PyscnTomlAnalysisConfig struct {
|
||||
@@ -139,7 +190,14 @@ func (l *TomlConfigLoader) findPyscnToml(startDir string) (string, error) {
|
||||
|
||||
// mergePyscnTomlConfigs merges .pyscn.toml config into defaults
|
||||
// using pointer booleans to detect unset values
|
||||
// Priority: [clones] section > individual sections (for backward compatibility)
|
||||
func (l *TomlConfigLoader) mergePyscnTomlConfigs(defaults *CloneConfig, pyscnToml *PyscnTomlConfig) {
|
||||
// First, merge from [clones] section if it exists (highest priority)
|
||||
l.mergeClonesSection(defaults, &pyscnToml.Clones)
|
||||
|
||||
// Then, merge from individual sections as fallback (for backward compatibility)
|
||||
// Only apply if not already set by [clones] section
|
||||
|
||||
// Analysis config
|
||||
if pyscnToml.Analysis.MinLines > 0 {
|
||||
defaults.Analysis.MinLines = pyscnToml.Analysis.MinLines
|
||||
@@ -257,6 +315,122 @@ func (l *TomlConfigLoader) mergePyscnTomlConfigs(defaults *CloneConfig, pyscnTom
|
||||
}
|
||||
}
|
||||
|
||||
// mergeClonesSection merges settings from the [clones] section
|
||||
func (l *TomlConfigLoader) mergeClonesSection(defaults *CloneConfig, clones *ClonesConfig) {
|
||||
// Analysis settings
|
||||
if clones.MinLines > 0 {
|
||||
defaults.Analysis.MinLines = clones.MinLines
|
||||
}
|
||||
if clones.MinNodes > 0 {
|
||||
defaults.Analysis.MinNodes = clones.MinNodes
|
||||
}
|
||||
if clones.MaxEditDistance > 0 {
|
||||
defaults.Analysis.MaxEditDistance = clones.MaxEditDistance
|
||||
}
|
||||
if clones.CostModelType != "" {
|
||||
defaults.Analysis.CostModelType = clones.CostModelType
|
||||
}
|
||||
if clones.IgnoreLiterals != nil {
|
||||
defaults.Analysis.IgnoreLiterals = *clones.IgnoreLiterals
|
||||
}
|
||||
if clones.IgnoreIdentifiers != nil {
|
||||
defaults.Analysis.IgnoreIdentifiers = *clones.IgnoreIdentifiers
|
||||
}
|
||||
|
||||
// Thresholds
|
||||
if clones.Type1Threshold > 0 {
|
||||
defaults.Thresholds.Type1Threshold = clones.Type1Threshold
|
||||
}
|
||||
if clones.Type2Threshold > 0 {
|
||||
defaults.Thresholds.Type2Threshold = clones.Type2Threshold
|
||||
}
|
||||
if clones.Type3Threshold > 0 {
|
||||
defaults.Thresholds.Type3Threshold = clones.Type3Threshold
|
||||
}
|
||||
if clones.Type4Threshold > 0 {
|
||||
defaults.Thresholds.Type4Threshold = clones.Type4Threshold
|
||||
}
|
||||
if clones.SimilarityThreshold > 0 {
|
||||
defaults.Thresholds.SimilarityThreshold = clones.SimilarityThreshold
|
||||
}
|
||||
|
||||
// Filtering
|
||||
if clones.MinSimilarity >= 0 {
|
||||
defaults.Filtering.MinSimilarity = clones.MinSimilarity
|
||||
}
|
||||
if clones.MaxSimilarity > 0 {
|
||||
defaults.Filtering.MaxSimilarity = clones.MaxSimilarity
|
||||
}
|
||||
if len(clones.EnabledCloneTypes) > 0 {
|
||||
defaults.Filtering.EnabledCloneTypes = clones.EnabledCloneTypes
|
||||
}
|
||||
if clones.MaxResults > 0 {
|
||||
defaults.Filtering.MaxResults = clones.MaxResults
|
||||
}
|
||||
|
||||
// Grouping
|
||||
if clones.GroupingMode != "" {
|
||||
defaults.Grouping.Mode = clones.GroupingMode
|
||||
}
|
||||
if clones.GroupingThreshold > 0 {
|
||||
defaults.Grouping.Threshold = clones.GroupingThreshold
|
||||
}
|
||||
if clones.KCoreK > 0 {
|
||||
defaults.Grouping.KCoreK = clones.KCoreK
|
||||
}
|
||||
|
||||
// LSH settings (this is the critical part!)
|
||||
if clones.LSHEnabled != "" {
|
||||
defaults.LSH.Enabled = clones.LSHEnabled
|
||||
}
|
||||
if clones.LSHAutoThreshold > 0 {
|
||||
defaults.LSH.AutoThreshold = clones.LSHAutoThreshold
|
||||
}
|
||||
if clones.LSHSimilarityThreshold > 0 {
|
||||
defaults.LSH.SimilarityThreshold = clones.LSHSimilarityThreshold
|
||||
}
|
||||
if clones.LSHBands > 0 {
|
||||
defaults.LSH.Bands = clones.LSHBands
|
||||
}
|
||||
if clones.LSHRows > 0 {
|
||||
defaults.LSH.Rows = clones.LSHRows
|
||||
}
|
||||
if clones.LSHHashes > 0 {
|
||||
defaults.LSH.Hashes = clones.LSHHashes
|
||||
}
|
||||
|
||||
// Performance
|
||||
if clones.MaxMemoryMB > 0 {
|
||||
defaults.Performance.MaxMemoryMB = clones.MaxMemoryMB
|
||||
}
|
||||
if clones.BatchSize > 0 {
|
||||
defaults.Performance.BatchSize = clones.BatchSize
|
||||
}
|
||||
if clones.EnableBatching != nil {
|
||||
defaults.Performance.EnableBatching = *clones.EnableBatching
|
||||
}
|
||||
if clones.MaxGoroutines > 0 {
|
||||
defaults.Performance.MaxGoroutines = clones.MaxGoroutines
|
||||
}
|
||||
if clones.TimeoutSeconds > 0 {
|
||||
defaults.Performance.TimeoutSeconds = clones.TimeoutSeconds
|
||||
}
|
||||
|
||||
// Output
|
||||
if clones.ShowDetails != nil {
|
||||
defaults.Output.ShowDetails = *clones.ShowDetails
|
||||
}
|
||||
if clones.ShowContent != nil {
|
||||
defaults.Output.ShowContent = *clones.ShowContent
|
||||
}
|
||||
if clones.SortBy != "" {
|
||||
defaults.Output.SortBy = clones.SortBy
|
||||
}
|
||||
if clones.GroupClones != nil {
|
||||
defaults.Output.GroupClones = *clones.GroupClones
|
||||
}
|
||||
}
|
||||
|
||||
// GetSupportedConfigFiles returns the list of supported TOML config files
|
||||
// in order of precedence
|
||||
func (l *TomlConfigLoader) GetSupportedConfigFiles() []string {
|
||||
|
||||
@@ -129,6 +129,13 @@ func (c *CloneConfigurationLoader) cloneConfigToCloneRequest(cloneCfg *config.Cl
|
||||
Recursive: cloneCfg.Input.Recursive,
|
||||
IncludePatterns: cloneCfg.Input.IncludePatterns,
|
||||
ExcludePatterns: cloneCfg.Input.ExcludePatterns,
|
||||
// LSH settings
|
||||
LSHEnabled: cloneCfg.LSH.Enabled,
|
||||
LSHAutoThreshold: cloneCfg.LSH.AutoThreshold,
|
||||
LSHSimilarityThreshold: cloneCfg.LSH.SimilarityThreshold,
|
||||
LSHBands: cloneCfg.LSH.Bands,
|
||||
LSHRows: cloneCfg.LSH.Rows,
|
||||
LSHHashes: cloneCfg.LSH.Hashes,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -135,6 +135,28 @@ func (s *CloneService) DetectClonesInFiles(ctx context.Context, filePaths []stri
|
||||
|
||||
// Starting actual clone detection (this is the slow part)
|
||||
|
||||
// Determine whether to use LSH based on configuration
|
||||
useLSH := false
|
||||
if req.LSHEnabled == "true" {
|
||||
useLSH = true
|
||||
fmt.Fprintf(os.Stderr, "LSH: Explicitly enabled (lsh_enabled=true)\n")
|
||||
} else if req.LSHEnabled == "false" {
|
||||
useLSH = false
|
||||
fmt.Fprintf(os.Stderr, "LSH: Explicitly disabled (lsh_enabled=false)\n")
|
||||
} else if req.LSHEnabled == "auto" || req.LSHEnabled == "" {
|
||||
// Auto mode: enable LSH if fragment count >= threshold
|
||||
threshold := req.LSHAutoThreshold
|
||||
if threshold == 0 {
|
||||
threshold = 500 // Default threshold
|
||||
}
|
||||
useLSH = len(allFragments) >= threshold
|
||||
fmt.Fprintf(os.Stderr, "LSH: Auto-detection - %d fragments, threshold=%d, enabled=%v\n",
|
||||
len(allFragments), threshold, useLSH)
|
||||
}
|
||||
|
||||
// Update detector config with LSH decision
|
||||
detectorConfig.UseLSH = useLSH
|
||||
|
||||
// Detect clones (use LSH if enabled)
|
||||
var clonePairs []*analyzer.ClonePair
|
||||
var cloneGroups []*analyzer.CloneGroup
|
||||
@@ -265,8 +287,8 @@ func (s *CloneService) createDetectorConfig(req *domain.CloneRequest) *analyzer.
|
||||
GroupingMode: groupMode,
|
||||
GroupingThreshold: groupThreshold,
|
||||
KCoreK: kVal,
|
||||
// LSH
|
||||
UseLSH: req.UseLSH,
|
||||
// LSH (UseLSH will be set dynamically based on fragment count)
|
||||
UseLSH: false, // Will be overridden after fragment extraction
|
||||
LSHSimilarityThreshold: req.LSHSimilarityThreshold,
|
||||
LSHBands: req.LSHBands,
|
||||
LSHRows: req.LSHRows,
|
||||
|
||||
Reference in New Issue
Block a user