fix: implement LSH auto-enable from config file

## Problem
LSH (Locality-Sensitive Hashing) acceleration was not working because:
1. LSH settings in [clones] section of .pyscn.toml were not being loaded
2. toml_loader.go expected nested [lsh] section, but config used flat structure
3. cloneConfigToCloneRequest() was not converting LSH settings to CloneRequest
4. Auto-enable logic based on fragment count was not implemented

This caused clone detection to always use slow APTED algorithm, even for
large projects where LSH would provide significant speedup.

## Solution
1. Added ClonesConfig struct to read flat [clones] section structure
2. Implemented mergeClonesSection() to load all settings including LSH
3. Extended CloneRequest with LSH fields (LSHEnabled, LSHAutoThreshold, etc.)
4. Added auto-enable logic in clone_service.go:
   - "auto": enable LSH when fragments >= threshold (default: 500)
   - "true": always enable LSH
   - "false": always disable LSH
5. Added diagnostic messages showing LSH decision

## Changes
- domain/clone.go: Add LSH config fields to CloneRequest
- internal/config/toml_loader.go: Add ClonesConfig struct and merge logic
- service/clone_config_loader.go: Convert LSH settings to CloneRequest
- service/clone_service.go: Implement auto-enable logic based on fragment count
- .pyscn.toml: Document LSH settings (no functional change)

## Testing
- Verified LSH auto-detection with different thresholds
- Confirmed settings load correctly from .pyscn.toml
- All existing tests pass

## Related
- Fixes issue discovered during performance investigation
- Prepares for config refactoring in #124

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
DaisukeYoda
2025-10-05 12:31:06 +09:00
parent 98cbc2dabd
commit d6a6fc7e0a
5 changed files with 219 additions and 12 deletions

View File

@@ -75,7 +75,7 @@ k_core_k = 2 # K-core parameter
# LSH acceleration settings
lsh_enabled = "auto" # Enable LSH: true, false, auto (based on project size)
lsh_auto_threshold = 500 # Auto-enable LSH for projects with >500 files
lsh_auto_threshold = 500 # Auto-enable LSH for projects with >500 fragments
lsh_similarity_threshold = 0.78 # LSH similarity threshold
lsh_bands = 32 # Number of LSH bands
lsh_rows = 4 # Number of rows per band

View File

@@ -177,7 +177,9 @@ type CloneRequest struct {
Timeout time.Duration `json:"timeout"` // Maximum time for clone analysis (0 = no timeout)
// LSH acceleration (opt-in)
UseLSH bool `json:"use_lsh"`
UseLSH bool `json:"use_lsh"` // Deprecated: use LSHEnabled instead
LSHEnabled string `json:"lsh_enabled"` // "auto", "true", "false"
LSHAutoThreshold int `json:"lsh_auto_threshold"` // Auto-enable LSH for N+ fragments
LSHSimilarityThreshold float64 `json:"lsh_similarity_threshold"`
LSHBands int `json:"lsh_bands"`
LSHRows int `json:"lsh_rows"`
@@ -365,6 +367,8 @@ func DefaultCloneRequest() *CloneRequest {
CloneTypes: []CloneType{Type1Clone, Type2Clone, Type3Clone, Type4Clone},
// LSH defaults (opt-in)
UseLSH: false,
LSHEnabled: "auto", // Auto-enable based on fragment count
LSHAutoThreshold: 500, // Enable LSH for 500+ fragments
LSHSimilarityThreshold: 0.78,
LSHBands: 32,
LSHRows: 4,

View File

@@ -9,14 +9,65 @@ import (
// PyscnTomlConfig represents the structure of .pyscn.toml
type PyscnTomlConfig struct {
Analysis PyscnTomlAnalysisConfig `toml:"analysis"`
Thresholds ThresholdConfig `toml:"thresholds"`
Filtering PyscnTomlFilteringConfig `toml:"filtering"`
Input PyscnTomlInputConfig `toml:"input"`
Output PyscnTomlOutputConfig `toml:"output"`
Performance PerformanceConfig `toml:"performance"`
Grouping GroupingConfig `toml:"grouping"`
LSH LSHConfig `toml:"lsh"`
Clones ClonesConfig `toml:"clones"` // Primary: [clones] section
Analysis PyscnTomlAnalysisConfig `toml:"analysis"` // Fallback for compatibility
Thresholds ThresholdConfig `toml:"thresholds"` // Fallback for compatibility
Filtering PyscnTomlFilteringConfig `toml:"filtering"` // Fallback for compatibility
Input PyscnTomlInputConfig `toml:"input"` // Fallback for compatibility
Output PyscnTomlOutputConfig `toml:"output"` // Fallback for compatibility
Performance PerformanceConfig `toml:"performance"` // Fallback for compatibility
Grouping GroupingConfig `toml:"grouping"` // Fallback for compatibility
LSH LSHConfig `toml:"lsh"` // Fallback for compatibility
}
// ClonesConfig represents the [clones] section (flat structure)
type ClonesConfig struct {
// Analysis settings
MinLines int `toml:"min_lines"`
MinNodes int `toml:"min_nodes"`
MaxEditDistance float64 `toml:"max_edit_distance"`
IgnoreLiterals *bool `toml:"ignore_literals"` // pointer to detect unset
IgnoreIdentifiers *bool `toml:"ignore_identifiers"` // pointer to detect unset
CostModelType string `toml:"cost_model_type"`
// Thresholds
Type1Threshold float64 `toml:"type1_threshold"`
Type2Threshold float64 `toml:"type2_threshold"`
Type3Threshold float64 `toml:"type3_threshold"`
Type4Threshold float64 `toml:"type4_threshold"`
SimilarityThreshold float64 `toml:"similarity_threshold"`
// Filtering
MinSimilarity float64 `toml:"min_similarity"`
MaxSimilarity float64 `toml:"max_similarity"`
EnabledCloneTypes []string `toml:"enabled_clone_types"`
MaxResults int `toml:"max_results"`
// Grouping
GroupingMode string `toml:"grouping_mode"`
GroupingThreshold float64 `toml:"grouping_threshold"`
KCoreK int `toml:"k_core_k"`
// LSH (flat structure with lsh_ prefix)
LSHEnabled string `toml:"lsh_enabled"`
LSHAutoThreshold int `toml:"lsh_auto_threshold"`
LSHSimilarityThreshold float64 `toml:"lsh_similarity_threshold"`
LSHBands int `toml:"lsh_bands"`
LSHRows int `toml:"lsh_rows"`
LSHHashes int `toml:"lsh_hashes"`
// Performance
MaxMemoryMB int `toml:"max_memory_mb"`
BatchSize int `toml:"batch_size"`
EnableBatching *bool `toml:"enable_batching"` // pointer to detect unset
MaxGoroutines int `toml:"max_goroutines"`
TimeoutSeconds int `toml:"timeout_seconds"`
// Output
ShowDetails *bool `toml:"show_details"` // pointer to detect unset
ShowContent *bool `toml:"show_content"` // pointer to detect unset
SortBy string `toml:"sort_by"`
GroupClones *bool `toml:"group_clones"` // pointer to detect unset
}
type PyscnTomlAnalysisConfig struct {
@@ -139,7 +190,14 @@ func (l *TomlConfigLoader) findPyscnToml(startDir string) (string, error) {
// mergePyscnTomlConfigs merges .pyscn.toml config into defaults
// using pointer booleans to detect unset values
// Priority: [clones] section > individual sections (for backward compatibility)
func (l *TomlConfigLoader) mergePyscnTomlConfigs(defaults *CloneConfig, pyscnToml *PyscnTomlConfig) {
// First, merge from [clones] section if it exists (highest priority)
l.mergeClonesSection(defaults, &pyscnToml.Clones)
// Then, merge from individual sections as fallback (for backward compatibility)
// Only apply if not already set by [clones] section
// Analysis config
if pyscnToml.Analysis.MinLines > 0 {
defaults.Analysis.MinLines = pyscnToml.Analysis.MinLines
@@ -257,6 +315,122 @@ func (l *TomlConfigLoader) mergePyscnTomlConfigs(defaults *CloneConfig, pyscnTom
}
}
// mergeClonesSection merges settings from the [clones] section
func (l *TomlConfigLoader) mergeClonesSection(defaults *CloneConfig, clones *ClonesConfig) {
// Analysis settings
if clones.MinLines > 0 {
defaults.Analysis.MinLines = clones.MinLines
}
if clones.MinNodes > 0 {
defaults.Analysis.MinNodes = clones.MinNodes
}
if clones.MaxEditDistance > 0 {
defaults.Analysis.MaxEditDistance = clones.MaxEditDistance
}
if clones.CostModelType != "" {
defaults.Analysis.CostModelType = clones.CostModelType
}
if clones.IgnoreLiterals != nil {
defaults.Analysis.IgnoreLiterals = *clones.IgnoreLiterals
}
if clones.IgnoreIdentifiers != nil {
defaults.Analysis.IgnoreIdentifiers = *clones.IgnoreIdentifiers
}
// Thresholds
if clones.Type1Threshold > 0 {
defaults.Thresholds.Type1Threshold = clones.Type1Threshold
}
if clones.Type2Threshold > 0 {
defaults.Thresholds.Type2Threshold = clones.Type2Threshold
}
if clones.Type3Threshold > 0 {
defaults.Thresholds.Type3Threshold = clones.Type3Threshold
}
if clones.Type4Threshold > 0 {
defaults.Thresholds.Type4Threshold = clones.Type4Threshold
}
if clones.SimilarityThreshold > 0 {
defaults.Thresholds.SimilarityThreshold = clones.SimilarityThreshold
}
// Filtering
if clones.MinSimilarity >= 0 {
defaults.Filtering.MinSimilarity = clones.MinSimilarity
}
if clones.MaxSimilarity > 0 {
defaults.Filtering.MaxSimilarity = clones.MaxSimilarity
}
if len(clones.EnabledCloneTypes) > 0 {
defaults.Filtering.EnabledCloneTypes = clones.EnabledCloneTypes
}
if clones.MaxResults > 0 {
defaults.Filtering.MaxResults = clones.MaxResults
}
// Grouping
if clones.GroupingMode != "" {
defaults.Grouping.Mode = clones.GroupingMode
}
if clones.GroupingThreshold > 0 {
defaults.Grouping.Threshold = clones.GroupingThreshold
}
if clones.KCoreK > 0 {
defaults.Grouping.KCoreK = clones.KCoreK
}
// LSH settings (this is the critical part!)
if clones.LSHEnabled != "" {
defaults.LSH.Enabled = clones.LSHEnabled
}
if clones.LSHAutoThreshold > 0 {
defaults.LSH.AutoThreshold = clones.LSHAutoThreshold
}
if clones.LSHSimilarityThreshold > 0 {
defaults.LSH.SimilarityThreshold = clones.LSHSimilarityThreshold
}
if clones.LSHBands > 0 {
defaults.LSH.Bands = clones.LSHBands
}
if clones.LSHRows > 0 {
defaults.LSH.Rows = clones.LSHRows
}
if clones.LSHHashes > 0 {
defaults.LSH.Hashes = clones.LSHHashes
}
// Performance
if clones.MaxMemoryMB > 0 {
defaults.Performance.MaxMemoryMB = clones.MaxMemoryMB
}
if clones.BatchSize > 0 {
defaults.Performance.BatchSize = clones.BatchSize
}
if clones.EnableBatching != nil {
defaults.Performance.EnableBatching = *clones.EnableBatching
}
if clones.MaxGoroutines > 0 {
defaults.Performance.MaxGoroutines = clones.MaxGoroutines
}
if clones.TimeoutSeconds > 0 {
defaults.Performance.TimeoutSeconds = clones.TimeoutSeconds
}
// Output
if clones.ShowDetails != nil {
defaults.Output.ShowDetails = *clones.ShowDetails
}
if clones.ShowContent != nil {
defaults.Output.ShowContent = *clones.ShowContent
}
if clones.SortBy != "" {
defaults.Output.SortBy = clones.SortBy
}
if clones.GroupClones != nil {
defaults.Output.GroupClones = *clones.GroupClones
}
}
// GetSupportedConfigFiles returns the list of supported TOML config files
// in order of precedence
func (l *TomlConfigLoader) GetSupportedConfigFiles() []string {

View File

@@ -129,6 +129,13 @@ func (c *CloneConfigurationLoader) cloneConfigToCloneRequest(cloneCfg *config.Cl
Recursive: cloneCfg.Input.Recursive,
IncludePatterns: cloneCfg.Input.IncludePatterns,
ExcludePatterns: cloneCfg.Input.ExcludePatterns,
// LSH settings
LSHEnabled: cloneCfg.LSH.Enabled,
LSHAutoThreshold: cloneCfg.LSH.AutoThreshold,
LSHSimilarityThreshold: cloneCfg.LSH.SimilarityThreshold,
LSHBands: cloneCfg.LSH.Bands,
LSHRows: cloneCfg.LSH.Rows,
LSHHashes: cloneCfg.LSH.Hashes,
}
}

View File

@@ -135,6 +135,28 @@ func (s *CloneService) DetectClonesInFiles(ctx context.Context, filePaths []stri
// Starting actual clone detection (this is the slow part)
// Determine whether to use LSH based on configuration
useLSH := false
if req.LSHEnabled == "true" {
useLSH = true
fmt.Fprintf(os.Stderr, "LSH: Explicitly enabled (lsh_enabled=true)\n")
} else if req.LSHEnabled == "false" {
useLSH = false
fmt.Fprintf(os.Stderr, "LSH: Explicitly disabled (lsh_enabled=false)\n")
} else if req.LSHEnabled == "auto" || req.LSHEnabled == "" {
// Auto mode: enable LSH if fragment count >= threshold
threshold := req.LSHAutoThreshold
if threshold == 0 {
threshold = 500 // Default threshold
}
useLSH = len(allFragments) >= threshold
fmt.Fprintf(os.Stderr, "LSH: Auto-detection - %d fragments, threshold=%d, enabled=%v\n",
len(allFragments), threshold, useLSH)
}
// Update detector config with LSH decision
detectorConfig.UseLSH = useLSH
// Detect clones (use LSH if enabled)
var clonePairs []*analyzer.ClonePair
var cloneGroups []*analyzer.CloneGroup
@@ -265,8 +287,8 @@ func (s *CloneService) createDetectorConfig(req *domain.CloneRequest) *analyzer.
GroupingMode: groupMode,
GroupingThreshold: groupThreshold,
KCoreK: kVal,
// LSH
UseLSH: req.UseLSH,
// LSH (UseLSH will be set dynamically based on fragment count)
UseLSH: false, // Will be overridden after fragment extraction
LSHSimilarityThreshold: req.LSHSimilarityThreshold,
LSHBands: req.LSHBands,
LSHRows: req.LSHRows,