mirror of
https://github.com/ludo-technologies/pyscn.git
synced 2025-10-06 00:59:45 +03:00
fix: make duplication scoring more strict and lower LSH threshold
## Problems Fixed

1. **Duplication scoring too lenient**: 5.8% duplication was scored as 100/100 (perfect) because thresholds were too high
2. **LSH threshold preventing clone detection**: 0.78 threshold filtered too many candidates, reducing detection accuracy

## Changes

### Duplication Scoring Thresholds (domain/analyze.go)

- `DuplicationThresholdLow`: 10.0 → 3.0
- `DuplicationThresholdMedium`: 25.0 → 10.0
- `DuplicationThresholdHigh`: 40.0 → 20.0

New scoring behavior:

- 0-3% duplication → score 100 (excellent)
- 3-10% duplication → score 70 (good, needs attention)
- 10-20% duplication → score 40 (poor)
- >20% duplication → score 0 (critical)

### LSH Threshold (.pyscn.toml)

- `lsh_similarity_threshold`: 0.78 → 0.50
- Lower threshold allows more clone candidates for APTED verification
- Improves recall without significantly impacting precision

## Impact

Before: `Duplication: 100/100 ✅ (5.8% duplication, 5 groups)`
After: `Duplication: 70/100 👍 (5.8% duplication, 5 groups)`

The score now accurately reflects that 5.8% duplication with 5 clone groups requires attention, while still maintaining an overall healthy codebase grade.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -76,7 +76,7 @@ k_core_k = 2 # K-core parameter
|
|||||||
# LSH acceleration settings
|
# LSH acceleration settings
|
||||||
lsh_enabled = "auto" # Enable LSH: true, false, auto (based on project size)
|
lsh_enabled = "auto" # Enable LSH: true, false, auto (based on project size)
|
||||||
lsh_auto_threshold = 500 # Auto-enable LSH for projects with >500 fragments
|
lsh_auto_threshold = 500 # Auto-enable LSH for projects with >500 fragments
|
||||||
lsh_similarity_threshold = 0.78 # LSH similarity threshold
|
lsh_similarity_threshold = 0.50 # LSH similarity threshold
|
||||||
lsh_bands = 32 # Number of LSH bands
|
lsh_bands = 32 # Number of LSH bands
|
||||||
lsh_rows = 4 # Number of rows per band
|
lsh_rows = 4 # Number of rows per band
|
||||||
lsh_hashes = 128 # Number of hash functions
|
lsh_hashes = 128 # Number of hash functions
|
||||||
|
|||||||
@@ -17,9 +17,9 @@ const (
|
|||||||
ComplexityPenaltyLow = 6
|
ComplexityPenaltyLow = 6
|
||||||
|
|
||||||
// Code duplication thresholds and penalties
|
// Code duplication thresholds and penalties
|
||||||
DuplicationThresholdHigh = 40.0
|
DuplicationThresholdHigh = 20.0
|
||||||
DuplicationThresholdMedium = 25.0
|
DuplicationThresholdMedium = 10.0
|
||||||
DuplicationThresholdLow = 10.0
|
DuplicationThresholdLow = 3.0
|
||||||
DuplicationPenaltyHigh = 20
|
DuplicationPenaltyHigh = 20
|
||||||
DuplicationPenaltyMedium = 12
|
DuplicationPenaltyMedium = 12
|
||||||
DuplicationPenaltyLow = 6
|
DuplicationPenaltyLow = 6
|
||||||
|
|||||||
Reference in New Issue
Block a user