mirror of
https://github.com/life4/textdistance.git
synced 2021-09-19 22:35:47 +03:00
More libs, some optimizations, same-length libs, manually collected extras
This commit is contained in:
@@ -11,5 +11,7 @@ env:
|
||||
install:
|
||||
- pip install unittest2
|
||||
- if [[ $WITH_CONSTRAINTS == 'yes' ]]; then pip install -r constraints.txt; fi
|
||||
- if [[ $WITH_NUMPY == 'yes' ]]; then pip install numpy; fi
|
||||
script:
|
||||
- python run_tests.py
|
||||
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
abydos # https://github.com/chrislit/abydos
|
||||
distance # https://github.com/doukremt/distance
|
||||
jellyfish # https://github.com/jamesturk/jellyfish
|
||||
numpy
|
||||
py_stringmatching # https://github.com/anhaidgroup/py_stringmatching
|
||||
pylev # https://github.com/toastdriven/pylev
|
||||
python-Levenshtein # https://github.com/ztane/python-Levenshtein
|
||||
pyxDamerauLevenshtein # https://github.com/gfairchild/pyxDamerauLevenshtein
|
||||
tabulate
|
||||
jellyfish
|
||||
pyxDamerauLevenshtein
|
||||
abydos
|
||||
py_stringmatching
|
||||
python-Levenshtein
|
||||
|
||||
|
||||
61
setup.py
61
setup.py
@@ -1,27 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from setuptools import setup
|
||||
import os
|
||||
import os.path
|
||||
import json
|
||||
import sys
|
||||
|
||||
|
||||
def get_extras():
    """Build the ``extras_require`` mapping from ``textdistance/libraries.json``.

    The JSON file maps an algorithm name to a list of ``[module, function]``
    pairs. Only the first pair of each algorithm is used (the original code
    treats it as the fastest library for that algorithm).

    Returns:
        dict: ``{algorithm: [package]}`` for every algorithm that lists at
        least one library, plus an ``'all'`` key holding the sorted, de-duplicated
        union of those packages.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    libs_file = os.path.join(current_dir, 'textdistance', 'libraries.json')
    with open(libs_file, 'r') as f:
        data = json.load(f)

    extras = {}
    for algorithm, libs in data.items():
        if not libs:
            # No external library known for this algorithm: nothing to require.
            continue
        module_path = libs[0][0]
        # Entries may be dotted module paths (e.g. 'abydos.distance'); a
        # requirement has to name the package, so keep the top-level part.
        # NOTE(review): an import name can still differ from the name on PyPI
        # (e.g. 'Levenshtein' vs 'python-Levenshtein') -- confirm per library.
        extras[algorithm] = [module_path.split('.')[0]]

    # Sorted so the generated metadata is identical from run to run.
    extras['all'] = sorted(set(names[0] for names in extras.values()))
    return extras
|
||||
|
||||
|
||||
if 'show_extras' in sys.argv:
|
||||
print(json.dumps(get_extras(), indent=2))
|
||||
exit()
|
||||
|
||||
|
||||
setup(
|
||||
@@ -38,7 +17,45 @@ setup(
|
||||
packages=['textdistance', 'textdistance.algorithms'],
|
||||
package_data={'': ['*.json']},
|
||||
requires=['python (>= 2.7)'],
|
||||
extras_require=get_extras(),
|
||||
extras_require={
|
||||
'common': [ # enough for simple usage
|
||||
'abydos',
|
||||
'jellyfish', # for DamerauLevenshtein
|
||||
'numpy', # for SmithWaterman and other
|
||||
'python-Levenshtein', # for Jaro and Levenshtein
|
||||
'pyxDamerauLevenshtein', # for DamerauLevenshtein
|
||||
],
|
||||
'all': [ # needed for benchmarking, optimization and testing
|
||||
'abydos', # from common
|
||||
'jellyfish', # from common
|
||||
'numpy', # from common
|
||||
'py_stringmatching', # maybe will be faster on your system :)
|
||||
'python-Levenshtein', # from common
|
||||
'pyxDamerauLevenshtein', # from common
|
||||
'tabulate', # for benchmark's tables
|
||||
],
|
||||
# Per-algorithm extras, ordered from fastest to slowest; only libraries faster than textdistance itself are listed:
|
||||
'DamerauLevenshtein': [
|
||||
'jellyfish', # only for text
|
||||
'pyxdameraulevenshtein', # for any iterators
|
||||
],
|
||||
'Hamming': [
|
||||
'Levenshtein', # only same length and strings
|
||||
'jellyfish', # only strings, any length
|
||||
'distance', # only same length, any iterators
|
||||
'abydos', # any iterators
|
||||
],
|
||||
'Jaro': [
|
||||
'Levenshtein', # only text
|
||||
],
|
||||
'JaroWinkler': [
|
||||
'Levenshtein', # only text
|
||||
],
|
||||
'Levenshtein': [
|
||||
'Levenshtein', # only text
|
||||
# all other libraries are slower than textdistance's own implementation
|
||||
],
|
||||
},
|
||||
|
||||
url='https://github.com/orsinium/textdistance',
|
||||
download_url='https://github.com/orsinium/textdistance/tarball/master',
|
||||
|
||||
@@ -19,6 +19,8 @@ class ExternalTest(unittest.TestCase):
|
||||
|
||||
for s1, s2 in self.test_cases:
|
||||
with self.subTest(alg=alg, lib=lib.module_name, s1=s1, s2=s2):
|
||||
if not lib.check_conditions(internal_func, s1, s2):
|
||||
continue
|
||||
int_result = internal_func(s1, s2)
|
||||
s1, s2 = lib.prepare(s1, s2)
|
||||
ext_result = external_func(s1, s2)
|
||||
|
||||
@@ -28,12 +28,11 @@ func = cls(external=False)
|
||||
|
||||
STMT = """
|
||||
func('text', 'test')
|
||||
func('test', 'testit')
|
||||
# func('a' * 30, 'a' * 30)
|
||||
# func('a' * 15, 'b' * 15)
|
||||
func('qwer', 'asdf')
|
||||
func('a' * 15, 'b' * 15)
|
||||
"""
|
||||
|
||||
RUNS = 1000
|
||||
RUNS = 2000
|
||||
|
||||
|
||||
class Benchmark(object):
|
||||
@@ -67,7 +66,7 @@ class Benchmark(object):
|
||||
for alg in libraries.get_algorithms():
|
||||
yield Lib(
|
||||
algorithm=alg,
|
||||
library='textdistance',
|
||||
library='**textdistance**',
|
||||
function=alg,
|
||||
time=timeit(
|
||||
stmt=STMT,
|
||||
|
||||
@@ -10,16 +10,28 @@
|
||||
]
|
||||
],
|
||||
"Hamming": [
|
||||
[
|
||||
"Levenshtein",
|
||||
"hamming"
|
||||
],
|
||||
[
|
||||
"jellyfish",
|
||||
"hamming_distance"
|
||||
],
|
||||
[
|
||||
"distance",
|
||||
"hamming"
|
||||
],
|
||||
[
|
||||
"abydos.distance",
|
||||
"hamming"
|
||||
]
|
||||
],
|
||||
"Jaro": [
|
||||
[
|
||||
"Levenshtein",
|
||||
"jaro"
|
||||
],
|
||||
[
|
||||
"jellyfish",
|
||||
"jaro_distance"
|
||||
@@ -47,10 +59,6 @@
|
||||
[
|
||||
"jellyfish",
|
||||
"levenshtein_distance"
|
||||
],
|
||||
[
|
||||
"py_stringmatching.similarity_measure.levenshtein",
|
||||
"levenshtein"
|
||||
]
|
||||
]
|
||||
}
|
||||
@@ -111,20 +111,43 @@ class TextLibrary(LibraryBase):
|
||||
return sequences
|
||||
|
||||
|
||||
class SameLengthLibrary(LibraryBase):
    """Library wrapper that is only applicable to sequences of equal length."""

    def check_conditions(self, obj, *sequences):
        """Tell whether the external function may be used for these inputs.

        Args:
            obj: forwarded unchanged to the parent check
                (presumably the internal algorithm object -- verify against callers).
            *sequences: the sequences to compare; each must support ``len()``.

        Returns:
            bool: ``False`` if the parent check fails or the sequences differ
            in length, ``True`` otherwise (including when no sequences are given).
        """
        if not super(SameLengthLibrary, self).check_conditions(obj, *sequences):
            return False
        # Compare only same-length sequences: at most one distinct length is
        # allowed. Unlike min()/max(), this does not raise on an empty argument list.
        return len(set(map(len, sequences))) <= 1
|
||||
|
||||
|
||||
class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
    """Combine SameLengthLibrary and TextLibrary without adding any behaviour.

    SameLengthLibrary comes first in the bases, so its equal-length check in
    ``check_conditions`` runs first and then delegates via ``super()`` to the
    next class in the MRO (TextLibrary, if it defines that method).
    """
|
||||
|
||||
|
||||
# Registry of optional external implementations, grouped by algorithm name.
# Each wrapper is built from two strings; judging by the entries below these
# are the module path and the function name inside that module.
libraries = LibrariesManager()

# DamerauLevenshtein
libraries.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pylev', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
libraries.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))

# Hamming
# 'distance' and 'Levenshtein' are wrapped in the SameLength* classes, whose
# check_conditions() rejects inputs of different lengths; Levenshtein.hamming
# is therefore registered once, as SameLengthTextLibrary rather than TextLibrary.
libraries.register('Hamming', LibraryBase('abydos.distance', 'hamming'))
libraries.register('Hamming', SameLengthLibrary('distance', 'hamming'))
libraries.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
libraries.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))

# Jaro
libraries.register('Jaro', TextLibrary('jellyfish', 'jaro_distance'))
libraries.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
libraries.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))

# JaroWinkler
# NOTE(review): the py_stringmatching variant is disabled; the reason is not
# visible here -- confirm before re-enabling.
# libraries.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
libraries.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler'))
libraries.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler'))

# Levenshtein
libraries.register('Levenshtein', LibraryBase('abydos.distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
libraries.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
libraries.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
libraries.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))
|
||||
|
||||
8
tox.ini
8
tox.ini
@@ -14,12 +14,14 @@ setenv =
|
||||
NUMPY: WITH_NUMPY=yes
|
||||
NONUMPY: WITH_NUMPY=no
|
||||
deps =
|
||||
NUMPY: numpy
|
||||
CONS: jellyfish
|
||||
CONS: pyxDamerauLevenshtein
|
||||
CONS: abydos
|
||||
CONS: distance
|
||||
CONS: jellyfish
|
||||
CONS: pylev
|
||||
CONS: py_stringmatching
|
||||
CONS: python-Levenshtein
|
||||
CONS: pyxDamerauLevenshtein
|
||||
NUMPY: numpy
|
||||
py2: unittest2
|
||||
|
||||
[testenv:flake8]
|
||||
|
||||
Reference in New Issue
Block a user