1
0
mirror of https://github.com/life4/textdistance.git synced 2021-09-19 22:35:47 +03:00

More libs, some optimizations, same-length libs, manually collected extras

This commit is contained in:
orsinium
2018-03-31 11:04:49 +05:00
parent 23965bac83
commit 23ba2d5906
8 changed files with 96 additions and 40 deletions

View File

@@ -11,5 +11,7 @@ env:
install:
- pip install unittest2
- if [[ $WITH_CONSTRAINTS == 'yes' ]]; then pip install -r constraints.txt; fi
- if [[ $WITH_NUMPY == 'yes' ]]; then pip install numpy; fi
script:
- python run_tests.py

View File

@@ -1,7 +1,10 @@
abydos # https://github.com/chrislit/abydos
distance # https://github.com/doukremt/distance
jellyfish # https://github.com/jamesturk/jellyfish
numpy
py_stringmatching # https://github.com/anhaidgroup/py_stringmatching
pylev # https://github.com/toastdriven/pylev
python-Levenshtein # https://github.com/ztane/python-Levenshtein
pyxDamerauLevenshtein # https://github.com/gfairchild/pyxDamerauLevenshtein
tabulate
jellyfish
pyxDamerauLevenshtein
abydos
py_stringmatching
python-Levenshtein

View File

@@ -1,27 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from setuptools import setup
import os
import os.path
import json
import sys
def get_extras():
    """Build the ``extras_require`` mapping from ``textdistance/libraries.json``.

    The JSON file maps each algorithm name to a list of
    ``[module, function]`` pairs. Only the first pair of every algorithm
    is used (the fastest library, per the original comment).

    Returns:
        dict: algorithm name -> one-element list with the first library
        name, plus an ``'all'`` key that lists every selected library
        once, sorted alphabetically.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    libs_file = os.path.join(current_dir, 'textdistance', 'libraries.json')
    with open(libs_file, 'r') as f:
        data = json.load(f)
    # get only fastest lib for all algorithms; an algorithm without any
    # registered library is skipped instead of raising IndexError
    extras = {alg: [libs[0][0]] for alg, libs in data.items() if libs}
    # sorted() keeps the output reproducible between runs (set order is not)
    extras['all'] = sorted({libs[0] for libs in extras.values()})
    return extras
# Developer helper: when 'show_extras' is among the command-line arguments,
# print the computed extras as JSON and stop before setup() is reached.
if 'show_extras' in sys.argv:
    print(json.dumps(get_extras(), indent=2))
    # sys.exit() rather than the bare exit() helper, which only exists
    # when the `site` module has been loaded
    sys.exit()
setup(
@@ -38,7 +17,45 @@ setup(
packages=['textdistance', 'textdistance.algorithms'],
package_data={'': ['*.json']},
requires=['python (>= 2.7)'],
extras_require=get_extras(),
extras_require={
'common': [ # enough for simple usage
'abydos',
'jellyfish', # for DamerauLevenshtein
'numpy', # for SmithWaterman and other
'python-Levenshtein', # for Jaro and Levenshtein
'pyxDamerauLevenshtein', # for DamerauLevenshtein
],
'all': [ # needed for benchmarking, optimization and testing
'abydos', # from common
'jellyfish', # from common
'numpy', # from common
'py_stringmatching', # maybe will be faster on your system :)
'python-Levenshtein', # from common
'pyxDamerauLevenshtein', # from common
'tabulate', # for benchmark's tables
],
# per-algorithm extras, ordered from fastest to slowest; only libraries faster than textdistance are listed:
'DamerauLevenshtein': [
'jellyfish', # only for text
'pyxdameraulevenshtein', # for any iterators
],
'Hamming': [
'Levenshtein', # only same length and strings
'jellyfish', # only strings, any length
'distance', # only same length, any iterators
'abydos', # any iterators
],
'Jaro': [
'Levenshtein', # only text
],
'JaroWinkler': [
'Levenshtein', # only text
],
'Levenshtein': [
'Levenshtein', # only text
# the other libraries are slower than textdistance, so they are not listed
],
},
url='https://github.com/orsinium/textdistance',
download_url='https://github.com/orsinium/textdistance/tarball/master',

View File

@@ -19,6 +19,8 @@ class ExternalTest(unittest.TestCase):
for s1, s2 in self.test_cases:
with self.subTest(alg=alg, lib=lib.module_name, s1=s1, s2=s2):
if not lib.check_conditions(internal_func, s1, s2):
continue
int_result = internal_func(s1, s2)
s1, s2 = lib.prepare(s1, s2)
ext_result = external_func(s1, s2)

View File

@@ -28,12 +28,11 @@ func = cls(external=False)
STMT = """
func('text', 'test')
func('test', 'testit')
# func('a' * 30, 'a' * 30)
# func('a' * 15, 'b' * 15)
func('qwer', 'asdf')
func('a' * 15, 'b' * 15)
"""
RUNS = 1000
RUNS = 2000
class Benchmark(object):
@@ -67,7 +66,7 @@ class Benchmark(object):
for alg in libraries.get_algorithms():
yield Lib(
algorithm=alg,
library='textdistance',
library='**textdistance**',
function=alg,
time=timeit(
stmt=STMT,

View File

@@ -10,16 +10,28 @@
]
],
"Hamming": [
[
"Levenshtein",
"hamming"
],
[
"jellyfish",
"hamming_distance"
],
[
"distance",
"hamming"
],
[
"abydos.distance",
"hamming"
]
],
"Jaro": [
[
"Levenshtein",
"jaro"
],
[
"jellyfish",
"jaro_distance"
@@ -47,10 +59,6 @@
[
"jellyfish",
"levenshtein_distance"
],
[
"py_stringmatching.similarity_measure.levenshtein",
"levenshtein"
]
]
}

View File

@@ -111,20 +111,43 @@ class TextLibrary(LibraryBase):
return sequences
class SameLengthLibrary(LibraryBase):
    """External library wrapper that is only used for equal-length inputs."""

    def check_conditions(self, obj, *sequences):
        """Tell whether the external function may be used for these inputs.

        Returns False when the parent class rejects the call or when the
        sequences differ in length, True otherwise.
        """
        if not super(SameLengthLibrary, self).check_conditions(obj, *sequences):
            return False
        # compare only same length iterators; collecting the lengths in a set
        # needs one pass and also copes with an empty ``sequences``
        # (min()/max() on an empty iterable would raise ValueError)
        return len(set(map(len, sequences))) <= 1
class SameLengthTextLibrary(SameLengthLibrary, TextLibrary):
    """Combine SameLengthLibrary and TextLibrary; adds nothing of its own."""
# Module-level registry of external implementations. Every register() call
# takes an algorithm name and a library wrapper; the wrapper is created from
# two strings (presumably the module path to import and the function name in
# that module — confirm against LibraryBase).
# NOTE(review): whether registration order influences which library is
# preferred is not visible here — confirm in LibrariesManager.
libraries = LibrariesManager()
# DamerauLevenshtein
libraries.register('DamerauLevenshtein', LibraryBase('abydos.distance', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pylev', 'damerau_levenshtein'))
libraries.register('DamerauLevenshtein', LibraryBase('pyxdameraulevenshtein', 'damerau_levenshtein_distance'))
libraries.register('DamerauLevenshtein', TextLibrary('jellyfish', 'damerau_levenshtein_distance'))
# Hamming: the SameLength* wrappers reject inputs of different lengths
libraries.register('Hamming', LibraryBase('abydos.distance', 'hamming'))
libraries.register('Hamming', SameLengthLibrary('distance', 'hamming'))
libraries.register('Hamming', SameLengthTextLibrary('Levenshtein', 'hamming'))
libraries.register('Hamming', TextLibrary('jellyfish', 'hamming_distance'))
# disabled: the same module/function is registered above as SameLengthTextLibrary
# libraries.register('Hamming', TextLibrary('Levenshtein', 'hamming'))
# Jaro
libraries.register('Jaro', TextLibrary('jellyfish', 'jaro_distance'))
libraries.register('Jaro', TextLibrary('Levenshtein', 'jaro'))
libraries.register('Jaro', TextLibrary('py_stringmatching.similarity_measure.jaro', 'jaro'))
# JaroWinkler (the py_stringmatching registration is disabled; reason not recorded here)
# libraries.register('JaroWinkler', LibraryBase('py_stringmatching.similarity_measure.jaro_winkler', 'jaro_winkler'))
libraries.register('JaroWinkler', TextLibrary('jellyfish', 'jaro_winkler'))
libraries.register('JaroWinkler', TextLibrary('Levenshtein', 'jaro_winkler'))
# Levenshtein
libraries.register('Levenshtein', LibraryBase('abydos.distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('distance', 'levenshtein'))
libraries.register('Levenshtein', LibraryBase('pylev', 'levenshtein'))
libraries.register('Levenshtein', TextLibrary('jellyfish', 'levenshtein_distance'))
libraries.register('Levenshtein', TextLibrary('Levenshtein', 'distance'))
libraries.register('Levenshtein', TextLibrary('py_stringmatching.similarity_measure.levenshtein', 'levenshtein'))

View File

@@ -14,12 +14,14 @@ setenv =
NUMPY: WITH_NUMPY=yes
NONUMPY: WITH_NUMPY=no
deps =
NUMPY: numpy
CONS: jellyfish
CONS: pyxDamerauLevenshtein
CONS: abydos
CONS: distance
CONS: jellyfish
CONS: pylev
CONS: py_stringmatching
CONS: python-Levenshtein
CONS: pyxDamerauLevenshtein
NUMPY: numpy
py2: unittest2
[testenv:flake8]