mirror of
https://github.com/life4/textdistance.git
synced 2021-09-19 22:35:47 +03:00
124 lines
3.3 KiB
Python
124 lines
3.3 KiB
Python
import json
|
|
from collections import defaultdict, namedtuple
|
|
from timeit import timeit
|
|
|
|
from tabulate import tabulate
|
|
|
|
from .libraries import not_optimized_libraries as libraries
|
|
from .libraries import LIBRARIES_FILE
|
|
|
|
|
|
# python3 -m textdistance.benchmark
|
|
|
|
|
|
# One row of benchmark output: the algorithm name, the module and function
# that implement it, the measured time, and the presets recorded for that
# library entry (dropped from the printed table).
Lib = namedtuple(
    'Lib',
    ('algorithm', 'library', 'function', 'time', 'presets'),
)
|
|
|
|
|
|
# ``timeit`` setup code for a third-party implementation. It is filled in with
# ``str.format`` using the fields of a ``Lib`` record (``library``,
# ``function``, ``presets``). The target is imported as ``func``; when
# ``presets`` is truthy, ``func`` is rebound to the result of calling it with
# the presets.
# NOTE: the body of the ``if`` must stay indented inside the string, otherwise
# the generated setup code is not valid Python.
EXTERNAL_SETUP = """
from {library} import {function} as func
presets = {presets}
if presets:
    func = func(presets)
"""
|
|
|
|
# ``timeit`` setup code for textdistance's own implementation. The single
# positional placeholder receives the algorithm name, which is imported from
# ``textdistance`` as ``cls`` and instantiated with ``external=False``.
INTERNAL_SETUP = """
from textdistance import {} as cls
func = cls(external=False)
"""
|
|
|
|
# Statement timed for every implementation: three calls of ``func`` (bound by
# one of the setup snippets) on short string pairs.
STMT = """
func('text', 'test')
func('qwer', 'asdf')
func('a' * 15, 'b' * 15)
"""

# Value passed as ``number`` to ``timeit``: how many times STMT is executed
# per measurement.
RUNS = 2000
|
|
|
|
|
|
class Benchmark(object):
    """Time third-party implementations against textdistance's own ones.

    Every method is a static or class method; the class only serves as a
    namespace. ``run`` is the entry point that ties the steps together.
    """

    @staticmethod
    def get_installed():
        """Yield a ``Lib`` record for each library entry that can be loaded.

        Entries for which ``lib.get_function()`` returns a falsy value are
        skipped. ``time`` is set to infinity as a placeholder because nothing
        has been measured yet.
        """
        for alg in libraries.get_algorithms():
            for lib in libraries.get_libs(alg):
                # try load function
                if not lib.get_function():
                    continue
                # return library info
                yield Lib(
                    algorithm=alg,
                    library=lib.module_name,
                    function=lib.func_name,
                    time=float('Inf'),
                    presets=lib.presets,
                )

    @staticmethod
    def get_external_benchmark(installed):
        """Yield a copy of each record in *installed* with ``time`` measured.

        The setup code is ``EXTERNAL_SETUP`` formatted with the record's own
        fields; ``STMT`` is executed ``RUNS`` times by ``timeit``.
        """
        for lib in installed:
            yield lib._replace(time=timeit(
                stmt=STMT,
                setup=EXTERNAL_SETUP.format(**lib._asdict()),
                number=RUNS,
            ))

    @staticmethod
    def get_internal_benchmark():
        """Yield one timed ``Lib`` record per algorithm for textdistance itself.

        The records use ``'**textdistance**'`` as the library name, the
        algorithm name as the function name, and ``None`` as presets.
        """
        for alg in libraries.get_algorithms():
            yield Lib(
                algorithm=alg,
                library='**textdistance**',
                function=alg,
                time=timeit(
                    stmt=STMT,
                    setup=INTERNAL_SETUP.format(alg),
                    number=RUNS,
                ),
                presets=None,
            )

    @staticmethod
    def filter_benchmark(external, internal):
        """Return a lazy iterator over the entries of *external* that are faster.

        An entry is kept when its ``time`` is strictly less than the time of
        the record in *internal* with the same ``algorithm``. Records taken
        from *internal* itself are therefore never kept.

        A ``KeyError`` is raised during iteration if an entry's algorithm has
        no record in *internal*.
        """
        limits = {i.algorithm: i.time for i in internal}
        return (x for x in external if x.time < limits[x.algorithm])

    @staticmethod
    def get_table(data):
        """Render *data* as a text table followed by a ``Total`` line.

        The last field of every record (``presets``) is left out of the
        table. *data* may be any iterable of records, including a one-shot
        iterator.
        """
        # Materialise the rows once: a one-shot iterator (for example the
        # result of ``filter_benchmark``) has no ``len()`` and would already
        # be exhausted by the time the total is computed.
        rows = [tuple(i[:-1]) for i in data]
        table = tabulate(
            rows,
            headers=['algorithm', 'library', 'function', 'time'],
        )
        table += '\nTotal: {} libs.\n\n'.format(len(rows))
        return table

    @staticmethod
    def save(libs):
        """Write *libs* to ``LIBRARIES_FILE`` as JSON.

        The file maps each algorithm name to a list of
        ``[library, function]`` pairs, in the order the records are given.
        """
        data = defaultdict(list)
        for lib in libs:
            data[lib.algorithm].append([lib.library, lib.function])
        with open(LIBRARIES_FILE, 'w') as f:
            json.dump(obj=data, fp=f, indent=2, sort_keys=True)

    @classmethod
    def run(cls):
        """Run the whole benchmark, print three tables and save the result.

        The tables list the installed libraries, all timings (external and
        internal, sorted by algorithm and time), and the entries that beat
        textdistance. Only that last set is passed to ``save``.
        """
        print('# Installed libraries:\n')
        installed = list(cls.get_installed())
        installed.sort()
        print(cls.get_table(installed))

        print('# Benchmarks (with textdistance):\n')
        benchmark = list(cls.get_external_benchmark(installed))
        benchmark_internal = list(cls.get_internal_benchmark())
        benchmark += benchmark_internal
        benchmark.sort(key=lambda x: (x.algorithm, x.time))
        print(cls.get_table(benchmark))

        print('# Faster than textdistance:\n')
        # Materialised because the result is used twice: printed and saved.
        benchmark = list(cls.filter_benchmark(benchmark, benchmark_internal))
        print(cls.get_table(benchmark))

        cls.save(benchmark)
|
|
|
|
|
|
# Script entry point (``python3 -m textdistance.benchmark``).
if __name__ == '__main__':
    Benchmark.run()
|