56 Commits

Author SHA1 Message Date
Markus
a1e0c5bb4c Add dna to hyperactive csv. 2021-09-15 20:50:54 +02:00
Markus
e027c10b35 Add helper to convert values to dna. 2021-09-15 20:49:52 +02:00
Markus
254987d5b7 Add helper to convert values to dna. 2021-09-15 20:40:06 +02:00
Markus
750ca2da26 Update. 2021-09-14 20:19:53 +02:00
Markus
9e25c3a611 Update. 2021-09-14 19:37:56 +02:00
Markus
df9bed714a Update. 2021-09-14 19:06:23 +02:00
Markus
32daf1d3e4 Merge branch 'master' into hyperactive 2021-09-14 17:40:12 +02:00
Markus
09fd8676fb Sourcery optimize 2021-07-25 11:06:55 +02:00
Markus
18841d1ce4 Merge remote-tracking branch 'origin/hyperactive' into hyperactive 2021-07-25 11:03:57 +02:00
Markus
eb1f918d97 Merge branch 'master' into hyperactive 2021-07-25 11:03:34 +02:00
Markus
d16d6d76ef No minimum trades. Total_effect_rate manages this. 2021-07-24 20:36:34 +02:00
Markus
56bb6238d0 Add dna_rules to hyperactive implementation. 2021-07-24 20:32:18 +02:00
Markus
0b7d5422b7 Merge branch 'master' into hyperactive 2021-07-24 20:09:13 +02:00
Markus
22a31b9eee Merge branch 'master' into hyperactive
# Conflicts:
#	requirements.txt
2021-04-13 11:58:10 +02:00
Markus
af151b256c Remove daily balance and overfitting check again, as csv gets to big. Validation needs to be seperat and only performed on the best scores. For the daily_balance those backtests need to be rerun. 2021-03-22 09:50:45 +01:00
Markus
2f8f471a77 Finalize the overfitting check. 2021-03-21 16:38:56 +01:00
Markus
eb71a4eadb Finalize the overfitting check. 2021-03-21 16:37:39 +01:00
Markus
cea07d4b94 Finalize the overfitting check. 2021-03-21 16:37:09 +01:00
Markus
ef8afab006 Update hyperactive. 2021-03-21 12:33:15 +01:00
Markus
50f716c9b1 Further improve validation. 2021-03-20 21:05:16 +01:00
Markus
1586d0eeee Better handle files. 2021-03-20 20:15:50 +01:00
Markus
21a9707a92 Remove custom handling of iterations. 2021-03-20 19:17:51 +01:00
Markus
3d7c424aae Fix for nan in csv. 2021-03-20 19:15:59 +01:00
Markus
d4c50aff20 Change score of bad backtests. 2021-03-20 18:57:03 +01:00
Markus
0c86aa1d26 Check for iterations smaller then previous. 2021-03-20 18:52:12 +01:00
Markus
7eab98ef72 Merge remote-tracking branch 'origin/hyperactive' into hyperactive
# Conflicts:
#	jesse/modes/optimize_hyperactive_mode/__init__.py
2021-03-20 18:44:54 +01:00
Markus
9990c70ae7 Add validation. 2021-03-20 18:44:37 +01:00
Markus
d76334411b Add daily_returns to csv. 2021-03-20 13:55:21 +01:00
Markus
c14f130b4a Fix score for hyperactive. 2021-03-17 19:57:26 +01:00
Markus
2d7ee0b137 Merge branch 'master' into hyperactive 2021-03-15 13:41:34 +01:00
Markus
edd2239790 Merge branch 'debug' into hyperactive 2021-03-15 13:40:11 +01:00
Markus
7952faf268 Fix typo. 2021-03-15 10:32:02 +01:00
Markus
0207233a45 Merge branch 'master' into hyperactive 2021-03-15 10:30:08 +01:00
Markus
56fa2f1e8a Fix search space. 2021-03-15 10:30:02 +01:00
Markus
6dee3f3e76 Fix search space. 2021-03-15 10:28:06 +01:00
Markus
a5c9da389f Fix search space. 2021-03-15 10:25:40 +01:00
Markus
aa16daad95 Fix search space. 2021-03-15 10:19:55 +01:00
Markus
93c8fb82dd Fix search space. 2021-03-15 10:07:43 +01:00
Markus
5eb13b82e0 Use range because of missing value in 0 to 1. 2021-03-15 09:49:52 +01:00
Markus
979826ca7e Add logger in case of exception. 2021-03-15 09:35:59 +01:00
Markus
d010379ef5 Minor fix. 2021-03-14 22:12:36 +01:00
Markus
ed9a589be9 Minor fix. 2021-03-14 21:20:30 +01:00
Markus
474465aee7 Minor fix. 2021-03-14 21:15:49 +01:00
Markus
2eb109ce63 Fix not saved progress. 2021-03-14 21:13:11 +01:00
Markus
559a888483 Add combinations_count. 2021-03-14 20:01:38 +01:00
Markus
20e6ec57af Check for existing optimizer. 2021-03-14 19:41:58 +01:00
Markus
150765d267 Remove not suited optimizer. 2021-03-14 19:21:13 +01:00
Markus
16bc1016d1 Minor changes. 2021-03-14 19:14:03 +01:00
Markus
45317f953a Make optimizer configurable and add configurable iterations. 2021-03-14 18:55:05 +01:00
Markus
7bca9d042d Add optimizer. 2021-03-12 13:00:44 +01:00
Markus
a6b9d2fd86 Add optimizer. 2021-03-12 12:48:42 +01:00
Markus
36a03f16e3 Safe and load progress. 2021-03-12 12:09:36 +01:00
Markus
b5db3c82bf Safe and load progress. 2021-03-12 12:01:33 +01:00
Markus
9a99effd06 Safe and load progress. 2021-03-12 11:58:40 +01:00
Markus
607cd291dd Random Search 2021-03-12 11:05:05 +01:00
Markus
e70335fa8e Merge branch 'master' into lfix 2021-03-07 14:54:24 +01:00
5 changed files with 568 additions and 7 deletions

View File

@@ -337,6 +337,31 @@ def optimize(start_date: str, finish_date: str, optimal_total: int, cpu: int, de
optimize_mode(start_date, finish_date, optimal_total, cpu, csv, json)
@cli.command()
@click.argument('start_date', required=True, type=str)
@click.argument('finish_date', required=True, type=str)
@click.argument('optimal_total', required=True, type=int)
@click.argument('optimizer', required=True, type=str)
@click.argument('iterations', required=True, type=int)
@click.option('--cpu', default=0, show_default=True, help='The number of CPU cores that Jesse is allowed to use. If set to 0, it will use as many as is available on your machine.')
@click.option('--debug/--no-debug', default=False, help='Enables debug mode for the optimization run. Use it if you are interested in detailed logs.')
def optimize_hyperactive(start_date: str, finish_date: str, optimal_total: int, optimizer: str, iterations: int, cpu: int, debug: bool) -> None:
    """
    tunes the hyper-parameters of your strategy with a Hyperactive optimizer
    """
    # NOTE: click renders the docstring above as the command's --help text,
    # so parameter details are documented here instead:
    #   start_date / finish_date - forwarded unchanged to the optimize mode
    #   optimal_total            - forwarded unchanged to the optimize mode
    #   optimizer                - name of the optimizer to use
    #   iterations               - number of iterations to run
    #   cpu                      - number of CPU cores (0 = all available, see --cpu)
    #   debug                    - stored in config['app']['debug_mode']
    validate_cwd()

    # Imported inside the command so the config is only loaded once the
    # working directory has been validated.
    from jesse.config import config

    config['app']['trading_mode'] = 'optimize'

    register_custom_exception_handler()

    # debug flag
    config['app']['debug_mode'] = debug

    from jesse.modes.optimize_hyperactive_mode import optimize_mode_hyperactive

    # The mode expects (start, finish, optimal_total, cpu, optimizer, iterations),
    # which differs from the order of the CLI arguments.
    optimize_mode_hyperactive(start_date, finish_date, optimal_total, cpu, optimizer, iterations)
@cli.command()
@click.argument('name', required=True, type=str)
def make_strategy(name: str) -> None:

View File

@@ -5,8 +5,9 @@ import random
import string
import sys
import uuid
from typing import List, Tuple, Union, Any
from pprint import pprint
from typing import List, Tuple, Union, Any
import arrow
import click
import numpy as np
@@ -121,7 +122,7 @@ def date_to_timestamp(date: str) -> int:
return arrow_to_timestamp(arrow.get(date, 'YYYY-MM-DD'))
def dna_to_hp(strategy_hp, dna: str):
def dna_to_hp(strategy_hp: list, dna: str) -> dict:
hp = {}
for gene, h in zip(dna, strategy_hp):
@@ -140,6 +141,23 @@ def dna_to_hp(strategy_hp, dna: str):
return hp
def hp_to_dna(strategy_hp: list, values: dict) -> str:
    """
    Encode hyperparameter values as a DNA string, one character per hyperparameter.

    :param strategy_hp: list of hyperparameter definitions; each entry is a
        dict providing at least 'name', 'type', 'min' and 'max'.
    :param values: mapping of hyperparameter name -> value. It is indexed by
        ``h['name']``, so it must be a mapping (the previous ``list``
        annotation was incorrect).
    :return: the DNA string; its characters follow the order of ``strategy_hp``.
    :raises TypeError: if a hyperparameter's type is neither int nor float.
    """
    genes = []
    for h in strategy_hp:
        if h['type'] not in (int, float):
            raise TypeError('Only int and float types are implemented')

        # convert_number receives the hyperparameter's max/min and the fixed
        # bounds 119/40; the rounded result is used as a character code.
        # presumably this rescales [min, max] onto code points 40..119 -
        # confirm against convert_number.
        gene_code = round(
            convert_number(h['max'], h['min'], 119, 40, values[h['name']])
        )
        genes.append(chr(gene_code))

    return ''.join(genes)
def dump_exception() -> None:
"""
a useful debugging helper
@@ -274,6 +292,29 @@ def get_strategy_class(strategy_name: str):
return locate(f'strategies.{strategy_name}.{strategy_name}')
def hp_rules_valid(hp, rules) -> bool:
    """
    Check whether a set of hyperparameter values satisfies every comparison rule.

    :param hp: container of hyperparameter values supporting ``name in hp``
        and ``hp[name]``.
    :param rules: iterable of dicts with the keys 'hp_name1', 'operator' and
        'hp_name2'; 'operator' must be one of ``<``, ``>``, ``<=``, ``>=``.
    :return: True if all rules hold (also for an empty rule set), else False.
        A plain ``bool`` is returned rather than a ``numpy.bool_``.
    :raises ValueError: if a rule uses an unsupported operator or references
        a hyperparameter name that is not present in ``hp``.
    """
    import operator

    comparators = {
        "<": operator.lt,
        ">": operator.gt,
        "<=": operator.le,
        ">=": operator.ge,
    }

    all_valid = True
    # Deliberately no early exit: every rule is validated, so a malformed
    # rule is always reported even when an earlier rule already failed.
    for rule in rules:
        if rule['operator'] not in comparators:
            raise ValueError("{} is not a supported operator. Choose from < > <= >=".format(rule['operator']))
        if rule['hp_name1'] not in hp:
            raise ValueError("The hp name {} doesn't exist.".format(rule['hp_name1']))
        if rule['hp_name2'] not in hp:
            raise ValueError("The hp name {} doesn't exist.".format(rule['hp_name2']))

        compare = comparators[rule['operator']]
        if not compare(hp[rule['hp_name1']], hp[rule['hp_name2']]):
            all_valid = False

    return all_valid
def insecure_hash(msg: str) -> str:
    """
    Return the hexadecimal MD5 digest of ``msg``.

    As the name says, MD5 is not suitable for anything security related.
    """
    hasher = hashlib.md5()
    hasher.update(msg.encode())
    return hasher.hexdigest()
@@ -600,11 +641,11 @@ def round_decimals_down(number: np.ndarray, decimals: int = 2) -> float:
Returns a value rounded down to a specific number of decimal places.
"""
if not isinstance(decimals, int):
raise TypeError("decimal places must be an integer")
raise TypeError("decimal places must be an integer")
elif decimals < 0:
raise ValueError("decimal places has to be 0 or more")
raise ValueError("decimal places has to be 0 or more")
elif decimals == 0:
return np.floor(number)
return np.floor(number)
factor = 10 ** decimals
return np.floor(number * factor) / factor

View File

@@ -0,0 +1,334 @@
import ast
import csv
import os
import traceback
from math import log10
from multiprocessing import cpu_count
import click
import hyperactive
import numpy as np
import pandas as pd
from hyperactive.dashboards import ProgressBoard
import jesse.helpers as jh
import jesse.services.logger as logger
import jesse.services.required_candles as required_candles
from jesse import exceptions
from jesse.config import config
from jesse.modes.backtest_mode import simulator
from jesse.routes import router
from jesse.services import metrics as stats
from jesse.services.validators import validate_routes
from jesse.store import store
# from .overfitting import CSCV
from optimization_algorithm_config import optimization_config
os.environ['NUMEXPR_MAX_THREADS'] = str(cpu_count())
class Optimizer():
    """
    Hyperparameter optimization of a single-route strategy using Hyperactive.

    Every evaluated parameter set is appended, together with its score and
    its DNA string, to ``storage/optimize/csv/<study_name>.csv``. An existing
    file can be used to warm-start a later run.
    """

    # Keyword arguments read from ``optimization_config[<optimizer name>]``
    # for each supported Hyperactive optimizer class.
    _OPTIMIZER_PARAMS = {
        'RepulsingHillClimbingOptimizer': (
            'epsilon', 'distribution', 'n_neighbours', 'rand_rest_p', 'repulsion_factor'),
        'SimulatedAnnealingOptimizer': (
            'epsilon', 'distribution', 'n_neighbours', 'rand_rest_p', 'annealing_rate', 'start_temp'),
        'RandomSearchOptimizer': (),
        'RandomRestartHillClimbingOptimizer': (
            'epsilon', 'distribution', 'n_neighbours', 'rand_rest_p', 'n_iter_restart'),
        'RandomAnnealingOptimizer': (
            'epsilon', 'distribution', 'n_neighbours', 'rand_rest_p', 'annealing_rate', 'start_temp'),
        'ParallelTemperingOptimizer': ('population', 'n_iter_swap', 'rand_rest_p'),
        'ParticleSwarmOptimizer': (
            'population', 'inertia', 'cognitive_weight', 'social_weight', 'rand_rest_p'),
        'EvolutionStrategyOptimizer': (
            'population', 'mutation_rate', 'crossover_rate', 'rand_rest_p'),
    }

    # ratio configuration -> (key in the trade metrics, upper bound passed to jh.normalize)
    _RATIO_SETTINGS = {
        'sharpe': ('sharpe_ratio', 5),
        'calmar': ('calmar_ratio', 30),
        'sortino': ('sortino_ratio', 15),
        'omega': ('omega_ratio', 5),
    }

    def __init__(self, training_candles, optimal_total: int, cpu_cores: int, optimizer: str, iterations: int) -> None:
        """
        :param training_candles: candles passed to the backtest simulator;
            indexed by ``jh.key(exchange, symbol)``.
        :param optimal_total: trade count used as the base of the score's
            total-effect rate.
        :param cpu_cores: number of worker processes; 0 means all cores.
        :param optimizer: name of the Hyperactive optimizer class to use.
        :param iterations: number of iterations handed to Hyperactive.
        :raises NotImplementedError: if there is not exactly one route.
        :raises exceptions.InvalidStrategy: if the strategy defines no hyperparameters.
        :raises ValueError: if ``cpu_cores`` exceeds the machine's core count.
        """
        if len(router.routes) != 1:
            raise NotImplementedError('optimize_mode mode only supports one route at the moment')

        route = router.routes[0]
        self.strategy_name = route.strategy_name
        self.optimal_total = optimal_total
        self.exchange = route.exchange
        self.symbol = route.symbol
        self.timeframe = route.timeframe

        StrategyClass = jh.get_strategy_class(self.strategy_name)
        # the strategy methods are called unbound, with None standing in for self
        self.strategy_hp = StrategyClass.hyperparameters(None)
        if hasattr(StrategyClass, 'hyperparameters_rules'):
            self.hyperparameters_rules = StrategyClass.hyperparameters_rules(None)
        else:
            self.hyperparameters_rules = None

        self.solution_len = len(self.strategy_hp)
        self.optimizer = optimizer
        self.iterations = iterations

        if self.solution_len == 0:
            raise exceptions.InvalidStrategy('Targeted strategy does not implement a valid hyperparameters() method.')

        if cpu_cores > cpu_count():
            raise ValueError(f'Entered cpu cores number is more than available on this machine which is {cpu_count()}')
        elif cpu_cores == 0:
            self.cpu_cores = cpu_count()
        else:
            self.cpu_cores = cpu_cores

        self.training_candles = training_candles

        # date part (text before the 'T') of the first and last training candle
        key = jh.key(self.exchange, self.symbol)
        training_candles_start_date = jh.timestamp_to_time(self.training_candles[key]['candles'][0][0]).split('T')[0]
        training_candles_finish_date = jh.timestamp_to_time(self.training_candles[key]['candles'][-1][0]).split('T')[0]

        # loaded once here; objective_function injects them into the store
        # before every backtest
        self.training_initial_candles = []
        for c in config['app']['considering_candles']:
            self.training_initial_candles.append(
                required_candles.load_required_candles(c[0], c[1], training_candles_start_date,
                                                       training_candles_finish_date))

        self.study_name = f'{self.strategy_name}-{self.exchange}-{self.symbol}-{self.timeframe}-{self.optimizer}'

        self.path = f'storage/optimize/csv/{self.study_name}.csv'
        os.makedirs('./storage/optimize/csv', exist_ok=True)

    def objective_function(self, hp):
        """
        Backtest one parameter set and return its score.

        :param hp: parameter object supplied by Hyperactive; it is passed to
            the simulator and its ``para_dict`` attribute is read and extended.
        :return: ``total_effect_rate * normalized ratio`` if the configured
            ratio is positive; ``np.nan`` if the rules are violated, the ratio
            is not positive, or the backtest raised.

        The parameter values, the score and the DNA are appended to the csv
        file in every case, and the store is reset afterwards.
        """
        score = np.nan

        try:
            if self.hyperparameters_rules is None or jh.hp_rules_valid(hp, self.hyperparameters_rules):
                # init candle store
                store.candles.init_storage(5000)
                # inject required TRAINING candles to the candle store
                for num, c in enumerate(config['app']['considering_candles']):
                    required_candles.inject_required_candles_to_store(
                        self.training_initial_candles[num],
                        c[0],
                        c[1]
                    )

                # run backtest simulation
                simulator(self.training_candles, hp)

                training_data = stats.trades(store.completed_trades.trades, store.app.daily_balance)
                # NOTE(review): log10 raises for a trade count of 0; that case
                # currently ends up in the except branch below and is logged.
                total_effect_rate = log10(training_data['total']) / log10(self.optimal_total)
                total_effect_rate = min(total_effect_rate, 1)

                ratio_config = jh.get_config('env.optimization.ratio', 'sharpe')
                if ratio_config not in self._RATIO_SETTINGS:
                    raise ValueError(f'The entered ratio configuration `{ratio_config}` for the optimization is unknown. Choose between sharpe, calmar, sortino and omega.')
                metric_key, upper_bound = self._RATIO_SETTINGS[ratio_config]
                ratio = training_data[metric_key]
                ratio_normalized = jh.normalize(ratio, -.5, upper_bound)

                if ratio > 0:
                    score = total_effect_rate * ratio_normalized
        except Exception as e:
            # a failing backtest must not abort the whole search: log it and
            # keep the nan score
            logger.error("".join(traceback.TracebackException.from_exception(e).format()))
        finally:
            # NOTE(review): this is the parameter object's own dict, not a
            # copy, so "score" and "dna" are added to it in place.
            parameter_dict = hp.para_dict
            parameter_dict["score"] = score
            parameter_dict["dna"] = jh.hp_to_dna(self.strategy_hp, hp.para_dict)

            # The daily balance is intentionally not stored: the csv gets too big.

            # append the row to the csv; newline='' is what the csv module
            # requires to avoid extra blank lines on some platforms
            with open(self.path, "a", newline='') as f:
                writer = csv.writer(f, delimiter=';')
                writer.writerow(parameter_dict.values())

            # reset store
            store.reset()

        return score

    @staticmethod
    def _decimal_places(number) -> int:
        """Return the number of decimal places in ``str(number)`` (0 for integers)."""
        from decimal import Decimal

        # Decimal also understands exponent notation such as '1e-05', which a
        # plain search for '.' in the string does not.
        return max(0, -Decimal(str(number)).as_tuple().exponent)

    def get_search_space(self):
        """
        Build the Hyperactive search space from the strategy's hyperparameters.

        :return: dict mapping each hyperparameter name to the list of values
            to try. int and float parameters get a grid from 'min' in steps of
            'step' (default 1 for int, 0.1 for float) that never exceeds
            'max'; bool parameters get ``[True, False]``.
        :raises TypeError: for any other hyperparameter type.

        The hyperparameter definitions themselves are left unmodified.
        """
        hp = {}

        for st_hp in self.strategy_hp:
            if st_hp['type'] is int:
                step = st_hp.get('step', 1)
                # stop at max + 1 so the grid cannot overshoot 'max' when
                # (max - min) is not a multiple of the step
                hp[st_hp['name']] = list(range(st_hp['min'], st_hp['max'] + 1, step))
            elif st_hp['type'] is float:
                step = st_hp.get('step', 0.1)
                scale = 10 ** self._decimal_places(step)
                scaled = np.arange(st_hp['min'], st_hp['max'] + step, step) * scale
                # Values are truncated to the step's precision. The tiny nudge
                # away from zero keeps float noise (e.g. 7.999999999 instead
                # of 8) from truncating to the previous grid value.
                grid = np.trunc(scaled + np.copysign(1e-9, scaled)) / scale
                # arange with a float step may emit one value past the end
                # NOTE(review): a 'min' finer than the step is still truncated
                # to the step's precision (0.01 -> 0.0 for step 0.1).
                hp[st_hp['name']] = list(grid[grid <= st_hp['max']])
            elif st_hp['type'] is bool:
                hp[st_hp['name']] = [True, False]
            else:
                raise TypeError('Only int, bool and float types are implemented')

        return hp

    def _build_optimizer(self):
        """Instantiate the configured Hyperactive optimizer from ``optimization_config``."""
        if self.optimizer not in self._OPTIMIZER_PARAMS:
            raise ValueError(f'Entered optimizer which is {self.optimizer} is not known.')

        kwargs = {
            name: optimization_config[self.optimizer][name]
            for name in self._OPTIMIZER_PARAMS[self.optimizer]
        }
        return getattr(hyperactive, self.optimizer)(**kwargs)

    def run(self):
        """
        Run the search.

        If a csv file of a previous run exists and is not empty, the user is
        asked whether to continue; on confirmation its content is passed to
        Hyperactive as ``memory_warm_start``. Otherwise the csv file is
        (re)created with only the header row.

        :raises ValueError: if the configured optimizer name is unknown.
        """
        hyper = hyperactive.Hyperactive(distribution="multiprocessing",
                                        verbosity=["progress_bar", "print_results", "print_times"])

        self.search_space = self.get_search_space()

        # TODO: later use the actual number of search space combinations to
        # determine n_iter.

        mem = None

        if jh.file_exists(self.path):
            with open(self.path, "r") as f:
                mem = pd.read_csv(f, sep=";", na_values='nan')
                # the dna column is derived from the parameters and is not
                # part of the search space
                mem.drop('dna', axis=1, inplace=True)
            if not mem.empty and not click.confirm(
                    f'Previous optimization results for {self.study_name} exists. Continue?',
                    default=True,
            ):
                mem = None

        optimizer = self._build_optimizer()

        search_kwargs = {
            'optimizer': optimizer,
            'n_iter': self.iterations,
            'n_jobs': self.cpu_cores,
        }

        if mem is None or mem.empty:
            # start over: write an empty csv that only contains the header
            search_data = pd.DataFrame(columns=list(self.search_space.keys()) + ["score", "dna"])
            with open(self.path, "w", newline='') as f:
                search_data.to_csv(f, sep=";", index=False, na_rep='nan')
        else:
            search_kwargs['memory_warm_start'] = mem

        hyper.add_search(self.objective_function, self.search_space, **search_kwargs)
        hyper.run()
# def validate_optimization(self, cscv_nbins: int = 10):
# with open(self.path, "r") as f:
# results = pd.read_csv(f, sep=";", converters={'daily_balance': from_np_array}, na_values='nan')
# results.dropna(inplace=True)
# results.drop("score", 1, inplace=True)
# multi_index = results.columns.tolist()
# multi_index.remove('daily_balance')
# results.set_index(multi_index, drop=True, inplace=True)
# new_columns = results.index.to_flat_index()
#
# daily_balance = results.daily_balance.to_numpy()
# prepared = prepare_daily_percentage(daily_balance)
# vstack = np.vstack(prepared)
#
# daily_percentage = pd.DataFrame(vstack).transpose()
# daily_percentage.columns = new_columns
#
# cscv_objective = lambda r: r.mean()
# cscv = CSCV(n_bins=cscv_nbins, objective=cscv_objective)
# cscv.add_daily_returns(daily_percentage)
# cscv.estimate_overfitting(name=self.study_name)
# first make same length
# forward fill returns
# return percentage change
def prepare_daily_percentage(a):
    """
    Turn sequences of unequal length into rows of percentage changes.

    Each sequence in ``a`` becomes one row of a nan-padded 2D array as wide as
    the longest sequence. The array is passed through ``jh.np_ffill`` and the
    difference between consecutive columns is returned relative to the
    earlier column, multiplied by 100.
    """
    width = max(len(series) for series in a)
    padded = np.full((len(a), width), np.nan)
    for row, series in enumerate(a):
        padded[row, :len(series)] = series

    filled = jh.np_ffill(padded, 1)
    changes = np.diff(filled)
    return changes / filled[:, :-1] * 100
def optimize_mode_hyperactive(start_date: str, finish_date: str, optimal_total: int, cpu_cores: int, optimizer: str,
                              iterations: int) -> None:
    """
    Entry point of the hyperactive optimize mode.

    Clears the screen, validates the routes, loads the candles between
    ``start_date`` and ``finish_date`` and runs an ``Optimizer`` on them.
    """
    # clear the screen
    click.clear()

    # validate routes
    validate_routes(router)

    # load the historical candles; all of them are used for training
    # (no split into training and testing candles happens here)
    candles = get_training_candles(start_date, finish_date)

    study = Optimizer(candles, optimal_total, cpu_cores, optimizer, iterations)

    print('Starting optimization...')
    study.run()

    # Validation of the results is currently disabled.
def get_training_candles(start_date_str: str, finish_date_str: str):
    """
    Load the candles between the two dates via backtest mode's ``load_candles``
    (first try cache, then database).
    """
    # imported here rather than at module level, as in the original
    from jesse.modes.backtest_mode import load_candles

    candles = load_candles(start_date_str, finish_date_str)
    return candles
def from_np_array(array_string):
    """Parse the string form of a Python literal (e.g. a list) into a numpy array."""
    parsed = ast.literal_eval(array_string)
    return np.array(parsed)

View File

@@ -0,0 +1,151 @@
import itertools as itr
import math
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.distributions.empirical_distribution import ECDF
class CSCV(object):
    """Combinatorially symmetric cross-validation algorithm.

    Calculate backtesting about overfitting probability distribution and performance degradation.

    Attributes:
        n_bins: An int of CSCV algorithm bin size to control overfitting calculation. Default is 10.
        objective: A function of in sample (is) and out of sample (oos) return benchmark algorithm.
            Default is lambda r: r.mean().
        bins_enumeration: A list of sets; every way of choosing n_bins // 2 bins as the in-sample set.
        Rs: A list with one series of in-sample objective values per entry of bins_enumeration.
        R_bars: A list with one series of out-of-sample objective values per entry of bins_enumeration.
    """

    def __init__(self, n_bins=10, objective=lambda r: r.mean()):
        self.n_bins = n_bins
        self.objective = objective

        # Derived from n_bins (previously hard-coded to 10 bins, which broke
        # every other bin count).
        self.bins_enumeration = [set(x) for x in itr.combinations(range(n_bins), n_bins // 2)]

        self.Rs = [pd.Series(dtype=float) for _ in self.bins_enumeration]
        self.R_bars = [pd.Series(dtype=float) for _ in self.bins_enumeration]

    @staticmethod
    def _extend(collected, values):
        """Return ``collected`` with ``values`` appended (replacement for the removed ``Series.append``)."""
        if collected.empty:
            # avoids concatenating with an empty series
            return values.copy()
        return pd.concat([collected, values])

    def add_daily_returns(self, daily_returns):
        """Add daily_returns in algorithm.

        The rows are cut into n_bins consecutive bins of equal size (surplus
        rows at the end are not used). For every in-sample/out-of-sample
        split the objective is evaluated on both parts and stored.

        Args:
            daily_returns: A dataframe of trading daily_returns.
        """
        bin_size = daily_returns.shape[0] // self.n_bins
        bins = [daily_returns.iloc[i * bin_size: (i + 1) * bin_size] for i in range(self.n_bins)]
        all_bins = set(range(self.n_bins))

        for set_id, is_set in enumerate(self.bins_enumeration):
            oos_set = all_bins - is_set
            # sorted() keeps the rows of each part in their original order
            is_returns = pd.concat([bins[i] for i in sorted(is_set)])
            oos_returns = pd.concat([bins[i] for i in sorted(oos_set)])
            R = self.objective(is_returns)
            R_bar = self.objective(oos_returns)
            self.Rs[set_id] = self._extend(self.Rs[set_id], R)
            self.R_bars[set_id] = self._extend(self.R_bars[set_id], R_bar)

    def estimate_overfitting(self, name: str):
        """Estimate overfitting probability.

        Generate the result on Combinatorially symmetric cross-validation algorithm
        and save the related analysis charts as png files.

        Args:
            name: A str used as the directory name below storage/optimize/validation/
                in which the charts are saved.
        Returns:
            A dict of result include:
            pbo_test: A float of overfitting probability.
            logits: A list of estimated logits of OOS rankings.
            R_n_star: A list of IS performance of the strategies that has the best ranking in IS.
            R_bar_n_star: A list of find the OOS performance of the strategies that has the best ranking in IS.
            dom_df: A dataframe of optimized_IS, non_optimized_OOS data.
        """
        # calculate strategy performance in IS(R_df) and OOS(R_bar_df)
        R_df = pd.DataFrame(self.Rs)
        R_bar_df = pd.DataFrame(self.R_bars)

        # calculate ranking of the strategies
        R_rank_df = R_df.rank(axis=1, ascending=False, method='first')
        R_bar_rank_df = R_bar_df.rank(axis=1, ascending=False, method='first')

        # multiplying with this mask zeroes everything except the strategy
        # ranked first in IS; the zeros are dropped again below
        best_in_sample = R_rank_df == 1

        # find the IS performance of the strategies that has the best ranking in IS
        r_star_series = (R_df * best_in_sample).unstack().dropna()
        r_star_series = r_star_series[r_star_series != 0].sort_index(level=-1)

        # find the OOS performance of the strategies that has the best ranking in IS
        r_bar_star_series = (R_bar_df * best_in_sample).unstack().dropna()
        r_bar_star_series = r_bar_star_series[r_bar_star_series != 0].sort_index(level=-1)

        # find the ranking of strategies which has the best ranking in IS
        r_bar_rank_series = (R_bar_rank_df * best_in_sample).unstack().dropna()
        r_bar_rank_series = r_bar_rank_series[r_bar_rank_series != 0].sort_index(level=-1)

        # probability of overfitting
        # estimate logits of OOS rankings
        logits = (1 - (r_bar_rank_series / (len(R_df.columns) + 1))).map(lambda p: math.log(p / (1 - p)))
        prob = (logits < 0).sum() / len(logits)

        # stochastic dominance
        if len(r_bar_star_series) != 0:
            y = np.linspace(
                min(r_bar_star_series), max(r_bar_star_series), endpoint=True, num=1000
            )

            # build CDF performance of best candidate in IS
            R_bar_n_star_cdf = ECDF(r_bar_star_series.values)
            optimized = R_bar_n_star_cdf(y)

            # build CDF performance of average candidate in IS
            R_bar_mean_cdf = ECDF(R_bar_df.median(axis=1).values)
            non_optimized = R_bar_mean_cdf(y)

            dom_df = pd.DataFrame(
                dict(optimized_IS=optimized, non_optimized_OOS=non_optimized),
                index=y)
            dom_df["SD2"] = -(dom_df.non_optimized_OOS - dom_df.optimized_IS).cumsum()
        else:
            dom_df = pd.DataFrame(columns=['optimized_IS', 'non_optimized_OOS', 'SD2'])

        ret = {
            'pbo_test': prob,
            'logits': logits.to_list(),
            'R_n_star': r_star_series.to_list(),
            'R_bar_n_star': r_bar_star_series.to_list(),
            'dom_df': dom_df,
        }

        path = 'storage/optimize/validation/{}'.format(name)
        os.makedirs('./storage/optimize/validation/{}'.format(name), exist_ok=True)

        # Every chart gets a figure of its own, which is closed after saving;
        # otherwise the charts are drawn on top of each other.

        # probability distribution
        fig, ax = plt.subplots()
        ax.set_title('Probability Distribution')
        ax.hist(x=[l for l in ret['logits'] if l > -10000], bins='auto')
        ax.set_xlabel('Logits')
        ax.set_ylabel('Frequency')
        fig.savefig('{}/Probability Distribution.png'.format(path))
        plt.close(fig)

        # performance degradation
        fig, ax = plt.subplots()
        ax.set_title('Performance degradation')
        ax.scatter(ret['R_n_star'], ret['R_bar_n_star'])
        ax.set_xlabel('In-sample Performance')
        ax.set_ylabel('Out-of-sample Performance')
        fig.savefig('{}/Performance degradation.png'.format(path))
        plt.close(fig)

        # first and second Stochastic dominance
        # (DataFrame.plot creates its own figure; an empty frame has nothing to plot)
        if not dom_df.empty:
            ax = dom_df.plot(secondary_y=['SD2'])
            ax.set_title('Stochastic dominance')
            ax.set_xlabel('Performance optimized vs non-optimized')
            ax.set_ylabel('Frequency')
            fig = ax.get_figure()
            fig.savefig('{}/Stochastic dominance.png'.format(path))
            plt.close(fig)

        print('Validation plots saved in {}'.format(path))
        return ret

View File

@@ -30,8 +30,6 @@ def test_base_asset():
assert jh.base_asset('DEFI-USD') == 'DEFI'
def test_binary_search():
arr = [0, 11, 22, 33, 44, 54, 55]
@@ -104,6 +102,18 @@ def test_dna_to_hp():
assert jh.dna_to_hp(strategy_hp, dna) == {'hp1': 0.08518987341772151, 'hp2': 3}
def test_hp_to_dna():
    """hp_to_dna must encode one float and one int hyperparameter as the DNA '.:'."""
    hp_definitions = [
        {'name': 'hp1', 'type': float, 'min': 0.01, 'max': 1.0, 'default': 0.09},
        {'name': 'hp2', 'type': int, 'min': 1, 'max': 10, 'default': 2},
    ]
    hp_values = {"hp1": 0.08518987341772151, "hp2": 3}

    encoded = jh.hp_to_dna(hp_definitions, hp_values)

    assert encoded == '.:'
def test_dump_exception():
    """Intentionally empty: uses the database, which does not exist during testing."""
    return None