Overall Statistics
Total Trades: 4187
Average Win: 0.35%
Average Loss: -0.21%
Compounding Annual Return: 23.865%
Drawdown: 18.500%
Expectancy: 0.496
Net Profit: 751.154%
Sharpe Ratio: 1.277
Probabilistic Sharpe Ratio: 75.409%
Loss Rate: 43%
Win Rate: 57%
Profit-Loss Ratio: 1.64
Alpha: 0.096
Beta: 0.744
Annual Standard Deviation: 0.133
Annual Variance: 0.018
Information Ratio: 0.693
Tracking Error: 0.102
Treynor Ratio: 0.228
Total Fees: $20151.84
Estimated Strategy Capacity: $150000000.00
Lowest Capacity Asset: GOOCV VP83T1ZUHROL
#region imports
from AlgorithmImports import *
#endregion
from sklearn.model_selection import train_test_split
from optimize import *
import numpy as np
import pandas as pd

class EnsembleModel():

    def __init__(self, algo):
        self.algo = algo
        self.models = None
        self.optimizer = None

    # Build the set of member models
    def Train(self, X, y, returns):
        self.optimizer = ModelOptimizer(self.algo)
        try:
            self.models = self.optimizer.BuildModels(X, y, returns)
        except Exception as e:
            # Leave self.models as None so Predict returns 0 (no signal)
            self.algo.Log('EnsembleModel.Train failed: ' + str(e))
        

    # Return the prediction as a +/- Z-Score of the prediction from the threshold
    def Predict(self, X):
        z = 0

        if self.models:
            total_weight = sum(model.test_score for model in self.models)

            # Guard against a zero total weight to avoid division by zero
            if total_weight > 0:
                for model in self.models:
                    try:
                        result = model.Predict(X)
                        # Weight each result by the model's share of the total test score
                        z = z + result * (model.test_score / total_weight)
                    except Exception as e:
                        self.algo.Log('EnsembleModel.Predict failed for one model: ' + str(e))
            
        return z
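
# --- Usage sketch (illustrative only) ---
# A minimal, standalone illustration of the score-weighted vote above, using
# hypothetical stub models in place of real ClassifierModel instances.
if __name__ == '__main__':
    class _Stub:
        def __init__(self, test_score, z):
            self.test_score = test_score
            self._z = z
        def Predict(self, X):
            return self._z

    stubs = [_Stub(0.6, 1.2), _Stub(0.4, -0.5)]
    total = sum(m.test_score for m in stubs)
    z = sum(m.Predict(None) * (m.test_score / total) for m in stubs)
    print(z)  # 0.6*1.2 + 0.4*(-0.5) = 0.52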
#region imports
from AlgorithmImports import *
#endregion
import talib
from talib import func
import numpy as np
import pandas as pd

#
# These helpers make it easy to add technical-indicator features to a dataset:
# add_features(df, ['EMA_7'])
#
# Other examples:
# EMA_7_28 gives the ratio of the 7- and 28-period EMAs
# EMA_7_28_diff gives the difference between the current and previous value of that ratio
#
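# Example (illustrative, assuming TA-Lib is installed as imported above):
#
#   rng = np.random.default_rng(0)
#   close = pd.Series(100 + rng.standard_normal(200).cumsum())
#   df = pd.DataFrame({'close': close, 'high': close + 1,
#                      'low': close - 1, 'volume': 1000.0})
#   df = add_features(df, ['EMA_7', 'EMA_7_28', 'EMA_7_28_diff', 'RSI_14'])
#   print(df[['EMA_7', 'EMA_7_28', 'EMA_7_28_diff', 'RSI_14']].tail())
#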

def ichimoku_a(high, low, n1 = 9, n2 = 26, n3 = 52):
    conv = 0.5 * (high.rolling(n1, min_periods=0).max()
                  + low.rolling(n1, min_periods=0).min())
    base = 0.5 * (high.rolling(n2, min_periods=0).max()
                  + low.rolling(n2, min_periods=0).min())
    spana = 0.5 * (conv + base)
    return pd.Series(spana)

def ichimoku_b(high, low, n1 = 9, n2 = 26, n3 = 52):
    spanb = 0.5 * (high.rolling(n3, min_periods=0).max()
       + low.rolling(n3, min_periods=0).min())
    return pd.Series(spanb)

def obv_rolling_percent(close, volume, n, fillna=False):
    df = pd.DataFrame([close, volume]).transpose()
    df['OBV'] = 0
    c1 = close < close.shift(1)
    c2 = close > close.shift(1)
    if c1.any():
        df.loc[c1, 'OBV'] = - volume
    if c2.any():
        df.loc[c2, 'OBV'] = volume
    obv = df['OBV'].rolling(n).sum() / df[volume.name].rolling(n).sum()
    if fillna:
        obv = obv.replace([np.inf, -np.inf], np.nan).fillna(0)
    return pd.Series(obv, name='obv')

def kst(close, r1 = 3, r2 = 7, r3 = 14, r4 = 28, n1 = 10, n2 = 3, n3 = 7, n4 = 14):
    rocma1 = ((close - close.shift(r1))
            / close.shift(r1)).rolling(n1, min_periods=0).mean()
    rocma2 = ((close - close.shift(r2))
              / close.shift(r2)).rolling(n2, min_periods=0).mean()
    rocma3 = ((close - close.shift(r3))
              / close.shift(r3)).rolling(n3, min_periods=0).mean()
    rocma4 = ((close - close.shift(r4))
              / close.shift(r4)).rolling(n4, min_periods=0).mean()
    return 100 * (rocma1 + 2 * rocma2 + 3 * rocma3 + 4 * rocma4)
    
def kst_sig(close, r1 = 3, r2 = 7, r3 = 14, r4 = 28, n1 = 10, n2 = 3, n3 = 7, n4 = 14, nsig = 6):
    kst_val = kst(close, r1, r2, r3, r4, n1, n2, n3, n4)
    kstsig = kst_val.rolling(nsig, min_periods=0).mean()
    return kstsig

def check_feature_cache(cache, name, f, args):
    if name not in cache:
        cache[name] = f(*args)
    return cache[name]

def add_features(df, features, close="close", high="high", low="low", volume="volume"):

    fmap = {'MFI':'money_flow_index','ICHA':'ichimoku_a','ICHB':'ichimoku_b','BOLLH':'bollinger_hband', \
            'BOLLL':'bollinger_lband','KCC':'keltner_channel_central','NVI':'negative_volume_index', \
            'OBVP' : 'obv_rolling_percent', 'KST' : 'kst', 'KST_SIG' : 'kst_sig'}

    cache = {}


    for feature in features:
        # Parse the feature string. Format: FUNC_period1_period2:ColumnName (column name optional)

        col = None
        col_idx = feature.find(':')
        if col_idx > -1:
            col = feature[col_idx+1:]
            feature = feature[0:col_idx]

        p = feature.split('_')
        if col is not None:
            feature = feature + "_" + col

        fname = p[0].upper()

        # DM, DI, and HT functions have an underscore in their names, so re-attach
        # the suffix and shift the remaining parameters left by one
        if len(p) > 1 and (p[1] in ['DM','DI'] or p[0] in ['HT']):
            fname += '_' + p[1]
            for i in range(2, len(p)):
                p[i-1] = p[i]
            del p[-1]

        p1 = p[1].upper() if len(p) > 1 else None
        p2 = p[2].upper() if len(p) > 2 else None
        p3 = p[3].upper() if len(p) > 3 else None

        if fname in fmap:
            if fmap[fname].find('.') > -1:
                s = fmap[fname].split('.')
                cls = s[0]
                f = getattr(globals()[cls], s[1])
            else:
                f = globals()[fmap[fname].lower()]
        elif fname in talib.__TA_FUNCTION_NAMES__:
            f = getattr(func, fname)
        elif fname.lower() in globals():
            f = globals()[fname.lower()]
        else:
            raise Exception(f'Could not find function. fname: {fname} feature: {feature}')


        if fname.endswith('MA') or fname == 'T3':
            args = [df[close], int(p1)]
            if p2 is None:
                df[feature] = (check_feature_cache(cache, feature, f, args ))
            elif p2 == 'DIFF':
                ma1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                df[feature] = ma1.pct_change()
                p2 = None # So it doesn't get diffed at end of method
            else:
                ma1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                args = [df[close], int(p2)]
                ma2 = check_feature_cache(cache, fname + '_' + p2, f, args)
                df[feature] = (ma1 - ma2) / ma1
                df[feature].replace([np.inf, -np.inf], 0, inplace=True)


        elif fname in ['CMO','TRIX','ROCP','ROCR100','ROC','BOLLH','BOLLL', 'RSI','MOM']:
            df[feature] = f(df[close], int(p1))


        elif fname in ['MINUS_DM', 'PLUS_DM']:
            df[feature] = f(df[high], df[low], int(p1))

        elif fname in ['ADX','ATR','KCC','MINUS_DI','PLUS_DI','WILLR']:
            try:
                if df[close].isnull().all():
                    df[feature] = np.nan
                elif p2 is None or p2.upper() == 'DIFF':
                    df[feature] = f(df[high], df[low], df[close], int(p1))
                    # Convert to a percent of close, except for WILLR and ADX
                    if fname not in ['WILLR','ADX']:
                        df[feature] = df[feature]/df[close]
                else:
                    #params = {'close' : df[close], 'high' : df[high],'low' : df[low], 'timeperiod' : int(p1)}
                    args = [df[high], df[low], df[close], int(p1)]
                    f1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                    args = [df[high], df[low], df[close], int(p2)]
                    f2 = check_feature_cache(cache, fname + '_' + p2, f, args)
                    df[feature] = (f1-f2)/f1
                    df[feature].replace([np.inf, -np.inf], 0, inplace=True)
            except Exception as ex:
                if str(ex) == 'inputs are all NaN':
                    df[feature] = np.nan
                else:
                    raise


        elif fname in ['MFI']:
            df[feature] = f(df[high], df[low], df[close], df[volume], int(p1))

        elif fname in ['MACD', 'PPO', 'TSI']:
            df[feature] = f(df[close], int(p1), int(p2))

        elif fname in ['ICHA', 'ICHB', 'AO']:
            # Normalize
            df[feature] = f(df[high], df[low], int(p1), int(p2))
#             if fname.startswith('ICH'):
#                 df[feature] = df[feature]/ df[close]

        elif fname in ['NVI']:
            df[feature] = f(df[close], df[volume])

        elif fname in ['SAR', 'SAREXT']:
            # Normalize
            df[feature] = f(df[high], df[low]) #/ df[close]

        elif fname == 'KST':
            df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15)

        elif fname == 'KST_SIG':
            df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15, nsig=9)

        elif fname == 'AROON':
            df[feature] = f(df[high], df[low], int(p1))[int(p2)-1]

        elif fname == 'ULTOSC':
            df[feature] = f(df[high], df[low], df[close])

        elif fname in ['OBVP']:
            df[feature] = f(df[close],df[volume],int(p1))

        elif fname in ['OBTP']:
            df[feature] = f(df[close],df["Trades"],int(p1))

        elif fname in ['HT_DCPERIOD','HT_DCPHASE','HT_PHASOR','HT_SINE', 'HT_TRENDMODE','STOCHRSI' ]:
            df[feature] = f(df[close])

        else:
            raise Exception('Feature not found: ' + feature + ' ' + fname )

        # Difference those that are 1 period diffs
        if p3 == 'DIFF' or p2 == 'DIFF':
            df[feature] = df[feature].diff()


    return df
    
def get_exhaustive_ma_features_list():

    features = []
    periods = [7, 14, 28, 42, 70]

    mas = ["SMA", "DEMA", "KAMA", "EMA", "TEMA", "T3", "TRIMA", "WMA"]
    for p in periods:
        for ma in mas:
            features.append(ma + "_" + str(p))

    for p1 in periods:
        for p2 in periods:
            if p2 <= p1:
                continue

            for ma in mas:
                features.append(ma + "_" + str(p1) + "_" + str(p2))
                features.append(features[-1] + "_diff")

    return features

def get_exhaustive_features_list(diffs=True):

    features = get_exhaustive_ma_features_list()

    periods = [3,7,14,20,30]
    band_periods = [7,14,21,28]
    hi_lo_periods = [[4,9],[7,14],[10,21],[12,26]]

    pfs = ['CMO','MINUS_DI','MINUS_DM','PLUS_DI','PLUS_DM','ROCP','ATR','ADX','TRIX','WILLR']
    # Left out: MOM, VIN, VIP, ROC, ROCR, OBVM, EOM, DPI, RSI, STOCH, STOCH_SIG, FI
    for p in periods:

        features.append('AROON_' + str(p) + '_1')
        features.append('AROON_' + str(p) + '_2')
        for pf in pfs:
            if pf == 'ADX' and p == 3:
                continue
            features.append(pf + '_' + str(p))
            if diffs:
                features.append(features[-1] + "_diff")
    if diffs:
        for p1 in periods:
            for p2 in periods:
                if p2 <= p1:
                    continue

                for pf in pfs:
                    features.append(pf + "_" + str(p1) + "_" + str(p2))
                    features.append(features[-1] + "_diff")


    hlfs = ['ICHA','ICHB']
    # PPO excluded; it duplicates our MA setups
    # High/low indicators
    for hl in hi_lo_periods:
        for hlf in hlfs:
            features.append(hlf + '_' + str(hl[0]) + '_' + str(hl[1]))

    # other indicators
    otfs = ['SAR','SAREXT','HT_DCPERIOD','ULTOSC','OBVP_10','OBVP_20','OBVP_40','OBVP_80']
    for otf in otfs:
        features.append(otf)

    return features
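
# --- Usage sketch (illustrative only) ---
# Peek at the feature-name grammar generated above; these strings are the
# inputs that add_features understands.
if __name__ == '__main__':
    feats = get_exhaustive_features_list()
    print(len(feats))    # total number of generated feature names
    print(feats[:3])     # ['SMA_7', 'DEMA_7', 'KAMA_7']
    print(feats[-3:])    # ['OBVP_20', 'OBVP_40', 'OBVP_80']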
from AlgorithmImports import *
from risk import BracketRiskModel, TrailingStopRiskManagementModelLiquidation

from feature import add_features
from ensemble import EnsembleModel
from datetime import timedelta
import pandas as pd
import numpy as np
import statistics
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle
import math

class TransdimensionalDynamicThrustAssembly(QCAlgorithm):

    def Initialize(self):

        self.SetStartDate(2010, 1, 1)  # Set Start Date
        self.SetEndDate(2020, 1, 1)  # Set End Date
        self.InitCash = 100000
        self.SetCash(self.InitCash)  
        self.SetBrokerageModel(BrokerageName.InteractiveBrokersBrokerage)
        
        # time frames and frequencies
        self.resolution = Resolution.Daily
        self.rebalancefunc = Expiry.EndOfQuarter
        self.lookback = 250
        self.metamodel_rolling_window = 50

        # ML parameters
        self.years = 4
        self.n_estimators = 10
        self.min_samples_split = 140
        self.num_iter = 10
        self.num_test_records = 60

        # UniverseSize determines the number of ETF constituents to select from each ETF. The top x by weight are selected.
        self.UniverseSize = 9
        
        self.kelly_multiplier = 1

        self.SetWarmup(self.lookback)

        # if this flag is set to false, none of the meta labeling code gets called, thus saving a lot of expensive compute
        self.meta_labelling = False

        if self.meta_labelling:
            import ht_auth
            ht_auth.SetToken(self.GetParameter("mlfinlab-api-key"))
            import mlfinlab as ml
            self.ml = ml
            from mlfinlab.bet_sizing import bet_sizing
            self.bet_sizing = bet_sizing

        # Add SPY as the benchmark but never trade it; self.exclusions is checked at various points to enforce this
        self.SetBenchmark('SPY')
        self.MKT = self.AddEquity('SPY', self.resolution).Symbol
        self.mkt = []
        self.exclusions = ['SPY']

        self.UniverseSettings.Resolution = self.resolution
        
        features = ['HT_DCPERIOD', 'HT_DCPHASE', 'MOM_5', 'ADX_10_diff','MOM_5_diff']

        self.etfs = ["SPY"]

        for etf in self.etfs:
            self.AddUniverse(self.Universe.ETF(etf, Market.USA, self.UniverseSettings, self.ETFConstituentsFilter))

        # Select the alpha model
        self.SetAlpha(WeightedClassifierAlpha(self, features, self.lookback, self.metamodel_rolling_window, self.meta_labelling, self.years))

        # Equally weigh securities in portfolio, based on insights
        self.SetPortfolioConstruction(InsightWeightingPortfolioConstructionModel())

        # Set VWAP Execution Model (Immediate Execution worked well, but VWAP execution seems to produce slightly better results)
        self.SetExecution(VolumeWeightedAveragePriceExecutionModel())

        # Set Risk Management Model - none of the risk models improved performance over the alpha model's own flat insights, so for now NullRiskManagementModel seems best.
        # The Bracket Risk Model (SL and TP) and Trailing Stop (SL only) are both modified to prevent the Alpha Model from re-opening positions the risk model has closed.
        # self.stopLoss = 0.02
        # self.takeProfit = 0.95
        self.SetRiskManagement(NullRiskManagementModel())
        # self.SetRiskManagement(BracketRiskModel(self.stopLoss, self.takeProfit))
        # self.SetRiskManagement(TrailingStopRiskManagementModelLiquidation(self.stopLoss))


    def ETFConstituentsFilter(self, constituents: List[ETFConstituentData]) -> List[Symbol]:
        # Get the UniverseSize securities with the largest weight in the index
        selected = sorted([c for c in constituents if c.Weight],
            key=lambda c: c.Weight, reverse=True)[:self.UniverseSize]

        if not selected:
            self.Log("No ETF constituents with weight. Returning Universe.Unchanged")
            return Universe.Unchanged
        else:
            self.Log("ETF symbols selected: "+str([c.Symbol.Value for c in selected]))
            return [c.Symbol for c in selected]

    def OnEndOfDay(self, symbol):
        # plot the benchmark on the main equities chart for comparison
        if not self.IsWarmingUp: 
            mkt_price = self.History(self.MKT, 2, Resolution.Daily)['close'].unstack(level= 0).iloc[-1]
            self.mkt.append(mkt_price)
            mkt_perf = self.InitCash * self.mkt[-1] / self.mkt[0] 
            self.Plot('Strategy Equity', self.MKT, mkt_perf)

class WeightedClassifierAlpha(AlphaModel):

    def __init__(self, algo, features, lookback, metamodel_rolling_window, meta_labelling, years):
        self.algo = algo
        self.models = {}
        self.features = features
        self.rebalanceTime = datetime.min
        self.symbols = []
        self.lookback = lookback
        self.metamodel_rolling_window = metamodel_rolling_window
        self.years = years
        self.kelly_size = {}
        self.avg_win = {}
        self.avg_loss = {}

        self.meta_labelling = meta_labelling
        self.primary_history = {}
        self.metamodel = {}
        self.triple_barrier_events = {}

    def Update(self, algo, data):

        insights = []

        if not self.symbols:
            return insights

        try:

            # This can be adjusted to only allow trading for symbols with a certain Kelly size (as determined during the last training of the model)
            min_kelly = 0

            # Load History and Features          
            end = self.algo.Time
            start = end - timedelta(days=self.lookback)

            X = self.GetXforPrimaryModel(self.symbols, start, end)
            time = X.index[-1][0]
            X = X.loc[pd.IndexSlice[time, :], :]

            if not self.algo.IsWarmingUp:

                # Predict each symbols direction
                for symbol in self.symbols:
                    symbol = SymbolCache.GetSymbol(symbol)

                    if symbol in self.kelly_size:
                        kelly_size = self.kelly_size[symbol]
                        if math.isnan(kelly_size):
                            kelly_size = 0
                            self.algo.Log("NaN Kelly Size for "+str(symbol.Value))
                    else: 
                        kelly_size = 0
                        if symbol.Value not in self.algo.exclusions:
                            self.algo.Log("No Kelly Size for "+str(symbol.Value))

                    if kelly_size >= min_kelly:

                        X_symbol = X.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]

                        if self.models and data.ContainsKey(symbol) and data[symbol]:
                            if symbol in self.models:

                                model = self.models[symbol]
                                result = model.Predict(X_symbol)

                                direction = InsightDirection.Flat

                                metapred = 1
                                metapred_prob = 1
                                size = self.kelly_size[symbol] * self.algo.kelly_multiplier

                                if self.meta_labelling:
                                    if symbol in self.primary_history.keys() and symbol in self.metamodel:
                                        self.UpdateRollingHistoryWindow(symbol, data[symbol].EndTime, data[symbol].Close, result)
                                        X_meta, y_meta, li, tbi = self.GetXyforMetaModel(self.primary_history[symbol], symbol,0, False)

                                        metapred = self.metamodel[symbol].predict(X_meta.iloc[-1:])[0]
                                        # predict_proba returns [P(0), P(1)] per row; take P(1) of the latest row
                                        prob = self.metamodel[symbol].predict_proba(X_meta.iloc[-1:])
                                        metapred_prob = prob[-1][1]

                                        if self.avg_win[symbol] == 0 or self.avg_loss[symbol] == 0:
                                            size = 0
                                        else:
                                            size = metapred_prob - ((1-metapred_prob)/(self.avg_win[symbol]/self.avg_loss[symbol]))

                                        # Alternatively: use MLFinLab's bet_size_probability instead of Kelly to determine size:
                                        # prob = self.metamodel[symbol].predict_proba(X_meta.fillna(0))
                                        # zlst = list(zip(*prob))
                                        # size = self.algo.bet_sizing.bet_size_probability(self.triple_barrier_events[symbol], pd.Series(zlst[1]), 2).iloc[-1]

                                if math.isnan(size):
                                    size = 0

                                # Use result as the measure of confidence and the (Kelly) size as weight.
                                # (Confidence has no effect for InsightWeightingPortfolioConstructionModel; kept in case it is useful for any other PCM.)
                                if result > 0 and metapred and size > 0:
                                    insights.append(
                                        Insight.Price(symbol, timedelta(days=2), InsightDirection.Up, result, None, None, size))

                                elif result < 0:
                                    size = 0
                                    insights.append(
                                        Insight.Price(symbol, timedelta(days=1), InsightDirection.Flat, -result, None, None, size))

            return insights

        except Exception as e:
            self.algo.Log('Unexpected error during insights generation:' + str(e))
            raise

    def OnSecuritiesChanged(self, algorithm: QCAlgorithm, changes: SecurityChanges) -> None:

        pred_hist = pd.DataFrame()

        for security in changes.RemovedSecurities:
            if security.Symbol in self.symbols:
                self.symbols.remove(security.Symbol)

            self.metamodel.pop(security.Symbol, None)
            self.triple_barrier_events.pop(security.Symbol, None)
            if self.meta_labelling:
                self.primary_history.pop(security.Symbol, None)

            if security.Invested:
                self.algo.Liquidate(security.Symbol, "Removed from Universe")
        
        for security in changes.AddedSecurities:
            if security.Symbol.Value not in self.algo.exclusions:
                self.symbols.extend([security.Symbol])

        if self.algo.Time > self.rebalanceTime:
            # it's time to re-train all models (and not just the newly added ones)
            symbols = self.symbols
            self.rebalanceTime = self.algo.rebalancefunc(self.algo.Time)
            self.algo.Debug(str(self.algo.Time)+" Rebalancing. Training all "+str([symbol.Value for symbol in symbols]))

        else:
            symbols = [security.Symbol for security in changes.AddedSecurities]
            self.algo.Debug(str(self.algo.Time)+" Not yet time for Rebalancing. Training new symbols "+str([symbol.Value for symbol in symbols]))

        if symbols:
            for symbol in symbols:
                self.models[symbol] = self.TrainModel(symbol, self.years)

            # initial history of newly added securities for meta labelling
            if self.meta_labelling:
                for symbol in symbols:
                    if symbol.Value not in self.algo.exclusions:

                        try:
                            end = self.algo.Time - timedelta(days=1)
                            start = end - timedelta(days=self.lookback * 2)

                            X = self.GetXforPrimaryModel([symbol], start, end)

                            historystart = self.algo.Time - timedelta(days=self.lookback)
                            nearest = X.reset_index().time.searchsorted(historystart)
                            historystart = X.index[nearest-1][0]

                            pred_hist= self.algo.History(symbol, historystart, end, self.algo.resolution).droplevel(0, axis=0)
                            pred_hist.index.names = ['date_time']
                            pred_hist = pred_hist[['close']]
                            pred_hist['result']=0.0

                            iter_date = self.algo.Time - timedelta(days=self.lookback)

                            self.primary_history[symbol] = pd.DataFrame(columns=['time','close','result']).set_index('time')

                            while iter_date <= end:
                                nearest = X.reset_index().time.searchsorted(iter_date)
                                if 0 <= nearest < len(X.index):
                                    time = X.index[nearest][0]

                                    X_slice = X.loc[pd.IndexSlice[time, :], :]

                                    X_symbol = X_slice.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]

                                    model = self.models[symbol]
                                    result = model.Predict(X_symbol)
                                    pred_hist.loc[time, 'result'] = result
                                    if self.meta_labelling:
                                        self.UpdateRollingHistoryWindow(symbol, time, pred_hist.loc[time]['close'], result)
                                iter_date += timedelta(days=1)

                            self.metamodel[symbol], self.triple_barrier_events[symbol] = self.TrainMetaModel(pred_hist, symbol)
                            self.algo.Debug(str(self.algo.Time)+" Meta Model for symbol "+str(symbol.Value))
                        except Exception as e:
                            self.algo.Error(str(self.algo.Time)+" No Meta Model for symbol "+str(symbol.Value)+": "+str(e))

    def GetHistoryAndFeatures(self, symbols, start, end):

        history = self.algo.History(symbols, start, end, self.algo.resolution)
  
        gb = history.groupby('symbol', group_keys=False)
        history['returns_1w'] = -gb.close.pct_change(-5)
        history['returns_1d'] = -gb.close.pct_change(-1)
        history['returns_1m'] = -gb.close.pct_change(-21)

        gb = history.groupby('symbol', group_keys=False)
        history = gb.apply(lambda x: add_features(x, self.features))

        history = history.reset_index().set_index('time').sort_index()
        history.index = history.index.normalize()
        history = history.set_index('symbol', append=True)

        return history

    def GetXforPrimaryModel(self, symbols, start, end):
        history = self.GetHistoryAndFeatures(symbols, start, end)

        # Drop our Y columns before training
        X = history.drop(columns=['returns_1d', 'returns_1w', 'returns_1m', 'close','high','low','open','volume'])
        return X.dropna()

    def GetXyforMetaModel(self, df, symbol, shift, getlabels = True):
        
        labels = None
        y = None
        labels_index = None
        triple_barrier_events = None
        
        df['side'] = np.nan

        long_signals = df['result'] >= 0
        short_signals = df['result'] < 0
        df.loc[long_signals, 'side'] = 1
        df.loc[short_signals, 'side'] = -1

        # Remove look-ahead bias by lagging the signal
        df['side'] = df['side'].shift(shift)

        raw_data = df.copy()

        # Labels are only needed when training. The same method builds the X used to predict the primary model's performance, so getlabels=False can be passed there to save time.
        if getlabels:
            # Drop the NaN values from our data set
            df.dropna(axis=0, how='any', inplace=True)

            # Compute daily volatility
            try:
                if len(df['close']) < self.metamodel_rolling_window:
                    metamodel_rolling_window = len(df['close'])
                else:
                    metamodel_rolling_window = self.metamodel_rolling_window
                daily_vol = self.algo.ml.util.get_daily_vol(close=df['close'], lookback=metamodel_rolling_window)
            except Exception as e:
                # daily_vol is required below; re-raise so the caller's handler deals with it
                self.algo.Error(str(self.algo.Time)+" daily_vol ERROR: "+str(e))
                raise

            # Apply Symmetric CUSUM Filter and get timestamps for events
            # Note: Only the CUSUM filter needs a point estimate for volatility
            cusum_events = self.algo.ml.filters.cusum_filter(df['close'], threshold=daily_vol.mean()*0.5)

            # Compute vertical barrier
            vertical_barriers = self.algo.ml.labeling.add_vertical_barrier(t_events=cusum_events, close=df['close'], num_days=1)

            # pt_sl = [1, 2]
            pt_sl = [0, 0]
            # min_ret = 0.005
            min_ret = 0.0
            try:
                triple_barrier_events = self.algo.ml.labeling.get_events(close=df['close'],
                                                            t_events=cusum_events,
                                                            pt_sl=pt_sl,
                                                            target=daily_vol,
                                                            min_ret=min_ret,
                                                            num_threads=3,
                                                            vertical_barrier_times=vertical_barriers,
                                                            # vertical_barrier_times=False,
                                                            side_prediction=df['side'])
            except Exception as e:
                # triple_barrier_events is required below; re-raise so the caller's handler deals with it
                self.algo.Error(str(self.algo.Time)+" events ERROR: "+str(e))
                raise

            labels = self.algo.ml.labeling.get_bins(triple_barrier_events, df['close'])

        # Log Returns
        raw_data['log_ret'] = np.log(raw_data['close']).diff()

        # Momentum
        raw_data['mom1'] = raw_data['close'].pct_change(periods=1)
        raw_data['mom2'] = raw_data['close'].pct_change(periods=2)
        raw_data['mom3'] = raw_data['close'].pct_change(periods=3)
        raw_data['mom4'] = raw_data['close'].pct_change(periods=4)
        raw_data['mom5'] = raw_data['close'].pct_change(periods=5)

        # Volatility
        raw_data['volatility_50'] = raw_data['log_ret'].rolling(window=50, min_periods=50, center=False).std()
        raw_data['volatility_31'] = raw_data['log_ret'].rolling(window=31, min_periods=31, center=False).std()
        raw_data['volatility_15'] = raw_data['log_ret'].rolling(window=15, min_periods=15, center=False).std()

        # Serial correlation (takes about 4 minutes)
        window_autocorr = 50

        raw_data['autocorr_1'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=1), raw=False)
        raw_data['autocorr_2'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=2), raw=False)
        raw_data['autocorr_3'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=3), raw=False)
        raw_data['autocorr_4'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=4), raw=False)
        raw_data['autocorr_5'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=5), raw=False)

        # Get the various log -t returns
        raw_data['log_t1'] = raw_data['log_ret'].shift(1)
        raw_data['log_t2'] = raw_data['log_ret'].shift(2)
        raw_data['log_t3'] = raw_data['log_ret'].shift(3)
        raw_data['log_t4'] = raw_data['log_ret'].shift(4)
        raw_data['log_t5'] = raw_data['log_ret'].shift(5)

        # Recompute sides
        raw_data['side'] = np.nan

        long_signals = raw_data['result'] >= 0
        short_signals = raw_data['result'] < 0

        raw_data.loc[long_signals, 'side'] = 1
        raw_data.loc[short_signals, 'side'] = -1

        # Remove look ahead bias
        raw_data = raw_data.shift(shift)

        # Get features at event dates
        X = raw_data

        # Drop unwanted columns
        X.drop(['close'], axis=1, inplace=True)

        if labels is not None:
            y = labels['bin']
            labels_index = labels.index

        return X, y, labels_index, triple_barrier_events

    def UpdateRollingHistoryWindow(self, symbol, time, close, result):

        new_row = pd.DataFrame([{'close': close, 'result': result}], index=[time])
        self.primary_history[symbol] = pd.concat([self.primary_history[symbol], new_row], ignore_index=False).drop_duplicates(keep='last')
        self.primary_history[symbol] = self.primary_history[symbol].iloc[-(self.metamodel_rolling_window+2):]

    def TrainModel(self, symbol, years):

        model = None

        try:
            # Load History and Features
            total_days = 365 * years
            start = self.algo.Time - timedelta(days=total_days)
            end = self.algo.Time

            history = self.GetHistoryAndFeatures(symbol, start, end)

            # Set our Y to predict if symbols will be positive in 1 week
            history = history.dropna()
            y = history['returns_1w'] > 0
            returns = history['returns_1w']
            closes = history['close']

            # Drop Y and close
            X = history.drop(columns=['returns_1d', 'returns_1w', 'returns_1m', 'close','high','low','open','volume'])

            # build model for the symbol
            if symbol.Value not in self.algo.exclusions:
                symbol = SymbolCache.GetSymbol(symbol)

                # idx = pd.IndexSlice
                X_symbol = X.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]
                y_symbol = y.loc[X_symbol.index]
                # We also pass a history of returns, which the model uses to calculate Kelly sizes during training
                returns_symbol = returns.loc[X_symbol.index]

                model = EnsembleModel(self.algo)
                model.Train(X_symbol, y_symbol, returns_symbol)

                if model.models:
                    # Each time the model is trained, take the mean Kelly size across all num_iter models for this symbol; that becomes the weight used when creating insights
                    cleanlist = [i.kelly_size for i in model.models if not math.isnan(i.kelly_size)]
                    self.kelly_size[symbol] = statistics.fmean(cleanlist) if cleanlist else 0
                    # Also grab the symbol's mean average win and loss, which can be used for the MLFinLab bet sizing
                    cleanlist = [i.avg_win for i in model.models if not math.isnan(i.avg_win)]
                    self.avg_win[symbol] = statistics.fmean(cleanlist) if cleanlist else 0 
                    cleanlist = [i.avg_loss for i in model.models if not math.isnan(i.avg_loss)]
                    self.avg_loss[symbol] = statistics.fmean(cleanlist) if cleanlist else 0 
                else:
                    self.kelly_size[symbol] = 0
                    self.avg_win[symbol] = 0
                    self.avg_loss[symbol] = 0
                    self.algo.Error("no models available ")

        except Exception as e:
            self.algo.Log('Unexpected error trying to build models:' + str(e))
            raise
        
        return model

    def TrainMetaModel(self, df, symbol):

        X, y, labels_index, triple_barrier_events = self.GetXyforMetaModel(df, symbol,1)
        X = X.loc[labels_index, :]
       
        X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.15, shuffle=False)

        train_df = pd.concat([y_train, X_train], axis=1, join='inner').dropna()

        # Upsample the training data to have a 50 - 50 split
        # https://elitedatascience.com/imbalanced-classes
        if len(train_df[train_df['bin'] == 0]) > len(train_df[train_df['bin'] == 1]):
            majority = train_df[train_df['bin'] == 0]
            minority = train_df[train_df['bin'] == 1]
        else:
            majority = train_df[train_df['bin'] == 1]
            minority = train_df[train_df['bin'] == 0]

        if not minority.empty:
            new_minority = resample(minority, 
                            replace=True,  # Sample with replacement
                            n_samples=majority.shape[0],  # To match majority class
                            random_state=42)
        else:
            self.algo.Debug(str(self.algo.Time)+" MINORITY EMPTY Majority: "+ str(len(majority)))
            new_minority = minority

        train_df = pd.concat([majority, new_minority])
        train_df = shuffle(train_df, random_state=42)

        # Create training data
        y_train = train_df['bin']
        X_train= train_df.loc[:, train_df.columns != 'bin']

        n_estimator, depth = 256, 7
        c_random_state = 42

        rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                        criterion='entropy', random_state=c_random_state)

        rf.fit(X_train, y_train.values.ravel())

        return rf, triple_barrier_events

#region imports
from AlgorithmImports import *
#endregion
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

class ClassifierModel():

    def __init__(self, algo):
        self.algo = algo
        self.model = None
        self.params = None
        self.test_score = None
        self.test_dist = None
        self.threshold = None

    # Train the model
    def Train(self, X, y, params):
        self.params = params
        self.params['class_weight'] = self.GetClassWeight(y)
        self.model = ExtraTreesClassifier(**self.params)
        self.model.fit(X, y)
        self.test_dist = self.model.predict_proba(X)[:, 1]
        

    # Return the prediction as a +/- Z-Score of the prediction from the threshold
    def Predict(self, X):
        probs = pd.Series(self.model.predict_proba(X)[:, 1])
        d = self.test_dist
        # Distance of the predicted probability from the threshold, measured in
        # standard deviations of the training probability distribution
        return (probs[0] - self.threshold) / d.std()
        
    def GetClassWeight(self, y_train):
        pos = y_train.sum()/(len(y_train))
        neg = 1 - pos
        pw = neg/pos
        nw = pos/neg

        return {0:nw,1:pw}
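
# --- Worked example (illustrative only) ---
# With 40% positive labels: pos = 0.4, neg = 0.6, so class 1 is weighted
# neg/pos = 1.5 and class 0 is weighted pos/neg ~= 0.667, nudging the
# classifier towards the rarer class.
if __name__ == '__main__':
    y = pd.Series([1, 1, 0, 0, 0] * 20)  # 40% positives
    print(ClassifierModel(None).GetClassWeight(y))  # {0: 0.666..., 1: 1.5}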
#region imports
from AlgorithmImports import *
#endregion
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats.mstats import gmean
import numpy as np
import pandas as pd
import math
from sklearn.metrics import auc,roc_curve,precision_recall_curve, roc_auc_score, accuracy_score
from model import ClassifierModel


class ModelOptimizer():

    def __init__(self, algo):
        self.algo = algo
        self.validation_score = 0
        self.test_score = 0
        self.test_dist = None
        self.threshold = None

    def BuildModels(self, X, y, returns):
        
        # Need to continue to play with min samples split and estimators and maybe class weights
        params = {'random_state':0,'n_estimators': self.algo.n_estimators, 'min_samples_split': self.algo.min_samples_split, 
                  'class_weight' : {0:1,1:1},
                  'max_features': 'auto', 'max_depth': 3, 'criterion': 'gini', 'bootstrap': True}
        
        # Train and validate using walk forward validation
        max_score = -1
        max_y = None
        max_probs = None
        num_iter = self.algo.num_iter
        num_test_records = self.algo.num_test_records
        num_records = len(X)
        num_train_records = num_records - num_test_records
        batch_size = 4
        models = []
        
        results = []
        
        # Iterate over the dataset and create num_iter models.

        # Trend filter is a constant mask here: treat every test record as tradable
        trendtrades = [True] * (num_test_records-1)

        for i in range(0, num_iter):
            
            probs = []
            y_test_all = []
            returns_test_all = []
            # closes_test_all = []
            end_idx = num_records  
            
            # This is important. Copy params so each model keeps a distinct copy with its own random seed
            model_params = params.copy()
            model_params['random_state']= i

            # Loop through and do walk forward validation
            j = num_train_records
            while j < end_idx:
                
                # Add the next segment of data
                idx = j + batch_size
                if idx >= end_idx:
                    idx = end_idx-1
                X_train, X_test = X[0:j], X[j:idx]
                y_train, y_test = y[0:j], y[j:idx]
                returns_train, returns_test = returns[0:j], returns[j:idx]
                j = j + batch_size
        
                # Set the class weight based on the positive/negative balance
                model_params['class_weight'] = self.get_class_weights(y_train)
        
                # Train the model
                clf = ExtraTreesClassifier(**model_params)
                clf.fit(X_train, y_train)
                
                # Predict using the model
                p = pd.Series(clf.predict_proba(X_test)[:, 1])
                
                # Add the predictions
                probs.extend(p)
                y_test_all.extend(y_test)
                returns_test_all.extend(returns_test)
            
            # Build model on all the data
            model = ClassifierModel(self.algo)
            model.Train(X, y, model_params)
            
            # Since each model's predicted probabilities tend to skew, calculate a per-model threshold for what counts as a positive result
            model.threshold = self.get_threshold(pd.Series(y), model.test_dist)
            
            # Give the model a score for use in weighting models
            y_pred = np.array(probs) > model.threshold
            score = self.score_results(y_test_all, probs, model.threshold)
            model.test_score = score

            # Add win/loss stats as properties to the model, so they can be used in the Alpha Model for bet sizing
            model.test_y = y_test_all
            model.test_returns = returns_test_all
            model.pred_y = y_pred
            model.kelly_size, model.avg_win, model.avg_loss  = self.kelly_size(y_test_all, y_pred, returns_test_all, trendtrades)
            
            models.append(model)

        return models
        
    # Calculate the model threshold for predicting positives
    def get_threshold(self, y_true, probs):
        
        # calculate percent positive
        pos = y_true.sum()/(len(y_true))

        # use that percent as the threshold cutoff, clamped to a valid index
        idx = min(int(round(len(probs)*pos)), len(probs)-1)
        return sorted(probs, reverse=True)[idx]
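
    # Worked example (illustrative only): if 40% of the training labels are
    # positive (pos = 0.4) and the probabilities sorted descending are
    # [0.9, 0.8, 0.6, 0.4, 0.2], the cutoff index is int(round(5*0.4)) = 2,
    # so the threshold is 0.6: roughly the top 40% of probabilities count as positives.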
        
    def weighted_score(self, y_true, probs):
        # There are two ways we want to weight a prediction:
        # 1. More recent predictions are weighted more heavily
        # 2. Larger prediction values (positive or negative) are weighted more heavily
        # Recency weighting is applied here by scoring six chronological splits and
        # scaling each split's precision-recall AUC by (1 + i/10) before taking the geometric mean
        
        y_test_all = np.asarray(y_true)
        probs_all = pd.Series(probs)
        splits = 6
        length = math.floor(len(probs_all)/splits)
        start = 0
        end = length-1
        scores = []
        for i in range(0,splits):
    
            y_test = y_test_all[start:end]
            if y_test.sum() != 0:
                probs = probs_all[start:end]
                precision, recall, threshold =  precision_recall_curve(y_test, probs)
                pauc = auc(recall, precision)
                if math.isnan(pauc):
                    pauc = 0
                    
                scores.append(pauc*(1+i/10))
    
            end = end + length
            start = start + length
            if (end >= len(probs_all)):
                end = len(probs_all)-1
            
        return gmean(scores)
            
    def kelly_size(self, y_true, y_pred, returns, trendtrades):
        # calculate the symbol's kelly size based on the predictions of the model
        df = pd.DataFrame({'y_true':y_true, 'y_pred':y_pred, 'returns': returns, 'trends': trendtrades})
        
        trades = df[(df['y_pred']) & (df['trends'])]
        wins = df[(df['y_true']) & (df['y_pred']) & (df['trends'])]
        losses = df[(df['y_true'] == 0) & (df['y_pred']) & (df['trends'])]

        # Guard against no qualifying trades to avoid division by zero
        if len(trades) == 0:
            return 0, 0, 0

        win_rate = len(wins)/len(trades)
        loss_rate = 1-win_rate

        avg_win = wins['returns'].mean()
        avg_loss = -losses['returns'].mean()

        # Two ways of calculating the Kelly size were tried; (win_rate/avg_loss - loss_rate/avg_win)
        # is not the textbook Kelly formula, but it performed better.
        # Return the Kelly size plus the average win and loss separately, so they can be used for the MLFinLab bet sizing.
        return (win_rate/avg_loss - loss_rate/avg_win), avg_win, avg_loss
        # Textbook alternative: return win_rate - (loss_rate/(avg_win/avg_loss))
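
        # Worked example (illustrative only): with win_rate = 0.6, avg_win = 0.02
        # and avg_loss = 0.01, the variant above gives 0.6/0.01 - 0.4/0.02 = 40.0,
        # while the textbook form gives 0.6 - 0.4/(0.02/0.01) = 0.4.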
    
        
    def score_results(self, y_true, probs, threshold):

        y_pred = np.asarray(probs) > threshold
        return accuracy_score(y_true, y_pred)

        
    def get_class_weights(self, y):
        # Calculate class weights
        pos = y.sum()/(len(y))
        neg = 1 - pos
        pw = neg/pos
        nw = pos/neg
        return {0:nw,1:pw}
#region imports
from AlgorithmImports import *
#endregion
from QuantConnect.Algorithm.Framework.Risk import RiskManagementModel

# bracket risk model class
class BracketRiskModel(RiskManagementModel):
    '''Creates a trailing stop loss for the maximumDrawdownPercent value and a profit taker for the maximumUnrealizedProfitPercent value'''
    def __init__(self, maximumDrawdownPercent = 0.05, maximumUnrealizedProfitPercent = 0.05):
        self.maximumDrawdownPercent = -abs(maximumDrawdownPercent)
        self.trailingHighs = dict()
        self.maximumUnrealizedProfitPercent = abs(maximumUnrealizedProfitPercent)
        self.liquidated = set()

    def ManageRisk(self, algorithm, targets):
        riskAdjustedTargets = list()
        for kvp in algorithm.Securities:
            symbol = kvp.Key
            security = kvp.Value

            # Remove if not invested
            if not security.Invested:
                self.trailingHighs.pop(symbol, None)
                continue
            pnl = security.Holdings.UnrealizedProfitPercent
            
            if pnl > self.maximumUnrealizedProfitPercent or security.Symbol in self.liquidated:
                # liquidate
                algorithm.Debug(f"Profit Taken: {security.Symbol}")
                algorithm.Log(f"Profit Taken: {security.Symbol}")
                riskAdjustedTargets.append(PortfolioTarget(security.Symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")
                return riskAdjustedTargets
                
            # Add newly invested securities
            if symbol not in self.trailingHighs:
                self.trailingHighs[symbol] = security.Holdings.AveragePrice   # Set to average holding cost
                continue

            # Check for new highs and update - set to tradebar high
            if self.trailingHighs[symbol] < security.High:
                self.trailingHighs[symbol] = security.High
                continue

            # Check for securities past the drawdown limit
            securityHigh = self.trailingHighs[symbol]
            drawdown = (security.Low / securityHigh) - 1

            if drawdown < self.maximumDrawdownPercent or security.Symbol in self.liquidated:
                # liquidate
                algorithm.Debug(f"Losses Taken: {security.Symbol}")
                algorithm.Log(f"Losses Taken: {security.Symbol}")
                riskAdjustedTargets.append(PortfolioTarget(symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")
                
        return riskAdjustedTargets


class TrailingStopRiskManagementModelLiquidation(RiskManagementModel):
    '''Provides an implementation of IRiskManagementModel that limits the maximum possible loss
    measured from the highest unrealized profit'''
    def __init__(self, maximumDrawdownPercent = 0.05):
        '''Initializes a new instance of the TrailingStopRiskManagementModel class
        Args:
            maximumDrawdownPercent: The maximum percentage drawdown allowed for algorithm portfolio compared with the highest unrealized profit, defaults to 5% drawdown'''
        self.maximumDrawdownPercent = abs(maximumDrawdownPercent)
        self.trailingAbsoluteHoldingsState = dict()
        self.liquidated = set()

    def ManageRisk(self, algorithm, targets):
        '''Manages the algorithm's risk at each time step
        Args:
            algorithm: The algorithm instance
            targets: The current portfolio targets to be assessed for risk'''
        riskAdjustedTargets = list()

        for kvp in algorithm.Securities:
            symbol = kvp.Key
            security = kvp.Value

            # Remove if not invested
            if not security.Invested:
                self.trailingAbsoluteHoldingsState.pop(symbol, None)
                continue

            position = PositionSide.Long if security.Holdings.IsLong else PositionSide.Short
            absoluteHoldingsValue = security.Holdings.AbsoluteHoldingsValue
            trailingAbsoluteHoldingsState = self.trailingAbsoluteHoldingsState.get(symbol)

            # Add newly invested security (if doesn't exist) or reset holdings state (if position changed)
            if trailingAbsoluteHoldingsState is None or position != trailingAbsoluteHoldingsState.position:
                self.trailingAbsoluteHoldingsState[symbol] = trailingAbsoluteHoldingsState = self.HoldingsState(position, security.Holdings.AbsoluteHoldingsCost)

            trailingAbsoluteHoldingsValue = trailingAbsoluteHoldingsState.absoluteHoldingsValue

            # Check for new max (for long position) or min (for short position) absolute holdings value
            if ((position == PositionSide.Long and trailingAbsoluteHoldingsValue < absoluteHoldingsValue) or
                (position == PositionSide.Short and trailingAbsoluteHoldingsValue > absoluteHoldingsValue)):
                self.trailingAbsoluteHoldingsState[symbol].absoluteHoldingsValue = absoluteHoldingsValue
                continue

            drawdown = abs((trailingAbsoluteHoldingsValue - absoluteHoldingsValue) / trailingAbsoluteHoldingsValue)

            if self.maximumDrawdownPercent < drawdown or security.Symbol in self.liquidated:
                self.trailingAbsoluteHoldingsState.pop(symbol, None)
                # liquidate
                riskAdjustedTargets.append(PortfolioTarget(symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")

        return riskAdjustedTargets

    class HoldingsState:
        def __init__(self, position, absoluteHoldingsValue):
            self.position = position
            self.absoluteHoldingsValue = absoluteHoldingsValue