Overall Statistics

| Total Trades                | 4187               |
| Average Win                 | 0.35%              |
| Average Loss                | -0.21%             |
| Compounding Annual Return   | 23.865%            |
| Drawdown                    | 18.500%            |
| Expectancy                  | 0.496              |
| Net Profit                  | 751.154%           |
| Sharpe Ratio                | 1.277              |
| Probabilistic Sharpe Ratio  | 75.409%            |
| Loss Rate                   | 43%                |
| Win Rate                    | 57%                |
| Profit-Loss Ratio           | 1.64               |
| Alpha                       | 0.096              |
| Beta                        | 0.744              |
| Annual Standard Deviation   | 0.133              |
| Annual Variance             | 0.018              |
| Information Ratio           | 0.693              |
| Tracking Error              | 0.102              |
| Treynor Ratio               | 0.228              |
| Total Fees                  | $20151.84          |
| Estimated Strategy Capacity | $150000000.00      |
| Lowest Capacity Asset       | GOOCV VP83T1ZUHROL |
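As a quick sanity check, the reported Expectancy is consistent with the reported Win Rate, Loss Rate, and Profit-Loss Ratio, assuming LEAN's usual definition (expectancy = win rate × profit-loss ratio − loss rate); the small gap comes from rounding in the displayed rates:

# Sanity check (assumption: expectancy = win_rate * profit_loss_ratio - loss_rate;
# inputs are the rounded values shown in the table, hence the small gap vs. 0.496)
win_rate, loss_rate, pl_ratio = 0.57, 0.43, 1.64
print(win_rate * pl_ratio - loss_rate)  # ~0.505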
# ensemble.py
#region imports
from AlgorithmImports import *
#endregion
from optimize import *


class EnsembleModel():

    def __init__(self, algo):
        self.algo = algo
        self.models = None
        self.optimizer = None

    # Build a set of models
    def Train(self, X, y, returns):
        self.optimizer = ModelOptimizer(self.algo)
        try:
            self.models = self.optimizer.BuildModels(X, y, returns)
        except:
            # Swallow training errors; with no models, Predict simply returns 0
            pass

    # Return the prediction as a +/- z-score of the prediction from the threshold
    def Predict(self, X):
        z = 0
        if self.models:
            total_weight = sum(model.test_score for model in self.models)
            for model in self.models:
                try:
                    result = model.Predict(X)
                    # Weight the result by the model's test score
                    z = z + result * (model.test_score / total_weight)
                except:
                    pass
        return z
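A minimal sketch of the score-weighted averaging in Predict, using a hypothetical MockModel in place of the ClassifierModel instances that BuildModels normally supplies (only the test_score attribute and Predict method are exercised here):

# MockModel is a hypothetical stand-in for ClassifierModel, just for illustration
class MockModel:
    def __init__(self, test_score, z):
        self.test_score = test_score
        self._z = z
    def Predict(self, X):
        return self._z

ens = EnsembleModel(algo=None)
ens.models = [MockModel(0.6, 1.2), MockModel(0.4, -0.5)]
# Weights are test_score / total_weight: 0.6 and 0.4
print(ens.Predict(X=None))  # 0.6*1.2 + 0.4*(-0.5) ~= 0.52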
# feature.py
#region imports
from AlgorithmImports import *
#endregion
import talib
from talib import func
import numpy as np
import pandas as pd

#
# This helper module lets us easily add features to our dataset:
#   add_features(df, ['EMA_7'])
#
# Other examples:
#   EMA_7_28 gives you the ratio of these two EMAs
#   EMA_7_28_diff gives you the difference of the current ratio and the last ratio for those two EMAs
#

def ichimoku_a(high, low, n1=9, n2=26, n3=52):
    conv = 0.5 * (high.rolling(n1, min_periods=0).max() + low.rolling(n1, min_periods=0).min())
    base = 0.5 * (high.rolling(n2, min_periods=0).max() + low.rolling(n2, min_periods=0).min())
    spana = 0.5 * (conv + base)
    return pd.Series(spana)

def ichimoku_b(high, low, n1=9, n2=26, n3=52):
    spanb = 0.5 * (high.rolling(n3, min_periods=0).max() + low.rolling(n3, min_periods=0).min())
    return pd.Series(spanb)

def obv_rolling_percent(close, volume, n, fillna=False):
    df = pd.DataFrame([close, volume]).transpose()
    df['OBV'] = 0
    c1 = close < close.shift(1)
    c2 = close > close.shift(1)
    if c1.any():
        df.loc[c1, 'OBV'] = -volume
    if c2.any():
        df.loc[c2, 'OBV'] = volume
    obv = df['OBV'].rolling(n).sum() / df[volume.name].rolling(n).sum()
    if fillna:
        obv = obv.replace([np.inf, -np.inf], np.nan).fillna(0)
    return pd.Series(obv, name='obv')

def kst(close, r1=3, r2=7, r3=14, r4=28, n1=10, n2=3, n3=7, n4=14):
    rocma1 = ((close - close.shift(r1)) / close.shift(r1)).rolling(n1, min_periods=0).mean()
    rocma2 = ((close - close.shift(r2)) / close.shift(r2)).rolling(n2, min_periods=0).mean()
    rocma3 = ((close - close.shift(r3)) / close.shift(r3)).rolling(n3, min_periods=0).mean()
    rocma4 = ((close - close.shift(r4)) / close.shift(r4)).rolling(n4, min_periods=0).mean()
    return 100 * (rocma1 + 2 * rocma2 + 3 * rocma3 + 4 * rocma4)

def kst_sig(close, r1=3, r2=7, r3=14, r4=28, n1=10, n2=3, n3=7, n4=14, nsig=6):
    kst_val = kst(close, r1, r2, r3, r4, n1, n2, n3, n4)
    kstsig = kst_val.rolling(nsig, min_periods=0).mean()
    return kstsig

def check_feature_cache(cache, name, f, args):
    if name not in cache:
        cache[name] = f(*args)
    return cache[name]

def add_features(df, features, close="close", high="high", low="low", volume="volume"):
    fmap = {'MFI': 'money_flow_index', 'ICHA': 'ichimoku_a', 'ICHB': 'ichimoku_b',
            'BOLLH': 'bollinger_hband', 'BOLLL': 'bollinger_lband',
            'KCC': 'keltner_channel_central', 'NVI': 'negative_volume_index',
            'OBVP': 'obv_rolling_percent', 'KST': 'kst', 'KST_SIG': 'kst_sig'}
    cache = {}
    for feature in features:
        # Parse string. Style is func_period1_period2:COL=ColumnName1 (column name optional)
        col = None
        col_idx = feature.find(':')
        if col_idx > -1:
            col = feature[col_idx+1:]
            feature = feature[0:col_idx]
        p = feature.split('_')
        if col is not None:
            feature = feature + "_" + col
        fname = p[0].upper()
        # DM, DI, and HT functions have an underscore in their name, so add the
        # suffix back onto fname and shift the params down
        if len(p) > 1 and (p[1] in ['DM', 'DI'] or p[0] in ['HT']):
            fname += '_' + p[1]
            for i in range(2, len(p)):
                p[i-1] = p[i]
            del p[-1]
        p1 = p[1].upper() if len(p) > 1 else None
        p2 = p[2].upper() if len(p) > 2 else None
        p3 = p[3].upper() if len(p) > 3 else None
        if fname in fmap:
            if fmap[fname].find('.') > -1:
                s = fmap[fname].split('.')
                cls = s[0]
                f = getattr(globals()[cls], s[1])
            else:
                f = globals()[fmap[fname].lower()]
        elif fname in talib.__TA_FUNCTION_NAMES__:
            f = getattr(func, fname)
        elif fname.lower() in globals():
            f = globals()[fname.lower()]
        else:
            raise Exception(f'Could not find function. fname: {fname} feature: {feature}')
        if fname.endswith('MA') or fname == 'T3':
            args = [df[close], int(p1)]
            if p2 is None:
                df[feature] = check_feature_cache(cache, feature, f, args)
            elif p2 == 'DIFF':
                ma1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                df[feature] = ma1.pct_change()
                p2 = None  # So it doesn't get diffed at end of method
            else:
                ma1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                args = [df[close], int(p2)]
                ma2 = check_feature_cache(cache, fname + '_' + p2, f, args)
                df[feature] = (ma1 - ma2) / ma1
                df[feature].replace([np.inf, -np.inf], 0, inplace=True)
        elif fname in ['CMO', 'TRIX', 'ROCP', 'ROCR100', 'ROC', 'BOLLH', 'BOLLL', 'RSI', 'MOM']:
            df[feature] = f(df[close], int(p1))
        elif fname in ['MINUS_DM', 'PLUS_DM']:
            df[feature] = f(df[high], df[low], int(p1))
        elif fname in ['ADX', 'ATR', 'KCC', 'MINUS_DI', 'PLUS_DI', 'WILLR']:
            try:
                if df[close].isnull().all():
                    df[feature] = np.nan
                elif p2 is None or p2.upper() == 'DIFF':
                    df[feature] = f(df[high], df[low], df[close], int(p1))
                    # Change all to percent, except WILLR and ADX
                    if fname not in ['WILLR', 'ADX']:
                        df[feature] = df[feature] / df[close]
                else:
                    args = [df[high], df[low], df[close], int(p1)]
                    f1 = check_feature_cache(cache, fname + '_' + p1, f, args)
                    args = [df[high], df[low], df[close], int(p2)]
                    f2 = check_feature_cache(cache, fname + '_' + p2, f, args)
                    df[feature] = (f1 - f2) / f1
                    df[feature].replace([np.inf, -np.inf], 0, inplace=True)
            except Exception as ex:
                if str(ex) == 'inputs are all NaN':
                    df[feature] = np.nan
                else:
                    raise
        elif fname in ['MFI']:
            df[feature] = f(df[high], df[low], df[close], df[volume], int(p1))
        elif fname in ['MACD', 'PPO', 'TSI']:
            df[feature] = f(df[close], int(p1), int(p2))
        elif fname in ['ICHA', 'ICHB', 'AO']:
            # Normalize
            df[feature] = f(df[high], df[low], int(p1), int(p2))
            # if fname.startswith('ICH'):
            #     df[feature] = df[feature] / df[close]
        elif fname in ['NVI']:
            df[feature] = f(df[close], df[volume])
        elif fname in ['SAR', 'SAREXT']:
            # Normalize
            df[feature] = f(df[high], df[low])  # / df[close]
        elif fname == 'KST':
            df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15)
        elif fname == 'KST_SIG':
            df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15, nsig=9)
        elif fname == 'AROON':
            df[feature] = f(df[high], df[low], int(p1))[int(p2)-1]
        elif fname == 'ULTOSC':
            df[feature] = f(df[high], df[low], df[close])
        elif fname in ['OBVP']:
            df[feature] = f(df[close], df[volume], int(p1))
        elif fname in ['OBTP']:
            df[feature] = f(df[close], df["Trades"], int(p1))
        elif fname in ['HT_DCPERIOD', 'HT_DCPHASE', 'HT_PHASOR', 'HT_SINE', 'HT_TRENDMODE', 'STOCHRSI']:
            df[feature] = f(df[close])
        else:
            raise Exception('Feature not found: ' + feature + ' ' + fname)
        # Difference those that are 1 period diffs
        if p3 == 'DIFF' or p2 == 'DIFF':
            df[feature] = df[feature].diff()
    return df

def get_exhaustive_ma_features_list():
    features = []
    periods = [7, 14, 28, 42, 70]
    mas = ["SMA", "DEMA", "KAMA", "EMA", "TEMA", "T3", "TRIMA", "WMA"]
    for p in periods:
        for ma in mas:
            features.append(ma + "_" + str(p))
    for p1 in periods:
        for p2 in periods:
            if p2 <= p1:
                continue
            for ma in mas:
                features.append(ma + "_" + str(p1) + "_" + str(p2))
                features.append(features[-1] + "_diff")
    return features

def get_exhaustive_features_list(diffs=True):
    features = get_exhaustive_ma_features_list()
    periods = [3, 7, 14, 20, 30]
    hi_lo_periods = [[4, 9], [7, 14], [10, 21], [12, 26]]
    # Left out: MOM, VIN, VIP, ROC, ROCR, OBVM, EOM, DPI, RSI, STOCH, STOCH_SIG, FI
    pfs = ['CMO', 'MINUS_DI', 'MINUS_DM', 'PLUS_DI', 'PLUS_DM', 'ROCP', 'ATR', 'ADX', 'TRIX', 'WILLR']
    for p in periods:
        features.append('AROON_' + str(p) + '_1')
        features.append('AROON_' + str(p) + '_2')
        for pf in pfs:
            if pf == 'ADX' and p == 3:
                continue
            features.append(pf + '_' + str(p))
            if diffs:
                features.append(features[-1] + "_diff")
    if diffs:
        for p1 in periods:
            for p2 in periods:
                if p2 <= p1:
                    continue
                for pf in pfs:
                    features.append(pf + "_" + str(p1) + "_" + str(p2))
                    features.append(features[-1] + "_diff")
    hlfs = ['ICHA', 'ICHB']  # Exclude PPO, same as our MA setups
    # Hi/lo indicators
    for hl in hi_lo_periods:
        for hlf in hlfs:
            features.append(hlf + '_' + str(hl[0]) + '_' + str(hl[1]))
    # Other indicators
    otfs = ['SAR', 'SAREXT', 'HT_DCPERIOD', 'ULTOSC', 'OBVP_10', 'OBVP_20', 'OBVP_40', 'OBVP_80']
    for otf in otfs:
        features.append(otf)
    return features
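A minimal usage sketch of add_features on synthetic OHLCV data, assuming the talib build imported by feature.py accepts pandas Series (as it does on QuantConnect); the periods and feature names below are arbitrary illustrations:

# Local sanity check for add_features (assumes the talib package is installed)
import numpy as np
import pandas as pd

n = 100
rng = np.random.default_rng(1)
close = pd.Series(100 + rng.normal(0, 1, n).cumsum())
df = pd.DataFrame({
    'open': close.shift(1).fillna(close.iloc[0]),
    'high': close + 0.5,
    'low': close - 0.5,
    'close': close,
    'volume': pd.Series(rng.integers(1_000, 5_000, n)).astype(float),
})
# EMA_7: single EMA; EMA_7_28: (EMA7 - EMA28) / EMA7; ADX_14: raw ADX; OBVP_20: rolling OBV percent
df = add_features(df, ['EMA_7', 'EMA_7_28', 'ADX_14', 'OBVP_20'])
print(df[['EMA_7', 'EMA_7_28', 'ADX_14', 'OBVP_20']].tail())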
# main.py
from AlgorithmImports import *
from risk import BracketRiskModel, TrailingStopRiskManagementModelLiquidation
from feature import add_features
from ensemble import EnsembleModel
from datetime import timedelta
import pandas as pd
import numpy as np
import statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import resample, shuffle
import math


class TransdimensionalDynamicThrustAssembly(QCAlgorithm):

    def Initialize(self):
        self.SetStartDate(2010, 1, 1)  # Set Start Date
        self.SetEndDate(2020, 1, 1)    # Set End Date
        self.InitCash = 100000
        self.SetCash(self.InitCash)
        self.SetBrokerageModel(BrokerageName.InteractiveBrokersBrokerage)

        # Time frames and frequencies
        self.resolution = Resolution.Daily
        self.rebalancefunc = Expiry.EndOfQuarter
        self.lookback = 250
        self.metamodel_rolling_window = 50

        # ML parameters
        self.years = 4
        self.n_estimators = 10
        self.min_samples_split = 140
        self.num_iter = 10
        self.num_test_records = 60

        # UniverseSize determines the number of ETF constituents to select from each ETF.
        # The top x by weight are selected.
        self.UniverseSize = 9
        self.kelly_multiplier = 1
        self.SetWarmup(self.lookback)

        # If this flag is set to False, none of the meta-labeling code gets called,
        # saving a lot of expensive compute
        self.meta_labelling = False
        if self.meta_labelling:
            import ht_auth
            ht_auth.SetToken(self.GetParameter("mlfinlab-api-key"))
            import mlfinlab as ml
            self.ml = ml
            from mlfinlab.bet_sizing import bet_sizing
            self.bet_sizing = bet_sizing

        # Add SPY as the benchmark but don't trade it (that's what self.exclusions
        # is for; it's checked at various points)
        self.SetBenchmark('SPY')
        self.MKT = self.AddEquity('SPY', self.resolution).Symbol
        self.mkt = []
        self.exclusions = ['SPY']

        self.UniverseSettings.Resolution = self.resolution
        features = ['HT_DCPERIOD', 'HT_DCPHASE', 'MOM_5', 'ADX_10_diff', 'MOM_5_diff']
        self.etfs = ["SPY"]
        for etf in self.etfs:
            self.AddUniverse(self.Universe.ETF(etf, Market.USA, self.UniverseSettings, self.ETFConstituentsFilter))

        # Select the alpha model
        self.SetAlpha(WeightedClassifierAlpha(self, features, self.lookback,
                                              self.metamodel_rolling_window,
                                              self.meta_labelling, self.years))

        # Weigh securities in portfolio based on insight weights
        self.SetPortfolioConstruction(InsightWeightingPortfolioConstructionModel())

        # Set VWAP Execution Model (Immediate Execution worked well, but VWAP
        # execution seems to produce slightly better results)
        self.SetExecution(VolumeWeightedAveragePriceExecutionModel())

        # Set Risk Management Model - none of the risk models improved performance over
        # the alpha model's own flat insights, so for now NullRiskManagementModel seems best.
        # Bracket Risk Model (using SL and TP) and Trailing Stop (using SL only) are both
        # modified to prevent the Alpha Model from re-opening positions the risk model closed.
        # self.stopLoss = 0.02
        # self.takeProfit = 0.95
        self.SetRiskManagement(NullRiskManagementModel())
        # self.SetRiskManagement(BracketRiskModel(self.stopLoss, self.takeProfit))
        # self.SetRiskManagement(TrailingStopRiskManagementModelLiquidation(self.stopLoss))

    def ETFConstituentsFilter(self, constituents: List[ETFConstituentData]) -> List[Symbol]:
        # Get the UniverseSize securities with the largest weight in the index
        selected = sorted([c for c in constituents if c.Weight],
                          key=lambda c: c.Weight, reverse=True)[:self.UniverseSize]
        if not selected:
            self.Log("ETF symbols without weight. Returning Universe.Unchanged")
            return Universe.Unchanged
        else:
            self.Log("ETF symbols selected: " + str([c.Symbol.Value for c in selected]))
            return [c.Symbol for c in selected]

    def OnEndOfDay(self, symbol):
        # Plot the benchmark on the main equity chart for comparison
        if not self.IsWarmingUp:
            mkt_price = self.History(self.MKT, 2, Resolution.Daily)['close'].unstack(level=0).iloc[-1]
            self.mkt.append(mkt_price)
            mkt_perf = self.InitCash * self.mkt[-1] / self.mkt[0]
            self.Plot('Strategy Equity', self.MKT, mkt_perf)


class WeightedClassifierAlpha(AlphaModel):

    def __init__(self, algo, features, lookback, metamodel_rolling_window, meta_labelling, years):
        self.algo = algo
        self.models = {}
        self.features = features
        self.rebalanceTime = datetime.min
        self.symbols = []
        self.lookback = lookback
        self.metamodel_rolling_window = metamodel_rolling_window
        self.years = years
        self.kelly_size = {}
        self.avg_win = {}
        self.avg_loss = {}
        self.meta_labelling = meta_labelling
        self.primary_history = {}
        self.metamodel = {}
        self.triple_barrier_events = {}

    def Update(self, algo, data):
        insights = []
        if not self.symbols:
            return insights
        try:
            # This can be adjusted to only allow trading for symbols with a certain Kelly
            # size (as determined during the last training of the model)
            min_kelly = 0

            # Load history and features
            end = self.algo.Time
            start = end - timedelta(days=self.lookback)
            X = self.GetXforPrimaryModel(self.symbols, start, end)
            time = X.index[-1][0]
            X = X.loc[pd.IndexSlice[time, :], :]

            if not self.algo.IsWarmingUp:
                # Predict each symbol's direction
                for symbol in self.symbols:
                    symbol = SymbolCache.GetSymbol(symbol)
                    if symbol in self.kelly_size:
                        kelly_size = self.kelly_size[symbol]
                        if math.isnan(kelly_size):
                            kelly_size = 0
                            self.algo.Log("NaN Kelly Size for " + str(symbol.Value))
                    else:
                        kelly_size = 0
                        if symbol.Value not in self.algo.exclusions:
                            self.algo.Log("No Kelly Size for " + str(symbol.Value))
                    if kelly_size >= min_kelly:
                        X_symbol = X.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]
                        if self.models and data.ContainsKey(symbol) and data[symbol]:
                            if symbol in self.models:
                                model = self.models[symbol]
                                result = model.Predict(X_symbol)
                                metapred = 1
                                metapred_prob = 1
                                size = self.kelly_size[symbol] * self.algo.kelly_multiplier
                                if self.meta_labelling:
                                    if symbol in self.primary_history.keys() and symbol in self.metamodel:
                                        self.UpdateRollingHistoryWindow(symbol, data[symbol].EndTime, data[symbol].Close, result)
                                        X_meta, y_meta, li, tbi = self.GetXyforMetaModel(self.primary_history[symbol], symbol, 0, False)
                                        metapred = self.metamodel[symbol].predict(X_meta.iloc[-1:])[0]
                                        prob = self.metamodel[symbol].predict_proba(X_meta.iloc[-1:])
                                        metapred_prob = prob[-1][1]
                                        if self.avg_win[symbol] == 0 or self.avg_loss[symbol] == 0:
                                            size = 0
                                        else:
                                            size = metapred_prob - ((1 - metapred_prob) / (self.avg_win[symbol] / self.avg_loss[symbol]))
                                        # Alternatively: use MLFinLab's bet_size_probability instead of Kelly to determine size:
                                        # prob = self.metamodel[symbol].predict_proba(X_meta.fillna(0))
                                        # zlst = list(zip(*prob))
                                        # size = self.algo.bet_sizing.bet_size_probability(self.triple_barrier_events[symbol], pd.Series(zlst[1]), 2).iloc[-1]
                                        if math.isnan(size):
                                            size = 0
                                # Using result as the measure of confidence and (Kelly) size as weight.
                                # (For InsightWeightingPortfolioConstructionModel confidence has no effect;
                                # this is just in case it might be useful for any other PCM.)
                                if result > 0 and metapred and size > 0:
                                    insights.append(Insight.Price(symbol, timedelta(days=2), InsightDirection.Up, result, None, None, size))
                                elif result < 0:
                                    size = 0
                                    insights.append(Insight.Price(symbol, timedelta(days=1), InsightDirection.Flat, -result, None, None, size))
            return insights
        except Exception as e:
            self.algo.Log('Unexpected error during insights generation: ' + str(e))
            raise

    def OnSecuritiesChanged(self, algorithm: QCAlgorithm, changes: SecurityChanges) -> None:
        for security in changes.RemovedSecurities:
            if security.Symbol in self.symbols:
                self.symbols.remove(security.Symbol)
                self.metamodel.pop(security.Symbol, None)
                self.triple_barrier_events.pop(security.Symbol, None)
                if self.meta_labelling:
                    self.primary_history.pop(security.Symbol, None)
                if security.Invested:
                    self.algo.Liquidate(security.Symbol, "Removed from Universe")
        for security in changes.AddedSecurities:
            if security.Symbol.Value not in self.algo.exclusions:
                self.symbols.extend([security.Symbol])
        if self.algo.Time > self.rebalanceTime:
            # It's time to re-train all models (and not just the newly added ones)
            symbols = self.symbols
            self.rebalanceTime = self.algo.rebalancefunc(self.algo.Time)
            self.algo.Debug(str(self.algo.Time) + " Rebalancing. Training all " + str([symbol.Value for symbol in symbols]))
        else:
            symbols = [security.Symbol for security in changes.AddedSecurities]
            self.algo.Debug(str(self.algo.Time) + " Not yet time for Rebalancing. Training new symbols " + str([symbol.Value for symbol in symbols]))
        if symbols:
            for symbol in symbols:
                self.models[symbol] = self.TrainModel(symbol, self.years)
            # Initial history of newly added securities for meta labelling
            if self.meta_labelling:
                for symbol in symbols:
                    if symbol.Value not in self.algo.exclusions:
                        try:
                            end = self.algo.Time - timedelta(days=1)
                            start = end - timedelta(days=self.lookback * 2)
                            X = self.GetXforPrimaryModel([symbol], start, end)
                            historystart = self.algo.Time - timedelta(days=self.lookback)
                            nearest = X.reset_index().time.searchsorted(historystart)
                            historystart = X.index[nearest-1][0]
                            pred_hist = self.algo.History(symbol, historystart, end, self.algo.resolution).droplevel(0, axis=0)
                            pred_hist.index.names = ['date_time']
                            pred_hist = pred_hist[['close']]
                            pred_hist['result'] = 0.0
                            iter_date = self.algo.Time - timedelta(days=self.lookback)
                            self.primary_history[symbol] = pd.DataFrame(columns=['time', 'close', 'result']).set_index('time')
                            while iter_date <= end:
                                nearest = X.reset_index().time.searchsorted(iter_date)
                                if 0 <= nearest < len(X.index):
                                    time = X.index[nearest][0]
                                    X_slice = X.loc[pd.IndexSlice[time, :], :]
                                    X_symbol = X_slice.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]
                                    model = self.models[symbol]
                                    result = model.Predict(X_symbol)
                                    pred_hist.loc[time, 'result'] = result
                                    self.UpdateRollingHistoryWindow(symbol, time, pred_hist.loc[time]['close'], result)
                                iter_date += timedelta(days=1)
                            self.metamodel[symbol], self.triple_barrier_events[symbol] = self.TrainMetaModel(pred_hist, symbol)
                            self.algo.Debug(str(self.algo.Time) + " Meta Model for symbol " + str(symbol.Value))
                        except:
                            self.algo.Error(str(self.algo.Time) + " No Meta Model for symbol " + str(symbol.Value))

    def GetHistoryAndFeatures(self, symbols, start, end):
        history = self.algo.History(symbols, start, end, self.algo.resolution)
        gb = history.groupby('symbol', group_keys=False)
        # Forward returns over 1 day / 1 week / 1 month, used as labels
        history['returns_1w'] = -gb.close.pct_change(-5)
        history['returns_1d'] = -gb.close.pct_change(-1)
        history['returns_1m'] = -gb.close.pct_change(-21)
        gb = history.groupby('symbol', group_keys=False)
        history = gb.apply(lambda x: add_features(x, self.features))
        history = history.reset_index().set_index('time').sort_index()
        history.index = history.index.normalize()
        history = history.set_index('symbol', append=True)
        return history

    def GetXforPrimaryModel(self, symbols, start, end):
        history = self.GetHistoryAndFeatures(symbols, start, end)
        # Drop our Y columns before training
        X = history.drop(columns=['returns_1d', 'returns_1w', 'returns_1m', 'close', 'high', 'low', 'open', 'volume'])
        return X.dropna()

    def GetXyforMetaModel(self, df, symbol, shift, getlabels=True):
        labels = None
        y = None
        labels_index = None
        triple_barrier_events = None

        df['side'] = np.nan
        long_signals = df['result'] >= 0
        short_signals = df['result'] < 0
        df.loc[long_signals, 'side'] = 1
        df.loc[short_signals, 'side'] = -1
        # Remove look-ahead bias by lagging the signal
        df['side'] = df['side'].shift(shift)
        raw_data = df.copy()

        # We only need to generate labels when we train. We use the same method to get our X
        # to predict the primary model's performance, so we can pass getlabels=False to save some time.
        if getlabels:
            # Drop the NaN values from our data set
            df.dropna(axis=0, how='any', inplace=True)
            # Compute daily volatility
            try:
                if len(df['close']) < self.metamodel_rolling_window:
                    metamodel_rolling_window = len(df['close'])
                else:
                    metamodel_rolling_window = self.metamodel_rolling_window
                daily_vol = self.algo.ml.util.get_daily_vol(close=df['close'], lookback=metamodel_rolling_window)
            except:
                self.algo.Error(str(self.algo.Time) + " daily_vol ERROR")
            # Apply symmetric CUSUM filter and get timestamps for events
            # Note: only the CUSUM filter needs a point estimate for volatility
            cusum_events = self.algo.ml.filters.cusum_filter(df['close'], threshold=daily_vol.mean() * 0.5)
            # Compute vertical barrier
            vertical_barriers = self.algo.ml.labeling.add_vertical_barrier(t_events=cusum_events, close=df['close'], num_days=1)
            # pt_sl = [1, 2]
            pt_sl = [0, 0]
            # min_ret = 0.005
            min_ret = 0.0
            try:
                triple_barrier_events = self.algo.ml.labeling.get_events(close=df['close'],
                                                                         t_events=cusum_events,
                                                                         pt_sl=pt_sl,
                                                                         target=daily_vol,
                                                                         min_ret=min_ret,
                                                                         num_threads=3,
                                                                         vertical_barrier_times=vertical_barriers,
                                                                         # vertical_barrier_times=False,
                                                                         side_prediction=df['side'])
            except:
                self.algo.Error(str(self.algo.Time) + " events ERROR")
            labels = self.algo.ml.labeling.get_bins(triple_barrier_events, df['close'])

        # Log returns
        raw_data['log_ret'] = np.log(raw_data['close']).diff()
        # Momentum
        raw_data['mom1'] = raw_data['close'].pct_change(periods=1)
        raw_data['mom2'] = raw_data['close'].pct_change(periods=2)
        raw_data['mom3'] = raw_data['close'].pct_change(periods=3)
        raw_data['mom4'] = raw_data['close'].pct_change(periods=4)
        raw_data['mom5'] = raw_data['close'].pct_change(periods=5)
        # Volatility
        raw_data['volatility_50'] = raw_data['log_ret'].rolling(window=50, min_periods=50, center=False).std()
        raw_data['volatility_31'] = raw_data['log_ret'].rolling(window=31, min_periods=31, center=False).std()
        raw_data['volatility_15'] = raw_data['log_ret'].rolling(window=15, min_periods=15, center=False).std()
        # Serial correlation (takes about 4 minutes)
        window_autocorr = 50
        raw_data['autocorr_1'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=1), raw=False)
        raw_data['autocorr_2'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=2), raw=False)
        raw_data['autocorr_3'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=3), raw=False)
        raw_data['autocorr_4'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=4), raw=False)
        raw_data['autocorr_5'] = raw_data['log_ret'].rolling(window=window_autocorr, min_periods=window_autocorr, center=False).apply(lambda x: x.autocorr(lag=5), raw=False)
        # Get the various lagged log returns
        raw_data['log_t1'] = raw_data['log_ret'].shift(1)
        raw_data['log_t2'] = raw_data['log_ret'].shift(2)
        raw_data['log_t3'] = raw_data['log_ret'].shift(3)
        raw_data['log_t4'] = raw_data['log_ret'].shift(4)
        raw_data['log_t5'] = raw_data['log_ret'].shift(5)
        # Recompute sides
        raw_data['side'] = np.nan
        long_signals = raw_data['result'] >= 0
        short_signals = raw_data['result'] < 0
        raw_data.loc[long_signals, 'side'] = 1
        raw_data.loc[short_signals, 'side'] = -1
        # Remove look-ahead bias
        raw_data = raw_data.shift(shift)

        # Get features at event dates
        X = raw_data
        # Drop unwanted columns
        X.drop(['close'], axis=1, inplace=True)
        if labels is not None:
            y = labels['bin']
            labels_index = labels.index
        return X, y, labels_index, triple_barrier_events

    def UpdateRollingHistoryWindow(self, symbol, time, close, result):
        new_row = pd.DataFrame([{'close': close, 'result': result}], index=[time])
        self.primary_history[symbol] = pd.concat([self.primary_history[symbol], new_row], ignore_index=False).drop_duplicates(keep='last')
        self.primary_history[symbol] = self.primary_history[symbol].iloc[-(self.metamodel_rolling_window + 2):]

    def TrainModel(self, symbol, years):
        model = None
        try:
            # Load history and features
            total_days = 365 * years
            start = self.algo.Time - timedelta(days=total_days)
            end = self.algo.Time
            history = self.GetHistoryAndFeatures(symbol, start, end)
            # Set our y to predict whether the symbol will be positive in 1 week
            history = history.dropna()
            y = history['returns_1w'] > 0
            returns = history['returns_1w']
            # Drop y and price columns
            X = history.drop(columns=['returns_1d', 'returns_1w', 'returns_1m', 'close', 'high', 'low', 'open', 'volume'])
            # Build model for the symbol
            if symbol.Value not in self.algo.exclusions:
                symbol = SymbolCache.GetSymbol(symbol)
                X_symbol = X.loc[pd.IndexSlice[:, [symbol.ID.ToString()]], :]
                y_symbol = y.loc[X_symbol.index]
                # We also pass a history of returns to the model so it can
                # calculate Kelly sizes upon training
                returns_symbol = returns.loc[X_symbol.index]
                model = EnsembleModel(self.algo)
                model.Train(X_symbol, y_symbol, returns_symbol)
                if model.models:
                    # Every time the model is trained we take the mean value over all num_iter
                    # models of this symbol. That's going to be the Kelly size we'll use as
                    # weight when creating insights.
                    cleanlist = [i.kelly_size for i in model.models if not math.isnan(i.kelly_size)]
                    self.kelly_size[symbol] = statistics.fmean(cleanlist) if cleanlist else 0
                    # We also grab the mean average win and average loss for the symbol,
                    # as we might want to use those for the MLFinLab bet sizing
                    cleanlist = [i.avg_win for i in model.models if not math.isnan(i.avg_win)]
                    self.avg_win[symbol] = statistics.fmean(cleanlist) if cleanlist else 0
                    cleanlist = [i.avg_loss for i in model.models if not math.isnan(i.avg_loss)]
                    self.avg_loss[symbol] = statistics.fmean(cleanlist) if cleanlist else 0
                else:
                    self.kelly_size[symbol] = 0
                    self.avg_win[symbol] = 0
                    self.avg_loss[symbol] = 0
                    self.algo.Error("no models available")
        except Exception as e:
            self.algo.Log('Unexpected error trying to build models: ' + str(e))
            raise
        return model

    def TrainMetaModel(self, df, symbol):
        X, y, labels_index, triple_barrier_events = self.GetXyforMetaModel(df, symbol, 1)
        X = X.loc[labels_index, :]
        X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.15, shuffle=False)
        train_df = pd.concat([y_train, X_train], axis=1, join='inner').dropna()
        # Upsample the training data to a 50/50 split
        # https://elitedatascience.com/imbalanced-classes
        if len(train_df[train_df['bin'] == 0]) > len(train_df[train_df['bin'] == 1]):
            majority = train_df[train_df['bin'] == 0]
            minority = train_df[train_df['bin'] == 1]
        else:
            majority = train_df[train_df['bin'] == 1]
            minority = train_df[train_df['bin'] == 0]
        if not minority.empty:
            new_minority = resample(minority,
                                    replace=True,                 # Sample with replacement
                                    n_samples=majority.shape[0],  # To match majority class
                                    random_state=42)
        else:
            self.algo.Debug(str(self.algo.Time) + " MINORITY EMPTY Majority: " + str(len(majority)))
            new_minority = minority
        train_df = pd.concat([majority, new_minority])
        train_df = shuffle(train_df, random_state=42)
        # Create training data
        y_train = train_df['bin']
        X_train = train_df.loc[:, train_df.columns != 'bin']
        n_estimator, depth = 256, 7
        c_random_state = 42
        rf = RandomForestClassifier(max_depth=depth, n_estimators=n_estimator,
                                    criterion='entropy', random_state=c_random_state)
        rf.fit(X_train, y_train.values.ravel())
        return rf, triple_barrier_events
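For reference, the meta-model bet size computed in Update is the Kelly fraction size = p − (1 − p)/(avg_win/avg_loss). A small numeric illustration with hypothetical values:

# Numeric illustration of the meta-model bet size (hypothetical inputs)
metapred_prob = 0.60                # meta-model probability the primary signal is correct
avg_win, avg_loss = 0.035, 0.021    # hypothetical per-trade averages
b = avg_win / avg_loss              # payoff ratio ~1.67
size = metapred_prob - (1 - metapred_prob) / b
print(round(size, 3))               # 0.6 - 0.4/1.667 = 0.36 of capital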
# model.py
#region imports
from AlgorithmImports import *
#endregion
from sklearn.ensemble import ExtraTreesClassifier
import numpy as np
import pandas as pd


class ClassifierModel():

    def __init__(self, algo):
        self.algo = algo
        self.model = None
        self.params = None
        self.test_score = None
        self.test_dist = None
        self.threshold = None

    # Train the model
    def Train(self, X, y, params):
        self.params = params
        self.params['class_weight'] = self.GetClassWeight(y)
        self.model = ExtraTreesClassifier(**self.params)
        self.model.fit(X, y)
        # Keep the in-sample probability distribution; Predict uses its spread
        self.test_dist = self.model.predict_proba(X)[:, 1]

    # Return the prediction as a +/- z-score of the prediction from the threshold
    def Predict(self, X):
        # X = np.nan_to_num(X, nan=0, posinf=0, neginf=0)
        probs = pd.Series(self.model.predict_proba(X)[:, 1])
        d = self.test_dist
        return (probs[0] - self.threshold) / d.std()

    def GetClassWeight(self, y_train):
        # Weight each class by the prevalence of the other class
        pos = y_train.sum() / len(y_train)
        neg = 1 - pos
        pw = neg / pos
        nw = pos / neg
        return {0: nw, 1: pw}
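A worked example of the GetClassWeight logic with hypothetical labels: each class is weighted by the prevalence of the other, so the rarer class counts more during fitting:

# Inline illustration of GetClassWeight (hypothetical labels)
import pandas as pd
y = pd.Series([1, 0, 0, 1, 0, 0, 1, 0, 0, 1])  # 40% positive
pos = y.sum() / len(y)                          # 0.4
neg = 1 - pos                                   # 0.6
print({0: pos / neg, 1: neg / pos})             # {0: ~0.667, 1: 1.5}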
# optimize.py
#region imports
from AlgorithmImports import *
#endregion
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats.mstats import gmean
import numpy as np
import pandas as pd
import math
from sklearn.metrics import auc, roc_curve, precision_recall_curve, roc_auc_score, accuracy_score
from model import ClassifierModel


class ModelOptimizer():

    def __init__(self, algo):
        self.algo = algo
        self.validation_score = 0
        self.test_score = 0
        self.test_dist = None
        self.threshold = None

    def BuildModels(self, X, y, returns):
        # Need to continue to play with min_samples_split, n_estimators, and maybe class weights
        params = {'random_state': 0,
                  'n_estimators': self.algo.n_estimators,
                  'min_samples_split': self.algo.min_samples_split,
                  'class_weight': {0: 1, 1: 1},
                  'max_features': 'auto',
                  'max_depth': 3,
                  'criterion': 'gini',
                  'bootstrap': True}

        # Train and validate using walk-forward validation
        num_iter = self.algo.num_iter
        num_test_records = self.algo.num_test_records
        num_records = len(X)
        num_train_records = num_records - num_test_records
        batch_size = 4
        models = []
        trendtrades = [True] * (num_test_records - 1)

        # Iterate over the dataset and create a number (num_iter) of models
        for i in range(0, num_iter):
            probs = []
            y_test_all = []
            returns_test_all = []
            end_idx = num_records
            # This is important: copy params so each model keeps a distinct copy
            # with its own random seed
            model_params = params.copy()
            model_params['random_state'] = i
            # Loop through and do walk-forward validation
            j = num_train_records
            while j < end_idx:
                # Add the next segment of data
                idx = j + batch_size
                if idx >= end_idx:
                    idx = end_idx - 1
                X_train, X_test = X[0:j], X[j:idx]
                y_train, y_test = y[0:j], y[j:idx]
                returns_train, returns_test = returns[0:j], returns[j:idx]
                j = j + batch_size
                # Set the class weight based on positives vs negatives
                model_params['class_weight'] = self.get_class_weights(y_train)
                # Train the model
                clf = ExtraTreesClassifier(**model_params)
                clf.fit(X_train, y_train)
                # Predict using the model
                p = pd.Series(clf.predict_proba(X_test)[:, 1])
                # Add the predictions
                probs.extend(p)
                y_test_all.extend(y_test)
                returns_test_all.extend(returns_test)

            # Build model on all the data
            model = ClassifierModel(self.algo)
            model.Train(X, y, model_params)
            # Since each model's predictive results tend to skew, we calculate a
            # threshold for what counts as a positive result
            model.threshold = self.get_threshold(pd.Series(y), model.test_dist)
            # Give the model a score for use in weighting models
            y_pred = np.asarray(probs) > model.threshold
            score = self.score_results(y_test_all, probs, model.threshold)
            model.test_score = score
            # Add win/loss stats as properties to the model, so they can be used
            # in the Alpha Model for bet sizing
            model.test_y = y_test_all
            model.test_returns = returns_test_all
            model.pred_y = y_pred
            model.kelly_size, model.avg_win, model.avg_loss = self.kelly_size(y_test_all, y_pred, returns_test_all, trendtrades)
            models.append(model)
        return models

    # Calculate the model threshold for predicting positives
    def get_threshold(self, y_true, probs):
        # Calculate the percent positive
        pos = y_true.sum() / len(y_true)
        # Use that percent as the threshold cutoff
        return sorted(probs, reverse=True)[int(round(len(probs) * pos))]

    def weighted_score(self, y_true, probs):
        # There are two ways we want to weight a prediction:
        # 1. More recent predictions are weighted more heavily
        # 2. Larger prediction values are weighted more heavily (positive or negative)
        y_test_all = np.asarray(y_true)
        probs_all = pd.Series(probs)
        splits = 6
        length = math.floor(len(probs_all) / splits)
        start = 0
        end = length - 1
        scores = []
        for i in range(0, splits):
            y_test = y_test_all[start:end]
            if y_test.sum() != 0:
                probs = probs_all[start:end]
                precision, recall, threshold = precision_recall_curve(y_test, probs)
                pauc = auc(recall, precision)
                if math.isnan(pauc):
                    pauc = 0
                scores.append(pauc * (1 + i / 10))
            end = end + length
            start = start + length
            if end >= len(probs_all):
                end = len(probs_all) - 1
        return gmean(scores)

    def kelly_size(self, y_true, y_pred, returns, trendtrades):
        # Calculate the symbol's Kelly size based on the predictions of the model
        df = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'returns': returns, 'trends': trendtrades})
        trades = df[(df['y_pred']) & (df['trends'])]
        wins = df[(df['y_true']) & (df['y_pred']) & (df['trends'])]
        losses = df[(df['y_true'] == 0) & (df['y_pred']) & (df['trends'])]
        win_rate = len(wins) / len(trades)
        loss_rate = 1 - win_rate
        avg_win = wins['returns'].mean()
        avg_loss = -losses['returns'].mean()
        # We tried two different ways to calculate the Kelly size.
        # (win_rate/avg_loss - loss_rate/avg_win) isn't the textbook Kelly formula,
        # but it performed better. Return the Kelly size, plus average win and loss
        # separately so they can be used for the MLFinLab bet sizing.
        return (win_rate / avg_loss - loss_rate / avg_win), avg_win, avg_loss
        # return win_rate - (loss_rate / (avg_win / avg_loss))

    def score_results(self, y_true, probs, threshold):
        y_pred = np.asarray(probs) > threshold
        return accuracy_score(y_true, y_pred)

    def get_class_weights(self, y):
        # Calculate class weights
        pos = y.sum() / len(y)
        neg = 1 - pos
        pw = neg / pos
        nw = pos / neg
        return {0: nw, 1: pw}
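A toy illustration of the get_threshold logic: with 30% positives in y_true, the cutoff is chosen so that roughly the top 30% of predicted probabilities are treated as positive:

# Inline illustration of get_threshold (hypothetical labels and probabilities)
import pandas as pd
y_true = pd.Series([1, 0, 0, 1, 0, 0, 0, 1, 0, 0])  # 30% positive
probs = [0.9, 0.2, 0.4, 0.8, 0.1, 0.3, 0.35, 0.7, 0.15, 0.25]
pos = y_true.sum() / len(y_true)                     # 0.3
threshold = sorted(probs, reverse=True)[int(round(len(probs) * pos))]
print(threshold)  # 0.4 -> only 0.9, 0.8, 0.7 (the top 30%) exceed it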
# risk.py
#region imports
from AlgorithmImports import *
#endregion
from QuantConnect.Algorithm.Framework.Risk import RiskManagementModel


class BracketRiskModel(RiskManagementModel):
    '''Creates a trailing stop loss for the maximumDrawdownPercent value and a profit taker
    for the maximumUnrealizedProfitPercent value'''

    def __init__(self, maximumDrawdownPercent=0.05, maximumUnrealizedProfitPercent=0.05):
        self.maximumDrawdownPercent = -abs(maximumDrawdownPercent)
        self.trailingHighs = dict()
        self.maximumUnrealizedProfitPercent = abs(maximumUnrealizedProfitPercent)
        self.liquidated = set()

    def ManageRisk(self, algorithm, targets):
        riskAdjustedTargets = list()
        for kvp in algorithm.Securities:
            symbol = kvp.Key
            security = kvp.Value

            # Remove if not invested
            if not security.Invested:
                self.trailingHighs.pop(symbol, None)
                continue

            pnl = security.Holdings.UnrealizedProfitPercent
            if pnl > self.maximumUnrealizedProfitPercent or security.Symbol in self.liquidated:
                # Liquidate
                algorithm.Debug(f"Profit Taken: {security.Symbol}")
                algorithm.Log(f"Profit Taken: {security.Symbol}")
                riskAdjustedTargets.append(PortfolioTarget(security.Symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")
                return riskAdjustedTargets

            # Add newly invested securities
            if symbol not in self.trailingHighs:
                self.trailingHighs[symbol] = security.Holdings.AveragePrice  # Set to average holding cost
                continue

            # Check for new highs and update - set to trade bar high
            if self.trailingHighs[symbol] < security.High:
                self.trailingHighs[symbol] = security.High
                continue

            # Check for securities past the drawdown limit
            securityHigh = self.trailingHighs[symbol]
            drawdown = (security.Low / securityHigh) - 1
            if drawdown < self.maximumDrawdownPercent or security.Symbol in self.liquidated:
                # Liquidate
                algorithm.Debug(f"Losses Taken: {security.Symbol}")
                algorithm.Log(f"Losses Taken: {security.Symbol}")
                riskAdjustedTargets.append(PortfolioTarget(symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")
        return riskAdjustedTargets


class TrailingStopRiskManagementModelLiquidation(RiskManagementModel):
    '''Provides an implementation of IRiskManagementModel that limits the maximum possible loss
    measured from the highest unrealized profit'''

    def __init__(self, maximumDrawdownPercent=0.05):
        '''Initializes a new instance of the TrailingStopRiskManagementModelLiquidation class
        Args:
            maximumDrawdownPercent: The maximum percentage drawdown allowed for the algorithm
                portfolio compared with the highest unrealized profit, defaults to 5% drawdown'''
        self.maximumDrawdownPercent = abs(maximumDrawdownPercent)
        self.trailingAbsoluteHoldingsState = dict()
        self.liquidated = set()

    def ManageRisk(self, algorithm, targets):
        '''Manages the algorithm's risk at each time step
        Args:
            algorithm: The algorithm instance
            targets: The current portfolio targets to be assessed for risk'''
        riskAdjustedTargets = list()
        for kvp in algorithm.Securities:
            symbol = kvp.Key
            security = kvp.Value

            # Remove if not invested
            if not security.Invested:
                self.trailingAbsoluteHoldingsState.pop(symbol, None)
                continue

            position = PositionSide.Long if security.Holdings.IsLong else PositionSide.Short
            absoluteHoldingsValue = security.Holdings.AbsoluteHoldingsValue
            trailingAbsoluteHoldingsState = self.trailingAbsoluteHoldingsState.get(symbol)

            # Add newly invested security (if it doesn't exist) or reset holdings state (if position changed)
            if trailingAbsoluteHoldingsState is None or position != trailingAbsoluteHoldingsState.position:
                self.trailingAbsoluteHoldingsState[symbol] = trailingAbsoluteHoldingsState = self.HoldingsState(position, security.Holdings.AbsoluteHoldingsCost)

            trailingAbsoluteHoldingsValue = trailingAbsoluteHoldingsState.absoluteHoldingsValue

            # Check for new max (for long position) or min (for short position) absolute holdings value
            if ((position == PositionSide.Long and trailingAbsoluteHoldingsValue < absoluteHoldingsValue) or
                    (position == PositionSide.Short and trailingAbsoluteHoldingsValue > absoluteHoldingsValue)):
                self.trailingAbsoluteHoldingsState[symbol].absoluteHoldingsValue = absoluteHoldingsValue
                continue

            drawdown = abs((trailingAbsoluteHoldingsValue - absoluteHoldingsValue) / trailingAbsoluteHoldingsValue)
            if self.maximumDrawdownPercent < drawdown or security.Symbol in self.liquidated:
                self.trailingAbsoluteHoldingsState.pop(symbol, None)
                # Liquidate
                riskAdjustedTargets.append(PortfolioTarget(symbol, 0))
                if algorithm.Securities[security.Symbol].Invested:
                    self.liquidated.add(security.Symbol)
                    algorithm.Log(f"Liquidating {security.Symbol}")
        return riskAdjustedTargets

    class HoldingsState:
        def __init__(self, position, absoluteHoldingsValue):
            self.position = position
            self.absoluteHoldingsValue = absoluteHoldingsValue
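A numeric illustration of the trailing-stop trigger with hypothetical values: a long position whose absolute holdings value peaked at $11,000 and now sits at $10,400 breaches a 5% limit:

# Hypothetical values illustrating the drawdown check above
maximumDrawdownPercent = 0.05
trailing, current = 11_000.0, 10_400.0
drawdown = abs((trailing - current) / trailing)   # ~0.0545
print(drawdown > maximumDrawdownPercent)          # True -> emit PortfolioTarget(symbol, 0)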