Overall Statistics |
Total Trades 343 Average Win 0.30% Average Loss -0.17% Compounding Annual Return 9.961% Drawdown 5.100% Expectancy 0.363 Net Profit 10.843% Sharpe Ratio 1.04 Probabilistic Sharpe Ratio 50.278% Loss Rate 52% Win Rate 48% Profit-Loss Ratio 1.82 Alpha -0 Beta 0.288 Annual Standard Deviation 0.068 Annual Variance 0.005 Information Ratio -1.827 Tracking Error 0.096 Treynor Ratio 0.244 Total Fees $343.46 Estimated Strategy Capacity $1100000000.00 Lowest Capacity Asset SPY R735QTJ8XC9X |
from sklearn.model_selection import train_test_split from optimize import * import numpy as np import pandas as pd class EnsembleModel(): def __init__(self, algo): self.algo = algo self.models = None self.optimizer = None # Build a set of models def Train(self, X, y): self.optimizer = ModelOptimizer(self.algo) self.models = self.optimizer.BuildModels(X, y) # Return prediction as a +/- Z-Score of the prediction from the the threshold def Predict(self, X): total_weight = sum(model.test_score for model in self.models) z = 0 for model in self.models: result = model.Predict(X) # Weight the result based on the models test score z = z + result * (model.test_score/total_weight) return z
import talib from talib import func import numpy as np import pandas as pd # # This helper class allows us to easily add features to our dataset # add_features(df, ['EMA_7']) # # Other examples # EMA_7_28 gives you the ratio of these two EMA # EMA_7_28_diff gives you the difference of the current ratio and the last ratio for those two EMA # def ichimoku_a(high, low, n1 = 9, n2 = 26, n3 = 52): conv = 0.5 * (high.rolling(n1, min_periods=0).max() + low.rolling(n1, min_periods=0).min()) base = 0.5 * (high.rolling(n2, min_periods=0).max() + low.rolling(n2, min_periods=0).min()) spana = 0.5 * (conv + base) return pd.Series(spana) def ichimoku_b(high, low, n1 = 9, n2 = 26, n3 = 52): spanb = 0.5 * (high.rolling(n3, min_periods=0).max() + low.rolling(n3, min_periods=0).min()) return pd.Series(spanb) def obv_rolling_percent(close, volume, n, fillna=False): df = pd.DataFrame([close, volume]).transpose() df['OBV'] = 0 c1 = close < close.shift(1) c2 = close > close.shift(1) if c1.any(): df.loc[c1, 'OBV'] = - volume if c2.any(): df.loc[c2, 'OBV'] = volume obv = df['OBV'].rolling(n).sum() / df[volume.name].rolling(n).sum() if fillna: obv = obv.replace([np.inf, -np.inf], np.nan).fillna(0) return pd.Series(obv, name='obv') def kst(close, r1 = 3, r2 = 7, r3 = 14, r4 = 28, n1 = 10, n2 = 3, n3 = 7, n4 = 14): rocma1 = ((close - close.shift(r1)) / close.shift(r1)).rolling(n1, min_periods=0).mean() rocma2 = ((close - close.shift(r2)) / close.shift(r2)).rolling(n2, min_periods=0).mean() rocma3 = ((close - close.shift(r3)) / close.shift(r3)).rolling(n3, min_periods=0).mean() rocma4 = ((close - close.shift(r4)) / close.shift(r4)).rolling(n4, min_periods=0).mean() return 100 * (rocma1 + 2 * rocma2 + 3 * rocma3 + 4 * rocma4) def kst_sig(close, r1 = 3, r2 = 7, r3 = 14, r4 = 28, n1 = 10, n2 = 3, n3 = 7, n4 = 14, nsig = 6): kst_val = kst(clse, r1, r2, r3, r4, n1, n2, n3, n4) kstsig = kst_val.rolling(nsig, minperiods=0).mean() return kstsig def check_feature_cache(cache, name, f, args): if not name in cache: cache[name] = f(*args) return cache[name] def add_features(df, features, close="close", high="high", low="low", volume="volume"): fmap = {'MFI':'money_flow_index','ICHA':'ichimoku_a','ICHB':'ichimoku_b','BOLLH':'bollinger_hband', \ 'BOLLL':'bollinger_lband','KCC':'keltner_channel_central','NVI':'negative_volume_index', \ 'OBVP' : 'obv_rolling_percent', 'KST' : 'kst', 'KST_SIG' : 'kst_sig'} cache = {} for feature in features: # parse string. Style is func_period1_period2:COL=ColumnName1 (column name optional) col = None col_idx = feature.find(':') if col_idx > -1: col = feature[col_idx+1:] feature = feature[0:col_idx] p = feature.split('_') if not col is None: feature = feature + "_" + col fname = p[0].upper() # If DM, DI, or HT function will have underscore in name, need to add suffix back and shift params back if len(p) > 1 and (p[1] in ['DM','DI'] or p[0] in ['HT']): fname += '_' + p[1] for i in range(2,len(p)): p[i-1] = p[i] del p[-1] p1 = p[1].upper() if len(p) > 1 else None p2 = p[2].upper() if len(p) > 2 else None p3 = p[3].upper() if len(p) > 3 else None if fname in fmap: if fmap[fname].find('.') > -1: s = fmap[fname].split('.') cls = s[0] f = getattr(globals()[cls], s[1]) else: f = globals()[fmap[fname].lower()] elif fname in talib.__TA_FUNCTION_NAMES__: f = getattr(func, fname) elif fname.lower() in globals(): f = globals()[fname.lower()] else: raise Exception(f'Could not find function. fname: {fname} feature: {feature}') if fname.endswith('MA') or fname == 'T3': args = [df[close], int(p1)] if p2 is None: df[feature] = (check_feature_cache(cache, feature, f, args )) elif p2 == 'DIFF': ma1 = check_feature_cache(cache, fname + '_' + p1, f, args) df[feature] = ma1.pct_change() p2 = None # So it doesn't get diffed at end of method else: ma1 = check_feature_cache(cache, fname + '_' + p1, f, args) args = [df[close], int(p2)] ma2 = check_feature_cache(cache, fname + '_' + p2, f, args) df[feature] = (ma1 - ma2) / ma1 df[feature].replace([np.inf, -np.inf], 0, inplace=True) elif fname in ['CMO','TRIX','ROCP','ROCR100','ROC','BOLLH','BOLLL', 'RSI','MOM']: df[feature] = f(df[close], int(p1)) elif fname in ['MINUS_DM', 'PLUS_DM']: df[feature] = f(df[high], df[low], int(p1)) elif fname in ['ADX','ATR','KCC','MINUS_DI','PLUS_DI','WILLR']: try: if df[close].isnull().all(): df[feature] = np.nan elif p2 is None or p2.upper() == 'DIFF': df[feature] = f(df[high], df[low], df[close], int(p1)) # Change all to percent, except WILLR if not fname in ['WILLR','ADX']: df[feature] = df[feature]/df[close] else: #params = {'close' : df[close], 'high' : df[high],'low' : df[low], 'timeperiod' : int(p1)} args = [df[high], df[low], df[close], int(p1)] f1 = check_feature_cache(cache, fname + '_' + p1, f, args) args = [df[high], df[low], df[close], int(p2)] f2 = check_feature_cache(cache, fname + '_' + p2, f, args) df[feature] = (f1-f2)/f1 df[feature].replace([np.inf, -np.inf], 0, inplace=True) except Exception as ex: if str(ex) == 'inputs are all NaN': df[feature] = np.nan else: raise elif fname in ['MFI']: df[feature] = f(df[high], df[low], df[close], df[volume], int(p1)) elif fname in ['MACD', 'PPO', 'TSI']: df[feature] = f(df[close], int(p1), int(p2)) elif fname in ['ICHA', 'ICHB', 'AO']: # Normalize df[feature] = f(df[high], df[low], int(p1), int(p2)) # if fname.startswith('ICH'): # df[feature] = df[feature]/ df[close] elif fname in ['NVI']: df[feature] = f(df[close], df[volume]) elif fname in ['SAR', 'SAREXT']: # Normalize df[feature] = f(df[high], df[low]) #/ df[close] elif fname == 'KST': df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15) elif fname == 'KST_SIG': df[feature] = f(df[close], r1=10, r2=15, r3=20, r4=30, n1=10, n2=10, n3=10, n4=15, nsig=9) elif fname == 'AROON': df[feature] = f(df[high], df[low], int(p1))[int(p2)-1] elif fname == 'ULTOSC': df[feature] = f(df[high], df[low], df[close]) elif fname in ['OBVP']: df[feature] = f(df[close],df[volume],int(p1)) elif fname in ['OBTP']: df[feature] = f(df[close],df["Trades"],int(p1)) elif fname in ['HT_DCPERIOD','HT_DCPHASE','HT_PHASOR','HT_SINE', 'HT_TRENDMODE','STOCHRSI' ]: df[feature] = f(df[close]) else: raise Exception('Feature not found: ' + feature + ' ' + fname ) # Difference those that are 1 period diffs if p3 == 'DIFF' or p2 == 'DIFF': df[feature] = df[feature].diff() return df def get_exhaustive_ma_features_list(): features = [] periods = [7, 14, 28, 42, 70] mas = ["SMA", "DEMA", "KAMA", "EMA", "TEMA", "T3", "TRIMA", "WMA"] for p in periods: for ma in mas: features.append(ma + "_" + str(p)) for p1 in periods: for p2 in periods: if p2 <= p1: continue for ma in mas: ma1 = ma + str(p1) ma2 = ma + str(p2) features.append(ma + "_" + str(p1) + "_" + str(p2)) features.append(features[-1] + "_diff") return features def get_exhaustive_features_list(diffs=True): features = get_exhaustive_ma_features_list() periods = [3,7,14,20,30] band_periods = [7,14,21,28] hi_lo_periods = [[4,9],[7,14],[10,21],[12,26]] pfs = ['CMO','MINUS_DI','MINUS_DM','PLUS_DI','PLUS_DM','ROCP','ATR','ADX', 'TRIX','WILLR'] # Left out: MOM, VIN, VIP, ROC,ROCR,OBVM,EOM,DPI,RSI,STOCH,STOCH_SIG,FI for p in periods: features.append('AROON_' + str(p) + '_1') features.append('AROON_' + str(p) + '_2') for pf in pfs: if pf == 'ADX' and p == 3: continue features.append(pf + '_' + str(p)) if diffs: features.append(features[-1] + "_diff") if diffs: for p1 in periods: for p2 in periods: if p2 <= p1: continue for pf in pfs: pf1 = pf + str(p1) pf2 = pf + str(p2) features.append(pf + "_" + str(p1) + "_" + str(p2)) features.append(features[-1] + "_diff") hlfs = ['ICHA','ICHB'] # Exclude PPO, same as our MA setups # hi lo Indicators for hl in hi_lo_periods: for hlf in hlfs: features.append(hlf + '_' + str(hl[0]) + '_' + str(hl[1])) # other indicators otfs = ['SAR','SAREXT','HT_DCPERIOD','ULTOSC','OBVP_10','OBVP_20','OBVP_40','OBVP_80'] for otf in otfs: features.append(otf) return features
from AlgorithmImports import * from feature import add_features from ensemble import EnsembleModel from datetime import timedelta import pandas as pd class TransdimensionalDynamicThrustAssembly(QCAlgorithm): def Initialize(self): self.SetStartDate(2020, 11, 1) # Set Start Date self.SetEndDate(2021, 12, 1) # Set Start Date self.SetCash(100000) # Set Strategy Cash self.SetBrokerageModel(BrokerageName.InteractiveBrokersBrokerage) self.UniverseSettings.Resolution = Resolution.Minute self.SetWarmup(timedelta(days=15)) self.SetBenchmark('SPY') # Manual selection of ETFs # self.symbols = ["SPY","DIA","IJR","MDY","IWM","QQQ","IYE","EEM","IYW","EFA","SLV","IEF","IYM","IYF","IYH","IYR","IYC","IBB","FEZ","USO","TLT"] self.symbols = ["SPY", "QQQ"] for symbol in self.symbols: self.AddEquity(symbol, Resolution.Minute) # Manually curated universe self.SetUniverseSelection(ManualUniverseSelectionModel()) # Select the demonstration alpha model alpha = WeightedClassifierAlpha(self, self.symbols) self.SetAlpha(alpha) # Equally weigh securities in portfolio, based on insights self.SetPortfolioConstruction(InsightWeightingPortfolioConstructionModel()) # Set Immediate Execution Model self.SetExecution(ImmediateExecutionModel()) # Set Null Risk Management Model self.SetRiskManagement(NullRiskManagementModel()) class WeightedClassifierAlpha(AlphaModel): def __init__(self, algo, symbols): self.algo = algo self.symbols = symbols self.models = None self.month_count = 0 # Train models monthly algo.Schedule.On(algo.DateRules.MonthStart(), algo.TimeRules.At(1, 0), self.Train) # Rebalance weekly algo.Schedule.On(algo.DateRules.WeekStart(), algo.TimeRules.At(2, 0), self.Rebalance) self.days_count = 5 def Update(self, algo, data): return [] # Predict values and create insights def Rebalance(self): insights = [] if self.algo.IsWarmingUp: return insights if self.models is None: self.BuildModels() try: # Load History and Features end = self.algo.Time + timedelta(days=1) start = end - timedelta(days=450) history = self.GetHistoryAndFeatures(start, end) # Drop our Y columns before training idx = pd.IndexSlice X = history.drop(columns=['returns_1d', 'returns_1w', 'close','high','low','open','volume']) columns = X.columns[X.isna().any()].tolist() X = X.dropna() # Get last row of data time = X.index[-1][0] X = X.loc[idx[time, :], :] # Predict each symbols direction insights = [] for symbol in self.symbols: symbol = SymbolCache.GetSymbol(symbol) idx = pd.IndexSlice X_symbol = X.loc[idx[:, [symbol.ID.ToString()]], :] model = self.models[symbol] result = model.Predict(X_symbol) band = 0 direction = InsightDirection.Flat if result >= 0: insights.append( Insight.Price(symbol, timedelta(days=4), InsightDirection.Up, 1, None, None, result)) elif result < 0: # Notice the '-' before result. Weight must be positive for this to work correctly insights.append( Insight.Price(symbol, timedelta(days=4), InsightDirection.Down, 1, None, None, -result)) self.algo.EmitInsights(insights) except Exception as e: self.Log('Unexpected error:' + str(e)) raise def Train(self): # To speed up backtest we just trained every 3 months self.month_count += 1 if self.month_count == 3: self.month_count = 0 self.BuildModels() # self.BuildModels() def BuildModels(self): try: self.models = {} # Load History and Features total_days = 365 * 4 start = self.algo.Time - timedelta(days=total_days) end = self.algo.Time # + timedelta(days=1) history = self.GetHistoryAndFeatures(start, end) # Set our Y to predict if symbols will be positive in 1 week history = history.dropna() y = history['returns_1w'] > 0 # Drop Y and close X = history.drop(columns=['returns_1d', 'returns_1w', 'close','high','low','open','volume']) # Loop through symbols and build models for each for symbol in self.symbols: symbol = SymbolCache.GetSymbol(symbol) idx = pd.IndexSlice X_symbol = X.loc[idx[:, [symbol.ID.ToString()]], :] y_symbol = y.loc[X_symbol.index] model = EnsembleModel(self.algo) model.Train(X_symbol, y_symbol) self.models[symbol] = model except Exception as e: self.algo.Debug('Unexpected error:' + str(e)) raise def GetHistoryAndFeatures(self, start, end): history = self.algo.History(self.symbols, start, end, Resolution.Daily) gb = history.groupby('symbol', group_keys=False) history['returns_1w'] = -gb.close.pct_change(-5) history['returns_1d'] = -gb.close.pct_change(-1) # Commented out a variety of different Y variables I tested # rgb = history.returns_1d.groupby('symbol', group_keys=False) # history['return_std'] = rgb.transform(lambda x: x.rolling(window=14).std()) # history['return_std'] = rgb.transform(lambda x: x.rolling(window=200).std()) # history['return_mean'] = rgb.transform(lambda x: x.rolling(window=200).mean()) # history['return_last'] = gb.close.pct_change() # history['return_zscore'] = history.return_last/history.return_std; # history['zscore'] = (history.returns_1m-history.return_mean)/history.return_std # history['return_gap'] = gb.apply(lambda x: (x.shift(-1).open-x.close)/x.close) features = ['KAMA_50_200', 'KAMA_28_100', 'KAMA_7_25', 'KAMA_14_50', 'KAMA_50_200_diff', 'KAMA_14_50_diff', 'KAMA_28_100_diff', 'KAMA_7_25_diff'] # Commented out a larger feature set. Left as an example of what can be used. # features = ['ATR_7', 'ATR_21', 'ATR_3_7', 'ATR_3_7_diff', 'HT_DCPERIOD', 'HT_DCPHASE','ADX_7','ADX_14','ADX_20','ADX_30','ADX_50','ADX_60', # 'ADX_7_diff','ADX_14_diff','ADX_20_diff','ADX_30_diff','ADX_50_diff','ADX_60_diff','KAMA_50','KAMA_200', # 'KAMA_50_200','EMA_50_200','KAMA_28_100','KAMA_7_25','KAMA_14_50','EMA_14_50', # 'KAMA_50_200_diff','EMA_50_200_diff','KAMA_14_50_diff','EMA_14_50_diff','KAMA_28_100_diff','KAMA_7_25_diff', # 'EMA_7_28','EMA_7_28_diff', ] gb = history.groupby('symbol', group_keys=False) history = gb.apply(lambda x: add_features(x, features)) # Commented out some advanced features applying HT_DCPERIOD to other features # gb = history.groupby('symbol', group_keys=False) # history['HT_DCPERIOD_KAMA_50_200'] = gb.apply(lambda x: add_features(x, ['HT_DCPERIOD'], close='KAMA_50_200'))['HT_DCPERIOD'] # history['HT_DCPERIOD_KAMA_14_50'] = gb.apply(lambda x: add_features(x, ['HT_DCPERIOD'], close='KAMA_14_50'))['HT_DCPERIOD'] # Convert datetime to date #history['datetime'] = history.index.get_level_values('time') history = history.reset_index().set_index('time').sort_index() history.index = history.index.normalize() history = history.set_index('symbol', append=True) return history
from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import train_test_split import numpy as np import pandas as pd class ClassifierModel(): def __init__(self, algo): self.algo = algo self.model = None self.params = None self.test_score = None self.test_dist = None self.threshold = None # Train the model def Train(self, X, y, params): self.params = params self.params['class_weight'] = self.GetClassWeight(y) self.model = ExtraTreesClassifier(**self.params) self.model.fit(X, y) self.test_dist = self.model.predict_proba(X)[:, 1] # Return prediction as a +/- Z-Score of the prediction from the the threshold def Predict(self, X): #self.algo.Debug(f'Number of columns before pca: {X.shape[1]}') idx = X.index #X = np.nan_to_num(X, nan=0, posinf=0, neginf=0) probs = pd.Series(self.model.predict_proba(X)[:, 1]) d = self.test_dist return (probs[0] - self.threshold) / d.std() def GetClassWeight(self, y_train): pos = y_train.sum()/(len(y_train)) neg = 1 - pos pw = neg/pos nw = pos/neg return {0:nw,1:pw}
from sklearn.ensemble import ExtraTreesClassifier from scipy.stats.mstats import gmean import numpy as np import pandas as pd import math from sklearn.metrics import auc,roc_curve,precision_recall_curve, roc_auc_score, accuracy_score from model import ClassifierModel class ModelOptimizer(): def __init__(self, algo): self.algo = algo self.validation_score = 0 self.test_score = 0 self.test_dist = None self.threshold = None def BuildModels(self, X, y): # Need to continue to play with min samples split and estimators and maybe class weights params = {'random_state':0,'n_estimators': 10, 'min_samples_split': 200, 'class_weight' : {0:1,1:1}, 'max_features': 'auto', 'max_depth': 3, 'criterion': 'gini', 'bootstrap': True} # Train and validate using walk forward validation max_score = -1 max_y = None max_probs = None num_iter = 10 num_test_records = 60 num_records = len(X) num_train_records = num_records - num_test_records batch_size = 4 models = [] results = [] # Iterate ove the dataset and create a number (num_iter) of models. for i in range(0, num_iter): probs = [] y_test_all = [] end_idx = num_records # This is important. Copy params so each model keeps a distinct copy with it's own random seed model_params = params.copy() model_params['random_state']= i # Loop through and do walk forward validation j = num_train_records while j < end_idx: # Add the next segment of data idx = j + batch_size if idx >= end_idx: idx = end_idx-1 X_train, X_test = X[0:j], X[j:idx] y_train, y_test = y[0:j], y[j:idx] j = j + batch_size # Set the class weight based positives vs negatives model_params['class_weight'] = self.get_class_weights(y_train) # Train the model clf = ExtraTreesClassifier(**model_params) clf.fit(X_train, y_train) # Predict using the model p = pd.Series(clf.predict_proba(X_test)[:, 1]) # Add the predicions probs.extend(p) y_test_all.extend(y_test) #precisions, recalls, thresholds = precision_recall_curve(y_test_all, probs) #score = auc(recalls, precisions) # Build model on all the data model = ClassifierModel(self.algo) model.Train(X, y, model_params) # Since each models predictive results tend to skew, we calculate a threshold for what is a positive result model.threshold = self.get_threshold(pd.Series(y), model.test_dist) # Give the model a score for use in weighting models score = self.score_results(y_test_all, probs, model.threshold) model.test_score = score model.test_y = y_test_all models.append(model) #model.test_dist = pd.Series(probs) return models # Calculate the model threshold for predicting positives def get_threshold(self, y_true, probs): # calculate percent positive pos = y_true.sum()/(len(y_true)) # use that percent as threshold cutoff return sorted(probs, reverse=True)[int(round(len(probs)*pos))] def weighted_score(self, y_true, probs): # There are two ways we want to weight a prediction # 1. More recent predictions are weighted more heavily # 2. The larger the prediction value is weighted more heavily (positive or negative) # score = 0 # threshold = self.get_threshold(pd.Series(y_true), probs) # dist = pd.Series(probs) # # First entries are older, last entries newer # for i in range(0,len(y_true)): # pred = probs[i] # y = y_true[i] # zscore = (pred - threshold) / dist.std() # yp = zscore > 0 # if yp == y: # score = score + 1 + i/len(y_true) #* abs(zscore) # return score # precision, recall, threshold = precision_recall_curve(y_pred, probs) # s1 = auc(recall, precision) # s2 = roc_auc_score(y_pred, probs) # return gmean([s1,s2]) y_test_all = np.asarray(y_true) probs_all = pd.Series(probs) splits = 6 length = math.floor(len(probs_all)/splits) start = 0 end = length-1 scores = [] for i in range(0,splits): y_test = y_test_all[start:end] if y_test.sum() != 0: probs = probs_all[start:end] precision, recall, threshold = precision_recall_curve(y_test, probs) pauc = auc(recall, precision) if math.isnan(pauc): pauc = 0 scores.append(pauc*(1+i/10)) end = end + length start = start + length if (end >= len(probs_all)): end = len(probs_all)-1 return gmean(scores) def score_results(self, y_true, probs, threshold): y_pred = probs > threshold #return roc_auc_score(y_true, y_pred) return accuracy_score(y_true, y_pred) # y_test_all = np.asarray(y_true) # probs_all = pd.Series(probs) # splits = 6 # length = math.floor(len(probs_all)/splits) # start = 0 # end = length-1 # scores = [] # for i in range(0,splits): # y_test = y_test_all[start:end] # if y_test.sum() != 0: # probs = probs_all[start:end] # precision, recall, threshold = precision_recall_curve(y_test, probs) # pauc = auc(recall, precision) # if math.isnan(pauc): # pauc = 0 # scores.append(pauc) # end = end + length # start = start + length # if (end >= len(probs_all)): # end = len(probs_all)-1 # return gmean(scores) def get_class_weights(self, y): # Calculate class weights pos = y.sum()/(len(y)) neg = 1 - pos pw = neg/pos nw = pos/neg return {0:nw,1:pw}