Backtest

Overall Statistics
Total Trades 2839 Average Win 0.77% Average Loss -0.58% Compounding Annual Return 31.257% Drawdown 36.600% Expectancy 0.414 Net Profit 1421.211% Sharpe Ratio 1.216 Loss Rate 39% Win Rate 61% Profit-Loss Ratio 1.32 Alpha 0.118 Beta 1.208 Annual Standard Deviation 0.236 Annual Variance 0.056 Information Ratio 0.914 Tracking Error 0.161 Treynor Ratio 0.238 Total Fees $17166.55

import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import date, datetime, timedelta
from scipy import stats

### To do:
# 1. try smaller volume universe, it seems to have good performance
# 2. 

class ExpectedIdiosyncraticSkewness(QCAlgorithm):
    '''Step 1. Calculating Fama-French daily regression residuals
       Step 2. Using daily residuals to calculate historical monthly moments
       Step 3. Run regression of historical monthly moments to estimate regression coefficients
       Step 4. Using historical monthly moments and estimated coefficients to calculate expected skewness
       Step 5. Sorting symbols by skewness and long the ones with lowest skewness
       
       Note: Fama-French factors data are only available up to 06/28/2019.
       So, backtest is implemented up to end of June, 2019. And, live trading is not feasible for current version.
       
       Reference: 
       [1] https://academic.oup.com/rfs/article-abstract/23/1/169/1578688?redirectedFrom=PDF
       [2] https://dr.library.brocku.ca/bitstream/handle/10464/6426/Brock_Cao_Xu_2015.pdf
       [3] Fama-French official data: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
    '''
    
    def Initialize(self):
        
        self.SetStartDate(2009, 7, 1)                # Set Start Date
        self.SetEndDate(2019, 7, 1)                  # Set End Date
        self.SetCash(100000)                         # Set Strategy Cash
        self.AddEquity("SPY", Resolution.Daily)      # Used to check trading days
        
        self.__number_of_coarse_symbol = 200         # Set the number of coarse symbol to be further filtered by expected skewness
        
        self.predictor_list = []                     # List to save monthly predictors for each symbol
        self.initial_selection = True                # Control initial selection runs only once

        self._get_fama_french_factors()              # Download Fama French factors data as a dataframe
        
        self.UniverseSettings.Resolution = Resolution.Daily
        self.AddUniverse(self.CoarseSelectionAndSkewnessSorting)


    def CoarseSelectionAndSkewnessSorting(self, coarse):
        '''Coarse selection to get an initial fixed universe for the skewness sorting trade logic.
           Then, select the symbols to trade monthly based on skewness sorting.
        '''

        ### Run the coarse universe selection only once at the beginning of strategy
        if self.initial_selection:
            # Select symbols with fundamental data
            coarse_with_fundamental = [x for x in coarse if x.HasFundamentalData]

            # Sort descendingly by daily dollar volume
            sorted_by_volume = sorted(coarse_with_fundamental, key = lambda x: x.DollarVolume, reverse = True)
            
            self.fine = [ x.Symbol for x in sorted_by_volume[:self.__number_of_coarse_symbol] ]
            
            self.initial_selection = False
        
        ### Select symbols to trade based on expected skewness at each month end
        
        # if not last trading day at month end, return the unchanged universe
        self.month = (self.Time - timedelta(days = 1)).month
        next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        if self.month == next_trading_day.month: 
            return Universe.Unchanged
        
        self.Debug(f"Month end rebalance at: {self.Time}")
        
        # Estimate expected idiosyncratic skewness
        skewness = self.ExpectedSkewness()
        
        # Sort symbols by skewness
        skewness_sorted = skewness.sort_values(by = ['skew']).reset_index(drop = True)
        
        # Select the lowest quintile
        self.low_skew = skewness_sorted.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05), 'symbol']
        self.Debug(f"Selected symbols to trade >>\n {self.low_skew}\n")
        
        return [self.Symbol(x) for x in self.low_skew]


    def OnData(self, data):
        '''Rebalance at month end. Determine weights. Place orders.
        '''

        # if not last trading day at month end, return
        next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        self.month = (self.Time - timedelta(days = 1)).month
        if self.month == next_trading_day.month: return

        # Determine weights
        weights = self.PortfolioWeights()
        
        # Place orders
        self.Liquidate()
        for symbol in self.low_skew:
            weight_i = weights[symbol]
            self.SetHoldings(symbol, weight_i)

    
    def ExpectedSkewness(self):
        '''Calculate expected skewness using historical moments and estimated regression coefficients
        '''

        ### Get predictors
        self._get_predictors()
        
        ### Estimate coefficients by regressing current skewness on historical moments
        if len(self.predictor['time'].unique()) == 1: 
            coef = [0, 1, 0]
        else:
            this_month = self.predictor['time'].iloc[-1]
            last_month = self.predictor['time'].unique()[-2]
            Y = self.predictor[self.predictor['time'] == this_month]['skew'].values
            X = self.predictor[self.predictor['time'] == last_month][['skew', 'vol']].values
            X = sm.add_constant(X)
            results = sm.OLS(Y, X, missing = 'drop').fit()
            coef = results.params
        
        ### Calculate expected skewness
        this_month = self.predictor['time'].iloc[-1]
        data_t = self.predictor[self.predictor['time'] == this_month][['skew', 'vol']].values
        ones = np.ones([len(data_t), 1])
        data_t = np.append(ones, data_t, 1)
        exp_skew = np.inner(data_t, coef)
        skew_df = self.predictor[self.predictor['time'] == this_month][['symbol']].reset_index(drop = True)
        skew_df.loc[:,'skew'] = exp_skew
        
        return skew_df


    def PortfolioWeights(self):
        '''Construct equal-weighted portfolio'''

        weights = {}
        for symbol in self.low_skew:
            weights[symbol] = 1 / len(self.low_skew)

        return weights

        
    def _get_predictors(self):
        '''Run Fama-French time-series regression to get residuals.
           Then, use residuals to calculate historical moments.
        '''

        ### Get historical returns for current month
        end_day = self.Time
        start_day = start_day = (self.Time - timedelta(days = 1)).replace(day = 1)
        history = self.History(self.fine, start_day - timedelta(days = 1), end_day, Resolution.Daily)     # Get one more day for price data
        history = history.close.unstack(level = 0)
        daily_returns = (np.log(history) - np.log(history.shift(1)))[1:].dropna()                         # Drop the first day
        daily_returns['time'] = daily_returns.index

        ### Merge FF factors to returns dataframe based on dates available in return series
        daily_returns = daily_returns.merge(self.fama_french_factors_per_day, left_on = 'time', right_on = 'time')
        
        ### Run fama-french time-series regression and calculate historical moments
        column_list = list(daily_returns.columns)
        for symbol in self.fine:
            if str(symbol) not in column_list:
                self.Debug(f"Symbol not in return series >> {symbol}")
                self.predictor_list.append([str(symbol), self.Time, np.nan, np.nan])
                continue

            Y = (daily_returns[str(symbol)] - daily_returns['RF']).values
            X = daily_returns[['Mkt_RF', 'SMB', 'HML']].values
            X = sm.add_constant(X)
            results = sm.OLS(Y, X).fit()
            hist_skew, hist_vol = stats.skew(results.resid), stats.tstd(results.resid)  # Use daily residual to calculate monthly moments
            self.predictor_list.append([str(symbol), self.Time, hist_skew, hist_vol])

        self.predictor = pd.DataFrame(self.predictor_list, columns = ['symbol', 'time', 'skew', 'vol'])


    def _get_fama_french_factors(self):
        '''Download fama-french factors data from Github cloud and read it as a DataFrame.
           Data is originally from Prof French's official homepage. I unzip the data folder and upload to Github cloud.
        '''
        
        tmp_list = []
        data_str = self.Download("https://raw.githubusercontent.com/xinweimetrics/Tutorials/master/04%20Strategy%20Library/231%20Expected%20Idiosyncratic%20Skewness/F-F_Research_Data_Factors_daily.CSV")
        data_lines = data_str.splitlines()
        data_lines = data_lines[5:-2] # drop the first 5 and last 2 lines which are not data that we need
        for line in data_lines:
            data = line.split(',')
            # add one day to match QC behavior: daily data is timestamped at UTC 00:00 the next day from the actual day that produced the data
            tmp_list.append([datetime.strptime(data[0], "%Y%m%d") + timedelta(days = 1)] + [float(i) for i in data[1:]])

        self.fama_french_factors_per_day = pd.DataFrame(tmp_list, columns = ['time', 'Mkt_RF', 'SMB', 'HML', 'RF'])
        self.Debug(f"Downloading Fama-French data succeed! :: DataFrame shape >> {self.fama_french_factors_per_day.shape}")

import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import date, datetime, timedelta
from scipy import stats


class ExpectedIdiosyncraticSkewness(QCAlgorithm):
    '''Step 1. Calculating Fama-French daily regression residuals
       Step 2. Using daily residuals to calculate historical monthly moments
       Step 3. Run regression of historical monthly moments to estimate regression coefficients
       Step 4. Using historical monthly moments and estimated coefficients to calculate expected skewness
       Step 5. Sorting symbols by skewness and long the ones with lowest skewness
       
       Note: Fama-French factors data are only available up to 06/28/2019.
       So, backtest is implemented up to end of June, 2019. And, live trading is not feasible for current version.
       
       Reference: 
       [1] https://academic.oup.com/rfs/article-abstract/23/1/169/1578688?redirectedFrom=PDF
       [2] https://dr.library.brocku.ca/bitstream/handle/10464/6426/Brock_Cao_Xu_2015.pdf
       [3] Fama-French official data: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
    '''
    
    def Initialize(self):
        
        self.SetStartDate(2010, 11, 1)                                  # Set Start Date
        self.SetEndDate(2012, 2, 1)                                     # Set End Date
        self.SetCash(100000)                                            # Set Strategy Cash
        self.AddEquity("SPY", Resolution.Daily)                         # Used to check trading days
        
        self.__number_of_coarse_symbol = 50                             # Set the number of coarse symbol to be further filtered by expected skewness
        
        self.symbol_weight = pd.DataFrame()                             # DataFrame to save desired weights with symbols as index
        self.month = 0                                                  # Track current calendar month
        self.next_trading_day = 0                                       # Track next trading day

        self._get_fama_french_factors()                                 # Download Fama French factors data as a dataframe
        
        self.UniverseSettings.Resolution = Resolution.Daily             # Subscribe daily data for selected symbols in universe
        self.AddUniverse(self.CoarseSelectionAndSkewnessSorting)        # Coarse Selection and Skewness Sorting


    def CoarseSelectionAndSkewnessSorting(self, coarse):
        '''Coarse selection to get an initial fixed universe for the skewness sorting trade logic.
           Then, select the symbols to trade monthly based on skewness sorting.
        '''

        # if not last trading day at month end, return the unchanged universe
        self.month = (self.Time - timedelta(days = 1)).month
        self.next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        if self.month == self.next_trading_day.month: 
            return Universe.Unchanged
        
        self.Debug(f"Month end rebalance at: {self.Time}")
        
        ### Run the coarse selection to narrow down the universe
        # Sort descendingly by daily dollar volume
        sorted_by_volume = sorted(coarse, key = lambda x: x.DollarVolume, reverse = True)
        fine = [ x.Symbol for x in sorted_by_volume[:self.__number_of_coarse_symbol] ]
        self.Debug(f"fine >> {fine}")
        
        ### Select symbols to trade based on expected skewness at each month end
        
        # Estimate expected idiosyncratic skewness
        fine_and_skew = self.CalculateExpectedSkewness(fine)
        
        # Select the lowest quintile and calculate desired weights
        self.symbol_weight = pd.DataFrame(index = fine_and_skew.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05)]['symbol'])
        self.symbol_weight.loc[:,'weight'] = np.ones([len(self.symbol_weight), 1]) / len(self.symbol_weight)
        self.Debug(f"Selected symbols to trade >>\n {list(self.symbol_weight.index)}\n")
        
        return [self.Symbol(x) for x in self.symbol_weight.index] + [self.Symbol("SPY")]
        
    
    def OnSecuritiesChanged(self, changes):
        '''Liquidate symbols that are removed from the dynamic universe
        '''
        for security in changes.RemovedSecurities:
            if security.Invested:
                self.Liquidate(security.Symbol)


    def OnData(self, data):
        '''Rebalance at month end. Determine weights. Place orders.
        '''
        # if not last trading day at month end, return
        if self.month == self.next_trading_day.month: return
        
        # Placing orders
        for symbol, row in self.symbol_weight.iterrows():
            self.SetHoldings(symbol, row['weight'])

    
    def CalculateExpectedSkewness(self, fine):
        '''Calculate expected skewness using historical moments and estimated regression coefficients
        '''

        ### Get predictors
        
        # Get historical returns for two months
        monthEnd_this = self.Time
        monthEnd_lag_1 = (self.Time - timedelta(days = 1)).replace(day = 1)
        monthEnd_lag_2 = (monthEnd_lag_1 - timedelta(days = 1)).replace(day = 1)   # First day of last trading month
        self.Debug(f"this >> {monthEnd_this} :: lag1 >> {monthEnd_lag_1} :: lag2 >> {monthEnd_lag_2}")
        history = self.History(fine, monthEnd_lag_2 - timedelta(days = 1), monthEnd_this, Resolution.Daily)                         # Get one more day for price data
        # self.Debug(str(history))
        self.Debug(f"len history returns >> {len(history)}")
        history = history["close"].unstack(level = 0)
        #self.Debug(str(history))
        daily_returns = (np.log(history) - np.log(history.shift(1)))[1:]                                             # Drop the first day
        #self.Debug(str(daily_returns))
        self.Debug(f"len daily returns >> {len(daily_returns)}")

        # Merge Fama-French factors to daily returns based on dates available in return series
        daily_returns['time'] = daily_returns.index
        daily_returns = daily_returns.merge(self.fama_french_factors_per_day, left_on = 'time', right_on = 'time')
        
        self.Debug(str(daily_returns))
        
        daily_returns_this = daily_returns[daily_returns['time'] > monthEnd_lag_1]
        daily_returns_last = daily_returns[daily_returns['time'] <= monthEnd_lag_1]
        self.Debug(f"this len >> {len(daily_returns_this)} :: last len >> {len(daily_returns_last)}")
        daily_returns_dict = {monthEnd_this: daily_returns_this, monthEnd_lag_1: daily_returns_last}
        
        # For each stock and each month, run fama-french time-series regression and calculate historical moments
        column_list = list(daily_returns.columns)
        predictor_list = []
        
        for month, returns in daily_returns_dict.items():
            
            self.Debug(str(returns))
            
            for symbol in fine:
                
                if str(symbol) not in column_list:
                    # self.Debug(f"Symbol not in return series >> {symbol}")
                    predictor_list.append([str(symbol), month, np.nan, np.nan])
                    continue

                Y = (returns[str(symbol)] - returns['RF']).values
                X = returns[['Mkt_RF', 'SMB', 'HML']].values
                X = sm.add_constant(X)
                results = sm.OLS(Y, X).fit()
            
                hist_skew, hist_vol = stats.skew(results.resid), stats.tstd(results.resid)  # Use daily residual to calculate monthly moments
            
                predictor_list.append([str(symbol), month, hist_skew, hist_vol])

        predictor = pd.DataFrame(predictor_list, columns = ['symbol', 'time', 'skew', 'vol'])

        
        ### Estimate coefficients by regressing current skewness on historical moments
        Y = predictor[predictor['time'] == monthEnd_this]['skew'].values
        X = predictor[predictor['time'] == monthEnd_lag_1][['skew', 'vol']].values
        X = sm.add_constant(X)
        results = sm.OLS(Y, X, missing = 'drop').fit()
        coef = results.params
        
        ### Calculate expected skewness
        predictor_t = predictor[predictor['time'] == monthEnd_this][['skew', 'vol']].values
        ones = np.ones([len(predictor_t), 1])
        predictor_t = np.append(ones, predictor_t, 1)
        exp_skew = np.inner(predictor_t, coef)
        
        skew_df = predictor[predictor['time'] == monthEnd_this][['symbol']].reset_index(drop = True)
        skew_df.loc[:,'skew'] = exp_skew
        skew_df = skew_df.sort_values(by = ['skew']).reset_index(drop = True)
        
        return skew_df


    def _get_fama_french_factors(self):
        '''Download fama-french factors data from Github cloud and read it as a DataFrame.
           Data is originally from Prof French's official homepage. I unzip the data folder and upload to Github cloud.
        '''
        
        tmp_list = []
        data_str = self.Download("https://raw.githubusercontent.com/xinweimetrics/Tutorials/master/04%20Strategy%20Library/231%20Expected%20Idiosyncratic%20Skewness/F-F_Research_Data_Factors_daily.CSV")
        data_lines = data_str.splitlines()
        data_lines = data_lines[5:-2] # drop the first 5 and last 2 lines which are not data that we need
        for line in data_lines:
            data = line.split(',')
            # add one day to match QC behavior: daily data is timestamped at UTC 00:00 the next day from the actual day that produced the data
            tmp_list.append([datetime.strptime(data[0], "%Y%m%d") + timedelta(days = 1)] + [float(i) for i in data[1:]])

        self.fama_french_factors_per_day = pd.DataFrame(tmp_list, columns = ['time', 'Mkt_RF', 'SMB', 'HML', 'RF'])
        self.Debug(f"Downloading Fama-French data succeed! :: DataFrame shape >> {self.fama_french_factors_per_day.shape}")

import math
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import date, datetime, timedelta
from scipy import stats


class ExpectedIdiosyncraticSkewness(QCAlgorithm):
    '''Step 1. Calculating Fama-French daily regression residuals
       Step 2. Using daily residuals to calculate historical monthly moments
       Step 3. Run regression of historical monthly moments to estimate regression coefficients
       Step 4. Using historical monthly moments and estimated coefficients to calculate expected skewness
       Step 5. Sorting symbols by skewness and long the ones with lowest skewness
       
       Note: Fama-French factors data are only available up to 06/28/2019.
       So, backtest is implemented up to end of June, 2019. And, live trading is not feasible for current version.
       
       Reference: 
       [1] "Expected Idiosyncratic Skewness" by Boyer, Mitton and Vorkink, Rev Financ Stud, June 2009
           URL: https://academic.oup.com/rfs/article-abstract/23/1/169/1578688?redirectedFrom=PDF
       [2] https://dr.library.brocku.ca/bitstream/handle/10464/6426/Brock_Cao_Xu_2015.pdf
       [3] Fama-French official data: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
    '''
    
    def Initialize(self):
        
        self.SetStartDate(2009, 7, 1)                                   # Set Start Date: Right after original paper published
        self.SetEndDate(2019, 7, 1)                                     # Set End Date
        self.SetCash(100000)                                            # Set Strategy Cash
        self.AddEquity("SPY", Resolution.Daily)                         # Used to check trading days
        
        self.__number_of_coarse_symbol = 200                            # Set the number of coarse symbol to be further filtered by expected skewness
        
        self.symbol_weight = pd.DataFrame()                             # DataFrame to save desired weights with symbols as index
        self.month = 0                                                  # Track current calendar month
        self.next_trading_day = 0                                       # Track next trading day

        self._get_fama_french_factors()                                 # Download Fama French factors data as a dataframe
        
        self.UniverseSettings.Resolution = Resolution.Daily             # Subscribe daily data for selected symbols in universe
        self.AddUniverse(self.CoarseSelectionAndSkewnessSorting, self.GetWeightsInFineSelection)


    def CoarseSelectionAndSkewnessSorting(self, coarse):
        '''Coarse selection to get an initial fixed universe for the skewness sorting trade logic.
           Then, select the symbols to trade monthly based on skewness sorting.
        '''

        # if not last trading day at month end, return the unchanged universe
        self.month = (self.Time - timedelta(days = 1)).month
        self.next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        if self.month == self.next_trading_day.month: 
            return Universe.Unchanged
        
        self.Debug(f"Month end rebalance at: {self.Time}")
        
        ### Run the coarse selection to narrow down the universe
        # Sort descendingly by daily dollar volume & filter by fundamental data
        sorted_by_volume = sorted(coarse, key = lambda x: x.DollarVolume, reverse = True)
        filtered = [ x.Symbol for x in sorted_by_volume if x.HasFundamentalData and x.Price > 5 ]
        high_volume_stocks = filtered[:self.__number_of_coarse_symbol]
        
        ### Select symbols to trade based on expected skewness at each month end
        
        # Estimate expected idiosyncratic skewness
        symbol_and_skew = self.CalculateExpectedSkewness(high_volume_stocks)
        
        # Select the lowest quintile
        # fine = [ self.Symbol(x) for x in symbol_and_skew.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05)]['symbol'] ]
        self.symbol_weight = pd.DataFrame(index = symbol_and_skew.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05)]['symbol'], columns = ['cap', 'weight'])
        # self.symbol_weight.loc[:,'weight'] = np.ones([len(self.symbol_weight), 1]) / len(self.symbol_weight)
        # self.Debug(f"Selected symbols to trade >>\n {fine}\n")
        
        # return [self.Symbol(x) for x in self.symbol_weight.index] + [self.Symbol("SPY")]
        return [self.Symbol(x) for x in self.symbol_weight.index]
        # return fine
        
    def GetWeightsInFineSelection(self, fine):
        
        # self.symbol_weight = pd.DataFrame(index = fine, columns = ['cap', 'weight'])
        len_fine = len(self.symbol_weight)
        
        # fine = [ x for x in fine if x.EarningReports.BasicAverageShares.OneMonth > 0]
        
        i = 0
        for stock in fine:
            self.symbol_weight.iloc[i]['cap'] = stock.EarningReports.BasicAverageShares.ThreeMonths * stock.Price
            i += 1
        
        total_cap = self.symbol_weight['cap'].sum()
        self.symbol_weight['weight'] = self.symbol_weight['cap'] / total_cap
        
        return [ x.Symbol for x in fine ] + [self.Symbol("SPY")]
        
    
    def OnSecuritiesChanged(self, changes):
        '''Liquidate symbols that are removed from the dynamic universe
        '''
        for security in changes.RemovedSecurities:
            if security.Invested:
                self.Liquidate(security.Symbol)


    def OnData(self, data):
        '''Rebalance at month end. Determine weights. Place orders.
        '''
        # if not last trading day at month end, return
        if self.month == self.next_trading_day.month: return
        
        # Placing orders
        for symbol, row in self.symbol_weight.iterrows():
            self.SetHoldings(symbol, row['weight'])

    
    def CalculateExpectedSkewness(self, fine):
        '''Calculate expected skewness using historical moments and estimated regression coefficients
        '''

        ### Get predictors
        
        # Get historical returns for two months
        monthEnd_this = self.Time
        monthEnd_lag_1 = (self.Time - timedelta(days = 1)).replace(day = 1)
        monthEnd_lag_2 = (monthEnd_lag_1 - timedelta(days = 1)).replace(day = 1)   # First day of last trading month
        history = self.History(fine, monthEnd_lag_2 - timedelta(days = 1), monthEnd_this, Resolution.Daily)                         # Get one more day for price data
        history = history["close"].unstack(level = 0)
        daily_returns = (np.log(history) - np.log(history.shift(1)))[1:]                                             # Drop the first day

        # Merge Fama-French factors to daily returns based on dates available in return series
        daily_returns['time'] = daily_returns.index
        daily_returns = daily_returns.merge(self.fama_french_factors_per_day, left_on = 'time', right_on = 'time')
        
        daily_returns_this = daily_returns[daily_returns['time'] > monthEnd_lag_1]
        daily_returns_last = daily_returns[daily_returns['time'] <= monthEnd_lag_1]
        daily_returns_dict = {monthEnd_this: daily_returns_this, monthEnd_lag_1: daily_returns_last}
        
        # For each stock and each month, run fama-french time-series regression and calculate historical moments
        column_list = list(daily_returns.columns)
        predictor_list = []
        
        for month, returns in daily_returns_dict.items():
            
            for symbol in fine:
                
                if str(symbol) not in column_list:
                    # self.Debug(f"Symbol not in return series >> {symbol}")
                    predictor_list.append([str(symbol), month, np.nan, np.nan])
                    continue

                Y = (returns[str(symbol)] - returns['RF']).values
                X = returns[['Mkt_RF', 'SMB', 'HML']].values
                X = sm.add_constant(X)
                results = sm.OLS(Y, X).fit()
            
                hist_skew, hist_vol = stats.skew(results.resid), stats.tstd(results.resid)  # Use daily residual to calculate monthly moments
            
                predictor_list.append([str(symbol), month, hist_skew, hist_vol])

        predictor = pd.DataFrame(predictor_list, columns = ['symbol', 'time', 'skew', 'vol'])

        
        ### Estimate coefficients by regressing current skewness on historical moments
        Y = predictor[predictor['time'] == monthEnd_this]['skew'].values
        X = predictor[predictor['time'] == monthEnd_lag_1][['skew', 'vol']].values
        X = sm.add_constant(X)
        results = sm.OLS(Y, X, missing = 'drop').fit()
        coef = results.params
        
        ### Calculate expected skewness
        predictor_t = predictor[predictor['time'] == monthEnd_this][['skew', 'vol']].values
        ones = np.ones([len(predictor_t), 1])
        predictor_t = np.append(ones, predictor_t, 1)
        exp_skew = np.inner(predictor_t, coef)
        
        skew_df = predictor[predictor['time'] == monthEnd_this][['symbol']].reset_index(drop = True)
        skew_df.loc[:,'skew'] = exp_skew
        skew_df = skew_df.sort_values(by = ['skew']).reset_index(drop = True)
        
        return skew_df


    def _get_fama_french_factors(self):
        '''Download fama-french factors data from Github cloud and read it as a DataFrame.
           Data is originally from Prof French's official homepage. I unzip the data folder and upload to Github cloud.
        '''
        
        tmp_list = []
        data_str = self.Download("https://raw.githubusercontent.com/xinweimetrics/Tutorials/master/04%20Strategy%20Library/231%20Expected%20Idiosyncratic%20Skewness/F-F_Research_Data_Factors_daily.CSV")
        data_lines = data_str.splitlines()
        data_lines = data_lines[5:-2] # drop the first 5 and last 2 lines which are not data that we need
        for line in data_lines:
            data = line.split(',')
            # add one day to match QC behavior: daily data is timestamped at UTC 00:00 the next day from the actual day that produced the data
            tmp_list.append([datetime.strptime(data[0], "%Y%m%d") + timedelta(days = 1)] + [float(i) for i in data[1:]])

        self.fama_french_factors_per_day = pd.DataFrame(tmp_list, columns = ['time', 'Mkt_RF', 'SMB', 'HML', 'RF'])
        self.Debug(f"Downloading Fama-French data succeed! :: DataFrame shape >> {self.fama_french_factors_per_day.shape}")# Your New Python File

import pandas as pd
from pandas.tseries.offsets import BMonthEnd
from datetime import date, datetime, timedelta
import numpy as np
import statsmodels.api as sm
from scipy import stats
import math

class ExpectedIdiosyncraticSkewness(QCAlgorithm):
    '''Step 1. Calculating Fama-French daily regression residuals
       Step 2. Using daily residuals to calculate historical monthly moments
       Step 3. Run regression of historical monthly moments to estimate regression coefficients
       Step 4. Using historical monthly moments and estimated coefficients to calculate expected skewness
       Step 5. Sorting symbols by skewness and long the ones with lowest skewness
       
       Note: Fama-French factors data are only available online up to 06/28/2019.
       So, backtest is implemented up to end of June, 2019. And, live trading is not feasible for current version.
       
       Reference: 
       [1] https://academic.oup.com/rfs/article-abstract/23/1/169/1578688?redirectedFrom=PDF
       [2] https://dr.library.brocku.ca/bitstream/handle/10464/6426/Brock_Cao_Xu_2015.pdf
    '''
    
    def Initialize(self):
        
        # Alex: No need since this is the default
        self.SetTimeZone("America/New_York")         # Set Timezone   
        self.SetStartDate(2014, 7, 1)                # Set Start Date
        self.SetEndDate(2014, 9, 1)                  # Set End Date
        self.SetCash(100000)                         # Set Strategy Cash
        
        self.__number_of_coarse_symbol = 200         # Set the number of coarse symbol to be further filtered by expected skewness
        self.predictor_list = []
        self.fine = []
        
        # Initial selection to narrow down QC universe. Skewness sorting strategy will be implemented for this fixed universe.
        self.initial_selection = True                # Control flags
        self.UniverseSettings.Resolution = Resolution.Daily
        self.AddUniverse(self.CoarseSelectionFunction, self.FineSelectionFunction)

        # Download Fama French factors data as a dataframe
        self._get_fama_french_factors()

        # Rebalance at month end
        self.AddEquity("SPY", Resolution.Daily)
        self.Schedule.On(self.DateRules.MonthEnd("SPY"), self.TimeRules.AfterMarketOpen("SPY", 1), self.MonthEndRebalance)


    def CoarseSelectionFunction(self, coarse):
        '''Coarse selection to get an initial universe for the skewness sorting logic'''

        # Only run the universe selection once at the beginning of strategy
        if not self.initial_selection: return self.fine
        
        # Select symbols with fundamental data
        coarse_with_fundamental = [x for x in coarse if x.HasFundamentalData]

        # Sort descending by daily dollar volume
        sorted_by_volume = sorted(coarse_with_fundamental, key = lambda x: x.DollarVolume, reverse = True)
        self.fine = [ x.Symbol for x in sorted_by_volume[:self.__number_of_coarse_symbol] ]

        return self.fine

    # Alex: This is a tutorial, not documentation, we don't need to leave optionals 
    def FineSelectionFunction(self, fine):
        '''Optional: can add more filter in fine selection'''

        if not self.initial_selection: return self.fine

        self.initial_selection = False
        
        return self.fine


    # Alex: If OnData has only pass, remove it.
    def OnData(self, data):
        '''OnData event is the primary entry point for your algorithm. Each new data point will be pumped in here.
            Arguments:
                data: Slice object keyed by symbol containing the stock data
        '''
        pass


    def MonthEndRebalance(self):
        '''Rebalance portfolio at month end based on skewness sorting function
        '''
        
        self.Debug(f"Month End Rebalance at: {self.Time}")

        # Select symbols based on ranking of expected skewness
        self.symbol_to_trade = self.SkewnessSortingFunction()
        self.Debug(f"Selected symbols to trade >>\n {self.symbol_to_trade}\n")

        # Determine weights
        weights = self.PortfolioWeights()
        
        self.Liquidate()
        for symbol in self.symbol_to_trade:
            weight_i = weights[symbol]
            self.SetHoldings(symbol, weight_i)
        
    
    def SkewnessSortingFunction(self):
        '''Sort symbols based on expected skewness at each month end'''

        # Get historical monthly moments
        self._get_historical_moments()
        
        # Get coefficient from regression
        coef = self._get_skewness_coef()
        
        # Estimate expected idiosyncratic skewness
        skewness = self.ExpectedSkewness(coef)
        
        # Sort symbols by skewness
        skewness_sorted = skewness.sort_values(by = ['skew']).reset_index(drop = True)
        self.Debug(f"Symbols sorted by skewness (print 20 symbols) >>\n {skewness_sorted[:20]}")
        
        # Select the lowest quintile
        self.low_skew = skewness_sorted.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05), 'symbol']

        return self.low_skew
    
    def ExpectedSkewness(self, coef):
        '''Calculate expected skewness using historical moments and estimated regression coefficients
        '''

        this_month = self.predictor['time'].iloc[-1]
        data_t = self.predictor[self.predictor['time'] == this_month][['skew', 'vol']].values
        ones = np.ones([len(data_t), 1])
        data_t = np.append(ones, data_t, 1)                                             # Adding constants
        exp_skew = np.inner(data_t, coef)
        # Return a df with key of symbols and value of skewness
        skew_df = self.predictor[self.predictor['time'] == this_month][['symbol']].reset_index(drop = True)
        skew_df.loc[:,'skew'] = exp_skew
        
        return skew_df

    def PortfolioWeights(self):
        '''Construct equal-weighted portfolio'''

        weights = {}
        for symbol in self.symbol_to_trade:
            weights[symbol] = 1 / len(self.symbol_to_trade)

        return weights

    def _get_skewness_coef(self):
        '''Regress current skewness on historical moments to get regression coefficients,
           which are used to calculate expected skewness
        '''
        
        if len(self.predictor['time'].unique()) == 1: 
            return [0, 1, 0]

        # Run regression and return coefficients
        this_month = self.predictor['time'].iloc[-1]
        last_month = self.predictor['time'].unique()[-2]
        Y = self.predictor[self.predictor['time'] == this_month]['skew'].values
        X = self.predictor[self.predictor['time'] == last_month][['skew', 'vol']].values
        X = sm.add_constant(X)                                                          # Adding a constant vector
        results = sm.OLS(Y, X, missing = 'drop').fit()
        beta = results.params
        
        return beta
        
    def _get_historical_moments(self):
        '''Regress daily excess return on excess market return, SMB, and HML to get Fama-French regression residuals.
           Then, use residuals to calculate historical moments.
        '''
        
        # Get historical returns for current month
        end_day = self.Time
        start_day = self.Time.replace(day = 1)                                          # Get first day of current month
        daily_returns = self._get_historical_returns(self.fine, start_day, end_day)

        # Merge FF factors to returns dataframe based on dates available in return series
        daily_returns = daily_returns.merge(self.fama_french_factors_per_day, left_on = 'time', right_on = 'time')
        column_list = list(daily_returns.columns)
        
        for symbol in self.fine:
            # Run fama-french time-series regression
            if str(symbol) not in column_list:
                self.Debug(f"Symbol not in return series >> {symbol}")
                self.predictor_list.append([str(symbol), self.Time, np.nan, np.nan])
                continue

            Y = (daily_returns[str(symbol)] - daily_returns['RF']).values
            X = daily_returns[['Mkt_RF', 'SMB', 'HML']].values
            X = sm.add_constant(X)                                                      # Adding a constant vector
            results = sm.OLS(Y, X).fit()
            hist_skew, hist_vol = stats.skew(results.resid), stats.tstd(results.resid)  # Use daily residual to calculate monthly moments
            self.predictor_list.append([str(symbol), self.Time, hist_skew, hist_vol])

        self.predictor = pd.DataFrame(self.predictor_list, columns = ['symbol', 'time', 'skew', 'vol'])
        
        return []


    def _get_historical_returns(self, symbols, start_day, end_day):
        '''Get historical returns for a given set of symbols and a given period'''

        history = self.History(symbols, start_day - timedelta(days = 1), end_day, Resolution.Daily)     # Get one more day for price data
        history = history.close.unstack(level = 0)
        historical_returns = (np.log(history) - np.log(history.shift(1)))[1:].dropna()                  # Drop the first day
        historical_returns['time'] = historical_returns.index                                           # Convert index 'time' into a column

        return historical_returns

    def _get_fama_french_factors(self):
        '''Download fama-french factors data from Dropbox and read it as a DataFrame'''
        
        tmp_list = []
        # data_str = self.Download("https://www.dropbox.com/s/rpob7ehzheuym7z/F-F_Research_Data_Factors_daily.CSV?dl=1")
        # https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_TXT.zip
        data_str = self.Download("https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_TXT.zip")
        data_lines = data_str.splitlines()
        data_lines = data_lines[5:-2] # drop the first 5 and last 2 lines which are not data that we need
        for line in data_lines:
            data = line.split(',')
            # add one day to match QC behavior: daily data is timestamped at UTC 00:00 the next day from the actual day that produced the data
            tmp_list.append([datetime.strptime(data[0], "%Y%m%d") + timedelta(days = 1)] + [float(i) for i in data[1:]])

        self.fama_french_factors_per_day = pd.DataFrame(tmp_list, columns = ['time', 'Mkt_RF', 'SMB', 'HML', 'RF'])
        self.Debug(f"Downloading Fama-French data succeed! :: DataFrame shape >> {self.fama_french_factors_per_day.shape}")

import pandas as pd
from pandas.tseries.offsets import BMonthEnd
from datetime import date, datetime, timedelta
import numpy as np
import statsmodels.api as sm
from scipy import stats
import math

class ExpectedIdiosyncraticSkewness(QCAlgorithm):
    '''Step 1. Calculating Fama-French daily regression residuals
       Step 2. Using daily residuals to calculate historical monthly moments
       Step 3. Run regression of historical monthly moments to estimate regression coefficients
       Step 4. Using historical monthly moments and estimated coefficients to calculate expected skewness
       Step 5. Sorting symbols by skewness and long the ones with lowest skewness
       
       Note: Fama-French factors data are only available online up to 06/28/2019.
       So, backtest is implemented up to end of June, 2019. And, live trading is not feasible for current version.
       
       Reference: 
       [1] https://academic.oup.com/rfs/article-abstract/23/1/169/1578688?redirectedFrom=PDF
       [2] https://dr.library.brocku.ca/bitstream/handle/10464/6426/Brock_Cao_Xu_2015.pdf
    '''
    
    def Initialize(self):
        
        self.SetStartDate(2018, 1, 2)                # Set Start Date
        self.SetEndDate(2019, 7, 1)                  # Set End Date
        self.SetCash(100000)                         # Set Strategy Cash
        
        self.__number_of_coarse_symbol = 50         # Set the number of coarse symbol to be further filtered by expected skewness
        self.predictor_list = []
        self.fine = []
        
        # Initial selection to narrow down QC universe. Skewness sorting strategy will be implemented for this fixed universe.
        self.initial_selection = True                # Control flags
        self.next_rebalance = False
        self.select_done = False
        self.UniverseSettings.Resolution = Resolution.Daily
        self.AddUniverse(self.CoarseSelectionAndSkewnessSorting)

        # Download Fama French factors data as a dataframe
        self._get_fama_french_factors()

        # Rebalance at month end
        self.AddEquity("SPY", Resolution.Daily)
        # self.Schedule.On(self.DateRules.MonthEnd("SPY"), self.TimeRules.AfterMarketOpen("SPY", 1), self.RebalanceSignal)
        # self.Schedule.On(self.DateRules.MonthEnd("SPY"), self.TimeRules.At(0, 0), self.RebalanceSignal)


    def CoarseSelectionAndSkewnessSorting(self, coarse):
        '''Coarse selection to get an initial universe for the skewness sorting logic'''

        self.Debug(f"Universe Selection >> {self.Time}")

        ### Only run the coarse universe selection once at the beginning of strategy
        if self.initial_selection:
            # Select symbols with fundamental data
            coarse_with_fundamental = [x for x in coarse if x.HasFundamentalData]

            # Sort descending by daily dollar volume
            sorted_by_volume = sorted(coarse_with_fundamental, key = lambda x: x.DollarVolume, reverse = True)
            
            self.fine = [ x.Symbol for x in sorted_by_volume[:self.__number_of_coarse_symbol] ]
            
            self.initial_selection = False
            
        ### Select symbols to trade based on expected skewness at each month end
        
        # if not self.next_rebalance: return Universe.Unchanged
        next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        self.month = (self.Time - timedelta(days = 1)).month
        if self.month == next_trading_day.month: return Universe.Unchanged
        
        self.Debug(f"Monthly rebalance at: {self.Time}")
        
        # Get historical monthly moments
        self._get_historical_moments()
        
        # Get coefficient from regression
        coef = self._get_skewness_coef()
        
        # Estimate expected idiosyncratic skewness
        skewness = self.ExpectedSkewness(coef)
        
        # Sort symbols by skewness
        skewness_sorted = skewness.sort_values(by = ['skew']).reset_index(drop = True)
        self.Debug(f"Symbols sorted by skewness (print 20 symbols) >>\n {skewness_sorted[:20]}")
        
        # Select the lowest quintile
        self.low_skew = skewness_sorted.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05), 'symbol']
        #self.next_rebalance = False
        #self.select_done = True
        
        selected = [self.Symbol(x) for x in self.low_skew]
        self.Debug(f"return type >> {selected[0]}")

        self.Debug(f"selection done >> {self.Time} :: {type(self.low_skew)} :: {self.low_skew}")
        
        return selected


    # def RebalanceSignal(self):
        
    #     self.Debug(f"Rebalance Signal >> {self.Time}")
    #     self.next_rebalance = True

    def OnData(self, data):
        '''Rebalance portfolio at month end based on skewness sorting function
        '''
        self.Debug(f"On Data >> {self.Time}")
        # if not self.select_done: return
        next_trading_day = self.Securities["SPY"].Exchange.Hours.GetNextMarketOpen(self.Time, False)
        self.month = (self.Time - timedelta(days = 1)).month
        if self.month == next_trading_day.month: return
    
        self.Debug(f"Month End Rebalance at: {self.Time}")

        # Select symbols based on ranking of expected skewness
        #self.symbol_to_trade = self.SkewnessSortingFunction()
        #self.Debug(f"Selected symbols to trade >>\n {self.symbol_to_trade}\n")
        self.Debug(f"Selected symbols to trade >>\n {self.low_skew}\n")

        # Determine weights
        weights = self.PortfolioWeights()
        
        self.Liquidate()
        #for symbol in self.symbol_to_trade:
        for symbol in self.low_skew:
            weight_i = weights[symbol]
            # self.Debug(f"weight >> {weight_i} :: symbol type >> {type(symbol)} >> {symbol}")
            self.SetHoldings(symbol, weight_i)
        
        # self.select_done = False
    
    # def SkewnessSortingFunction(self):
    #     '''Sort symbols based on expected skewness at each month end'''

    #     # Get historical monthly moments
    #     self._get_historical_moments()
        
    #     # Get coefficient from regression
    #     coef = self._get_skewness_coef()
        
    #     # Estimate expected idiosyncratic skewness
    #     skewness = self.ExpectedSkewness(coef)
        
    #     # Sort symbols by skewness
    #     skewness_sorted = skewness.sort_values(by = ['skew']).reset_index(drop = True)
    #     self.Debug(f"Symbols sorted by skewness (print 20 symbols) >>\n {skewness_sorted[:20]}")
        
    #     # Select the lowest quintile
    #     self.low_skew = skewness_sorted.loc[:math.ceil(self.__number_of_coarse_symbol * 0.05), 'symbol']

    #     return self.low_skew
    
    def ExpectedSkewness(self, coef):
        '''Calculate expected skewness using historical moments and estimated regression coefficients
        '''

        this_month = self.predictor['time'].iloc[-1]
        data_t = self.predictor[self.predictor['time'] == this_month][['skew', 'vol']].values
        ones = np.ones([len(data_t), 1])
        data_t = np.append(ones, data_t, 1)                                             # Adding constants
        exp_skew = np.inner(data_t, coef)
        # Return a df with key of symbols and value of skewness
        skew_df = self.predictor[self.predictor['time'] == this_month][['symbol']].reset_index(drop = True)
        skew_df.loc[:,'skew'] = exp_skew
        
        return skew_df

    def PortfolioWeights(self):
        '''Construct equal-weighted portfolio'''

        weights = {}
        for symbol in self.low_skew:
            weights[symbol] = 1 / len(self.low_skew)

        return weights

    def _get_skewness_coef(self):
        '''Regress current skewness on historical moments to get regression coefficients,
           which are used to calculate expected skewness
        '''
        
        if len(self.predictor['time'].unique()) == 1: 
            return [0, 1, 0]

        # Run regression and return coefficients
        this_month = self.predictor['time'].iloc[-1]
        last_month = self.predictor['time'].unique()[-2]
        Y = self.predictor[self.predictor['time'] == this_month]['skew'].values
        X = self.predictor[self.predictor['time'] == last_month][['skew', 'vol']].values
        X = sm.add_constant(X)                                                          # Adding a constant vector
        results = sm.OLS(Y, X, missing = 'drop').fit()
        beta = results.params
        
        return beta
        
    def _get_historical_moments(self):
        '''Regress daily excess return on excess market return, SMB, and HML to get Fama-French regression residuals.
           Then, use residuals to calculate historical moments.
        '''
        
        # Get historical returns for current month
        end_day = self.Time
        start_day = (self.Time - timedelta(days = 1)).replace(day = 1)                                      # Get first day of current month
        history = self.History(self.fine, start_day - timedelta(days = 1), end_day, Resolution.Daily)     # Get one more day for price data
        history = history.close.unstack(level = 0)
        daily_returns = (np.log(history) - np.log(history.shift(1)))[1:].dropna()                  # Drop the first day
        daily_returns['time'] = daily_returns.index                                           # Convert index 'time' into a column

        # Merge FF factors to returns dataframe based on dates available in return series
        daily_returns = daily_returns.merge(self.fama_french_factors_per_day, left_on = 'time', right_on = 'time')
        column_list = list(daily_returns.columns)
        
        for symbol in self.fine:
            # Run fama-french time-series regression
            if str(symbol) not in column_list:
                self.Debug(f"Symbol not in return series >> {symbol}")
                self.predictor_list.append([str(symbol), self.Time, np.nan, np.nan])
                continue

            Y = (daily_returns[str(symbol)] - daily_returns['RF']).values
            X = daily_returns[['Mkt_RF', 'SMB', 'HML']].values
            X = sm.add_constant(X)                                                      # Adding a constant vector
            results = sm.OLS(Y, X).fit()
            hist_skew, hist_vol = stats.skew(results.resid), stats.tstd(results.resid)  # Use daily residual to calculate monthly moments
            self.predictor_list.append([str(symbol), self.Time, hist_skew, hist_vol])

        self.predictor = pd.DataFrame(self.predictor_list, columns = ['symbol', 'time', 'skew', 'vol'])
        
        return []


    def _get_fama_french_factors(self):
        '''Download fama-french factors data from Dropbox and read it as a DataFrame'''
        
        tmp_list = []
        data_str = self.Download("https://raw.githubusercontent.com/xinweimetrics/Tutorials/master/04%20Strategy%20Library/231%20Expected%20Idiosyncratic%20Skewness/F-F_Research_Data_Factors_daily.CSV")
        data_lines = data_str.splitlines()
        data_lines = data_lines[5:-2] # drop the first 5 and last 2 lines which are not data that we need
        for line in data_lines:
            data = line.split(',')
            # add one day to match QC behavior: daily data is timestamped at UTC 00:00 the next day from the actual day that produced the data
            tmp_list.append([datetime.strptime(data[0], "%Y%m%d") + timedelta(days = 1)] + [float(i) for i in data[1:]])

        self.fama_french_factors_per_day = pd.DataFrame(tmp_list, columns = ['time', 'Mkt_RF', 'SMB', 'HML', 'RF'])
        self.Debug(f"Downloading Fama-French data succeed! :: DataFrame shape >> {self.fama_french_factors_per_day.shape}")