Overall Statistics
Total Orders
14
Average Win
9.04%
Average Loss
-9.56%
Compounding Annual Return
14.613%
Drawdown
35.800%
Expectancy
-0.027
Start Equity
100000
End Equity
114570.54
Net Profit
14.571%
Sharpe Ratio
0.495
Sortino Ratio
0.547
Probabilistic Sharpe Ratio
25.745%
Loss Rate
50%
Win Rate
50%
Profit-Loss Ratio
0.95
Alpha
0.08
Beta
-1.626
Annual Standard Deviation
0.574
Annual Variance
0.329
Information Ratio
0.578
Tracking Error
0.708
Treynor Ratio
-0.175
Total Fees
$59.62
Estimated Strategy Capacity
$260000000.00
Lowest Capacity Asset
SQQQ UK280CGTCB51
Portfolio Turnover
2.99%
# region imports
from AlgorithmImports import *

import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer, set_seed
from pathlib import Path
from datasets import Dataset
import pytz
import torch
# endregion

class FinbertBaseModelAlgorithm(QCAlgorithm):
    """Trade one volatile, liquid equity on fine-tuned FinBERT news sentiment.

    The universe is narrowed to the most volatile of the ten assets with the
    highest dollar volume. On each rebalance the pre-trained FinBERT model is
    fine-tuned on the asset's recent Tiingo news articles, each labelled by
    the price move that followed it, and the aggregated sentiment of those
    articles decides between a 100% long and a 25% short position.
    """

    def initialize(self):
        """Configure the backtest, universe, schedule, warm-up and tokenizer."""
        self.set_start_date(2022, 1, 1)
        self.set_end_date(2023, 1, 1)
        self.set_cash(100_000)

        spy = Symbol.create("SPY", SecurityType.EQUITY, Market.USA)
        # Run universe selection on the month-start rule of SPY.
        self.universe_settings.resolution = Resolution.DAILY
        self.universe_settings.schedule.on(self.date_rules.month_start(spy))
        self._universe = self.add_universe(self._select_assets)

        set_seed(1, True)

        # datetime.min guarantees the first _trade call passes the
        # 14-day spacing check.
        self._last_rebalance_time = datetime.min
        # month_start(spy, 1): the month-start rule with an offset of 1,
        # presumably so trading runs after the universe has refreshed.
        self.schedule.on(
            self.date_rules.month_start(spy, 1),
            self.time_rules.midnight,
            self._trade
        )

        self.set_warm_up(timedelta(30))

        self._model_name = "ProsusAI/finbert"
        self._tokenizer = BertTokenizer.from_pretrained(self._model_name)

    def _select_assets(self, fundamental):
        """Select the most volatile of the ten highest dollar-volume assets.

        Volatility is the standard deviation of daily close-to-close returns
        over the trailing 365 days.

        Args:
            fundamental: Iterable of fundamental objects exposing `symbol`
                and `dollar_volume`.

        Returns:
            A single-element list holding the selected symbol.
        """
        most_liquid = sorted(fundamental, key=lambda f: f.dollar_volume)[-10:]
        daily_returns = self.history(
            [f.symbol for f in most_liquid], timedelta(365), Resolution.DAILY
        )['close'].unstack(0).pct_change().iloc[1:]
        return [daily_returns.std().idxmax()]

    def on_warmup_finished(self):
        """Attempt a rebalance as soon as the warm-up period ends."""
        self._trade()

    def on_securities_changed(self, changes):
        """Keep the Tiingo news subscriptions in step with the universe.

        Args:
            changes: Security changes exposing `removed_securities` and
                `added_securities`.
        """
        for security in changes.removed_securities:
            self.remove_security(security.dataset_symbol)
        for security in changes.added_securities:
            # Remember the news symbol on the security so that _trade and
            # the removal branch above can find it later.
            security.dataset_symbol = self.add_data(
                TiingoNews, security.symbol
            ).symbol

    def _trade(self):
        """Fine-tune FinBERT on recent news and rebalance the position.

        Returns without trading while warming up, within 14 days of the last
        rebalance, when the universe has no selected asset, or when either
        the news or the price history is empty. Liquidates when fewer than
        10 training samples are available.
        """
        if (self.is_warming_up or
            self.time - self._last_rebalance_time < timedelta(14)):
            return

        # Get the target security; there is nothing to trade until the
        # universe has selected one.
        selected = self._universe.selected
        symbols = list(selected) if selected is not None else []
        if not symbols:
            return
        security = self.securities[symbols[0]]

        # Get samples to fine-tune the model.
        samples = self._build_samples(security)
        if samples is None:
            return
        if samples.shape[0] < 10:
            self.liquidate()
            return

        model = self._fine_tune(self._label_samples(samples))

        # Prepare the input sentences. `samples` keeps the order in which
        # the articles appear in the news history, which the aggregation
        # weights below rely on.
        inputs = self._tokenizer(
            samples['text'].tolist(), padding=True, truncation=True,
            return_tensors='tf'
        )

        # Get the model outputs.
        outputs = model(**inputs)

        # Apply softmax to the outputs to get probabilities.
        scores = tf.nn.softmax(outputs.logits, axis=-1).numpy()
        scores = self._aggregate_sentiment_scores(scores)

        self.plot("Sentiment Probability", "Negative", scores[0])
        self.plot("Sentiment Probability", "Neutral", scores[1])
        self.plot("Sentiment Probability", "Positive", scores[2])

        # Rebalance: fully long when positive outweighs negative sentiment,
        # otherwise 25% short.
        weight = 1 if scores[2] > scores[0] else -0.25
        self.set_holdings(security.symbol, weight, True)
        self._last_rebalance_time = self.time

    def _build_samples(self, security):
        """Pair each news description with the price move that followed it.

        The label of an article is the relative change of the asset's close
        between the article's release and the release of the next article.

        Args:
            security: Security carrying `symbol` and `dataset_symbol`.

        Returns:
            A DataFrame with columns `text` and `label` holding at most the
            last 100 samples, or None when the news or the price history is
            empty.
        """
        news_history = self.history(security.dataset_symbol, 30, Resolution.DAILY)
        if news_history.empty:
            return None
        news_history = news_history.loc[security.dataset_symbol]['description']

        asset_history = self.history(
            security.symbol, timedelta(30), Resolution.SECOND
        )
        # Without this guard the .loc lookup below raises KeyError when no
        # price bars were returned.
        if asset_history.empty:
            return None
        asset_history = asset_history.loc[security.symbol]['close']

        rows = []
        release_times = news_history.index
        # Walk consecutive (article, next article) pairs; the final article
        # has no successor and therefore no reaction window.
        for factor, released, next_released in zip(
            news_history.iloc[:-1], release_times[:-1], release_times[1:]
        ):
            # Get factor (article description); skip missing or blank ones
            # because the tokenizer needs real text.
            if not isinstance(factor, str) or not factor:
                continue

            # Get the label (the market reaction to the news, for now).
            release_time = self._convert_to_eastern(released)
            next_release_time = self._convert_to_eastern(next_released)
            reaction_period = asset_history[
                (asset_history.index > release_time) &
                (asset_history.index < next_release_time + timedelta(seconds=1))
            ]
            if reaction_period.empty:
                continue
            first_price = reaction_period.iloc[0]
            label = (reaction_period.iloc[-1] - first_price) / first_price

            # Save the training sample.
            rows.append((factor, label))

        return pd.DataFrame(rows, columns=['text', 'label']).iloc[-100:]

    def _label_samples(self, samples, percent_signed=0.75):
        """Classify the market reactions into negative/neutral/positive.

        The most positive `percent_signed` share of the positive reactions
        becomes class 2 (positive), the most negative `percent_signed` share
        of the negative reactions becomes class 0 (negative), and all
        remaining samples become class 1 (neutral).

        Args:
            samples: DataFrame with a `text` column and a numeric `label`
                column.
            percent_signed: Share of the signed reactions that keep their
                sign as a class.

        Returns:
            A copy of `samples` sorted by reaction in descending order, with
            `label` replaced by integer class ids.
        """
        sorted_samples = samples.sort_values(
            by='label', ascending=False
        ).reset_index(drop=True)
        positive_cutoff = int(
            percent_signed * (sorted_samples['label'] > 0).sum()
        )
        negative_cutoff = len(sorted_samples) - int(
            percent_signed * (sorted_samples['label'] < 0).sum()
        )
        # Rows are sorted from most positive to most negative, so the
        # classes are three contiguous runs: 2, then 1, then 0.
        # NOTE(review): these class ids are assigned by this algorithm;
        # confirm how they relate to the label order of the pre-trained
        # classification head.
        classes = np.ones(len(sorted_samples), dtype=int)
        classes[:positive_cutoff] = 2
        classes[negative_cutoff:] = 0
        sorted_samples['label'] = classes
        return sorted_samples

    def _fine_tune(self, labelled_samples):
        """Load the pre-trained model and fine-tune it on the samples.

        Args:
            labelled_samples: DataFrame with a `text` column and an integer
                class `label` column.

        Returns:
            The fine-tuned sequence classification model.
        """
        # Load the pre-trained model.
        model = TFBertForSequenceClassification.from_pretrained(
            self._model_name, num_labels=3, from_pt=True
        )
        # Compile the model.
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        )
        # Create the training dataset.
        dataset = Dataset.from_pandas(labelled_samples)
        dataset = dataset.map(
            lambda sample: self._tokenizer(
                sample['text'], padding='max_length', truncation=True
            )
        )
        dataset = model.prepare_tf_dataset(
            dataset, shuffle=True, tokenizer=self._tokenizer
        )
        # Train the model.
        model.fit(dataset, epochs=2)
        return model

    def _convert_to_eastern(self, dt):
        """Convert `dt` to US/Eastern and drop the time zone information."""
        return dt.astimezone(pytz.timezone('US/Eastern')).replace(tzinfo=None)

    def _aggregate_sentiment_scores(self, sentiment_scores):
        """Collapse per-article class probabilities into one weighted score.

        Rows are weighted by exp(x) for x evenly spaced over [0, 1], so later
        rows count more than earlier ones.

        Args:
            sentiment_scores: 2-D array with one row per article and one
                column per class.

        Returns:
            1-D array with the weighted sum of the rows, one value per class.
        """
        n = sentiment_scores.shape[0]

        # Generate exponentially increasing weights
        weights = np.exp(np.linspace(0, 1, n))

        # Normalize weights to sum to 1
        weights /= weights.sum()

        # Apply weights to sentiment scores
        weighted_scores = sentiment_scores * weights[:, np.newaxis]

        # Aggregate weighted scores by summing them
        aggregated_scores = weighted_scores.sum(axis=0)

        return aggregated_scores