Building A Pairs-Trading Strategy With Python From Scratch

Importing Libraries and Loading Data
To get started, we first need to load the necessary libraries and acquire historical stock price data through the Yahoo Finance API.

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from itertools import combinations
from datetime import datetime, timedelta
import yfinance as yf

# Number of pairs in a portfolio during a period
PORTFOLIO_SIZE = 20

# Load data
data = pd.read_excel("sp500.xlsx", sheet_name="updated")
ind = data[data['date'] == '2000-01-03'].index[0]
data = data.iloc[ind:]
data['date'] = pd.to_datetime(data['date']).apply(lambda x: x.strftime('%Y-%m-%d'))

# Create tickers dictionary
tickers = {data['date'][i]: data['tickers'][i].split(",") for i in range(ind, len(data)+ind)}

# Collect all tickers
all_tickers = set()
for t in tickers.values():
    all_tickers.update(t)
all_tickers = list(all_tickers)
tickers_to_remove = ['TIE', 'BSC', 'BDK', 'CBE', 'ACS', 'MEE', 'BOL']
filtered_tickers = [ticker for ticker in all_tickers if ticker not in tickers_to_remove]

Some tickers in the dataset may have data quality issues. Therefore, we filter them out to ensure cleaner inputs for the strategy. With the tickers cleaned, we proceed to download the historical price data.

historical_data = yf.download(filtered_tickers, start="1999-01-01", end="2024-04-01", group_by='ticker')
adj_close_data = historical_data.xs('Adj Close', level=1, axis=1)
# Drop columns where all values are NaN
close_prices = adj_close_data.dropna(axis=1, how='all')

The strategy uses daily stock price data from 1999 through March 2024. For each period, we compute the SSD (Sum of Squared Differences) between normalized price series over a one-year lookback window and select the 20 most similar pairs. These pairs are then traded over the following six months. We open a position when a pair's Z-score crosses ±2 (shorting the spread above +2, buying it below -2) and close the position once the Z-score reverts to 0.
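
As a reference for that trading rule, the following is a minimal sketch of how the Z-score entry and exit signals could be generated for a single pair. The spread input and the use of formation-period statistics here are illustrative assumptions; the article's own signal code is not reproduced in this excerpt.

def zscore_signals(spread, form_mean, form_std, entry=2.0, exit_level=0.0):
    # Z-score of the trading-period spread, standardized with formation-period
    # statistics (form_mean, form_std); these inputs are assumptions of this sketch
    z = (spread - form_mean) / form_std
    signals = pd.DataFrame(index=spread.index)
    signals['z'] = z
    # Go long the spread (buy leg 1, sell leg 2) when z falls below -2
    signals['long_entry'] = z < -entry
    # Go short the spread (sell leg 1, buy leg 2) when z rises above +2
    signals['short_entry'] = z > entry
    # Close a long once z reverts up to 0, and a short once z reverts down to 0
    signals['long_exit'] = z >= exit_level
    signals['short_exit'] = z <= exit_level
    return signals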

The implementation remains similar to the cryptocurrency version we discussed previously, but let’s review each component for clarity.

First, we normalize the price data and calculate SSD using the following functions:

def normalize(df, min_vals, max_vals):
    return (df - min_vals) / (max_vals - min_vals)

def calculate_ssd(df):
    filtered_df = df.dropna(axis=1)
    return {f'{c1}-{c2}': np.sum((filtered_df[c1] - filtered_df[c2]) ** 2) for c1, c2 in combinations(filtered_df.columns, 2)}

def top_x_pairs(df, start, end):
    ssd_results_dict = calculate_ssd(df)
    # Sort all candidate pairs by ascending SSD
    sorted_ssd_dict = dict(sorted(ssd_results_dict.items(), key=lambda item: item[1]))
    most_similar_pairs = {}
    # 'coins' is a holdover name from the cryptocurrency version; here it tracks tickers
    coins = set()
    # Greedily pick the most similar pairs, using each ticker at most once
    for pair, ssd in sorted_ssd_dict.items():
        coin1, coin2 = pair.split('-')
        if coin1 not in coins and coin2 not in coins:
            most_similar_pairs[coin1] = (pair, ssd)
            coins.add(coin1)
            coins.add(coin2)
            if len(most_similar_pairs) == PORTFOLIO_SIZE:
                break
    sorted_ssd = dict(sorted(most_similar_pairs.items(), key=lambda item: item[1][1]))
    topx_pairs = list(sorted_ssd.values())[:PORTFOLIO_SIZE]
    return topx_pairs

We set PORTFOLIO_SIZE to 20, selecting the top 20 pairs with the smallest SSD metric during each period. A few additional utility functions support date-based operations:

def get_previous_date(dates_list, target_date_str):
    dates = [datetime.strptime(date, '%Y-%m-%d') for date in dates_list]
    target_date = datetime.strptime(target_date_str, '%Y-%m-%d')
    dates.sort()
    previous_date = None
    for date in dates:
        if date >= target_date:
            break
        previous_date = date
    return previous_date.strftime('%Y-%m-%d') if previous_date else None

def one_day_after(date_str):
    date_format = "%Y-%m-%d"
    date_obj = datetime.strptime(date_str, date_format)
    return (date_obj + timedelta(days=1)).strftime(date_format)

def one_year_before(date_str):
    date_format = "%Y-%m-%d"
    original_date = datetime.strptime(date_str, date_format)
    try:
        return original_date.replace(year=original_date.year - 1).strftime(date_format)
    except ValueError:
        # Handle Feb 29 in leap years by falling back to Feb 28
        return original_date.replace(month=2, day=28, year=original_date.year - 1).strftime(date_format)
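
Putting these pieces together, a single formation window could be handled roughly as follows. The selection date and the use of close_prices are illustrative assumptions, since the article's full backtest loop is not shown in this excerpt.

# Sketch of one formation window; the date below is an arbitrary example
selection_date = '2005-01-03'
formation_start = one_year_before(selection_date)

# Prices over the one-year formation window
formation_prices = close_prices.loc[formation_start:selection_date]

# Normalize each series to [0, 1] using formation-window extremes
min_vals, max_vals = formation_prices.min(), formation_prices.max()
normalized = normalize(formation_prices, min_vals, max_vals)

# Select the 20 pairs with the smallest SSD for the next trading period
selected_pairs = top_x_pairs(normalized, formation_start, selection_date)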

We calculate the strategy return over each holding period:

def strategy_return(data, commission=0.001):
    pnl = 0
    for df in data.values():
        # Handle long positions
        long_entries = df[df['buy'] == 1].index
        for idx in long_entries:
            exit_idx = df[(df.index > idx) & (df['long_exit'])].index
            # Position details omitted here for clarity.
    return pnl / len(data)
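
For completeness, here is one possible way a single long round trip could be valued inside that loop, assuming each per-pair DataFrame holds the two legs' prices in 'price1' and 'price2' columns. The column names and the flat per-transaction commission handling are assumptions for illustration, not the article's actual schema.

def long_trade_pnl(df, entry_idx, exit_idx, commission=0.001):
    # Long leg: buy 'price1' at entry, sell at exit (column name is an assumption)
    ret_long_leg = df.loc[exit_idx, 'price1'] / df.loc[entry_idx, 'price1'] - 1
    # Short leg: sell 'price2' at entry, buy it back at exit
    ret_short_leg = 1 - df.loc[exit_idx, 'price2'] / df.loc[entry_idx, 'price2']
    # Deduct a flat fee per transaction: two trades at entry, two at exit
    return ret_long_leg + ret_short_leg - 4 * commission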

We apply additional filtering to exclude low-liquidity stocks:

def filter_stocks(date):
    nearest_date = get_previous_date(dates_list, date)
    stock_list = tickers[nearest_date]
    formation_start_date = one_year_before(date)
    stocks_data = historical_data.loc[formation_start_date:date]
    # Remove stocks with missing data or low liquidity.
    return filtered_stocks
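
The body of this filter is omitted above. As an illustration, one possible implementation drops tickers with incomplete prices over the formation window and applies a simple dollar-volume liquidity screen; the threshold and the dollar-volume proxy are assumptions, not taken from the article.

def filter_stocks_sketch(prices, volumes, min_dollar_volume=1e6):
    # Keep only tickers with a complete price history over the window
    complete = prices.dropna(axis=1, how='any')
    # Median daily dollar volume as a rough liquidity proxy (assumption)
    dollar_volume = (complete * volumes[complete.columns]).median()
    liquid = dollar_volume[dollar_volume >= min_dollar_volume].index
    return list(liquid)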