In [2]:
import pandas as pd
import yfinance as yf
import requests
from requests_cache import CachedSession
In [3]:
sp500 = yf.Ticker("^GSPC").history(period='max')
sp500
Out[3]:
Open High Low Close Volume Dividends Stock Splits
Date
1927-12-30 00:00:00-05:00 17.660000 17.660000 17.660000 17.660000 0 0.0 0.0
1928-01-03 00:00:00-05:00 17.760000 17.760000 17.760000 17.760000 0 0.0 0.0
1928-01-04 00:00:00-05:00 17.719999 17.719999 17.719999 17.719999 0 0.0 0.0
1928-01-05 00:00:00-05:00 17.549999 17.549999 17.549999 17.549999 0 0.0 0.0
1928-01-06 00:00:00-05:00 17.660000 17.660000 17.660000 17.660000 0 0.0 0.0
... ... ... ... ... ... ... ...
2026-04-07 00:00:00-04:00 6601.930176 6618.259766 6534.549805 6616.850098 4555680000 0.0 0.0
2026-04-08 00:00:00-04:00 6754.359863 6793.500000 6740.279785 6782.810059 5904880000 0.0 0.0
2026-04-09 00:00:00-04:00 6783.689941 6835.310059 6761.549805 6824.660156 4912410000 0.0 0.0
2026-04-10 00:00:00-04:00 6839.240234 6845.770020 6808.459961 6816.890137 4393220000 0.0 0.0
2026-04-13 00:00:00-04:00 6806.470215 6887.000000 6790.020020 6886.240234 2881481000 0.0 0.0

24686 rows × 7 columns

In [4]:
sp500.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 24686 entries, 1927-12-30 00:00:00-05:00 to 2026-04-13 00:00:00-04:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          24686 non-null  float64
 1   High          24686 non-null  float64
 2   Low           24686 non-null  float64
 3   Close         24686 non-null  float64
 4   Volume        24686 non-null  int64  
 5   Dividends     24686 non-null  float64
 6   Stock Splits  24686 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 1.5 MB

DATA PREPROCESSING¶

In [5]:
#These columns only have a 0 value
sp500['Stock Splits'].value_counts()
sp500['Dividends'].value_counts()
#So we can delete them
sp500= sp500.drop(columns = ['Dividends', 'Stock Splits'])
In [6]:
sp500.plot.line(y = 'Close', use_index=True)
Out[6]:
<Axes: xlabel='Date'>
No description has been provided for this image
In [7]:
# nothing is going on until around 1990 so we keep everything after it
sp500 = sp500.loc['1990-01-01':].copy()
sp500
Out[7]:
Open High Low Close Volume
Date
1990-01-02 00:00:00-05:00 353.399994 359.690002 351.980011 359.690002 162070000
1990-01-03 00:00:00-05:00 359.690002 360.589996 357.890015 358.760010 192330000
1990-01-04 00:00:00-05:00 358.760010 358.760010 352.890015 355.670013 177000000
1990-01-05 00:00:00-05:00 355.670013 355.670013 351.350006 352.200012 158530000
1990-01-08 00:00:00-05:00 352.200012 354.239990 350.540009 353.790009 140110000
... ... ... ... ... ...
2026-04-07 00:00:00-04:00 6601.930176 6618.259766 6534.549805 6616.850098 4555680000
2026-04-08 00:00:00-04:00 6754.359863 6793.500000 6740.279785 6782.810059 5904880000
2026-04-09 00:00:00-04:00 6783.689941 6835.310059 6761.549805 6824.660156 4912410000
2026-04-10 00:00:00-04:00 6839.240234 6845.770020 6808.459961 6816.890137 4393220000
2026-04-13 00:00:00-04:00 6806.470215 6887.000000 6790.020020 6886.240234 2881481000

9136 rows × 5 columns

In [8]:
# Create a new column 'Tomorrow' that contains the closing price of the next day
# by shifting the 'Close' column up by one row (-1).
sp500['Tomorrow'] = sp500['Close'].shift(-1)
#create a target, 1 if the price tomorrow is higher than today
sp500['Target'] = (sp500['Tomorrow']>sp500['Close']).astype(int)
sp500
Out[8]:
Open High Low Close Volume Tomorrow Target
Date
1990-01-02 00:00:00-05:00 353.399994 359.690002 351.980011 359.690002 162070000 358.760010 0
1990-01-03 00:00:00-05:00 359.690002 360.589996 357.890015 358.760010 192330000 355.670013 0
1990-01-04 00:00:00-05:00 358.760010 358.760010 352.890015 355.670013 177000000 352.200012 0
1990-01-05 00:00:00-05:00 355.670013 355.670013 351.350006 352.200012 158530000 353.790009 1
1990-01-08 00:00:00-05:00 352.200012 354.239990 350.540009 353.790009 140110000 349.619995 0
... ... ... ... ... ... ... ...
2026-04-07 00:00:00-04:00 6601.930176 6618.259766 6534.549805 6616.850098 4555680000 6782.810059 1
2026-04-08 00:00:00-04:00 6754.359863 6793.500000 6740.279785 6782.810059 5904880000 6824.660156 1
2026-04-09 00:00:00-04:00 6783.689941 6835.310059 6761.549805 6824.660156 4912410000 6816.890137 0
2026-04-10 00:00:00-04:00 6839.240234 6845.770020 6808.459961 6816.890137 4393220000 6886.240234 1
2026-04-13 00:00:00-04:00 6806.470215 6887.000000 6790.020020 6886.240234 2881481000 NaN 0

9136 rows × 7 columns

In [9]:
import pandas as pd
import requests
from io import StringIO

url = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=DFF"
response = requests.get(url, verify=True)  # Ensure SSL verification

if response.status_code == 200:
    data = StringIO(response.text)
    federal_rates = pd.read_csv(data)
    federal_rates['Date'] = pd.to_datetime(federal_rates['observation_date'], format='%Y-%m-%d')
    print(federal_rates.info())
    print(federal_rates.head(10))
else:
    print(f"Failed to download data: {response.status_code}")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26217 entries, 0 to 26216
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   observation_date  26217 non-null  object        
 1   DFF               26217 non-null  float64       
 2   Date              26217 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 614.6+ KB
None
  observation_date   DFF       Date
0       1954-07-01  1.13 1954-07-01
1       1954-07-02  1.25 1954-07-02
2       1954-07-03  1.25 1954-07-03
3       1954-07-04  1.25 1954-07-04
4       1954-07-05  0.88 1954-07-05
5       1954-07-06  0.25 1954-07-06
6       1954-07-07  1.00 1954-07-07
7       1954-07-08  1.25 1954-07-08
8       1954-07-09  1.25 1954-07-09
9       1954-07-10  1.25 1954-07-10
In [10]:
# Average of the last 4 rows for the 'DFF' column
average_dff = federal_rates['DFF'].tail(4).mean()

# Get the last date and add one day
next_day_date = federal_rates['Date'].iloc[-1] + pd.Timedelta(days=1)

# Check if the next day is Saturday or Sunday, and adjust to Monday if needed
if next_day_date.weekday() == 5:  # Saturday
    next_day_date += pd.Timedelta(days=2)  # Move to Monday
elif next_day_date.weekday() == 6:  # Sunday
    next_day_date += pd.Timedelta(days=1)  # Move to Monday

# Create a new row dictionary
next_day_row = {
    'Date': next_day_date,
    'DFF': average_dff  # Use the computed average for the 'DFF' column
}

# Append the new row to the DataFrame
federal_rates = pd.concat([federal_rates, pd.DataFrame([next_day_row]) ], ignore_index=True)

# Show the updated DataFrame
print(federal_rates.tail())
print(federal_rates.head())
      observation_date   DFF       Date
26213       2026-04-07  3.64 2026-04-07
26214       2026-04-08  3.64 2026-04-08
26215       2026-04-09  3.64 2026-04-09
26216       2026-04-10  3.64 2026-04-10
26217              NaN  3.64 2026-04-13
  observation_date   DFF       Date
0       1954-07-01  1.13 1954-07-01
1       1954-07-02  1.25 1954-07-02
2       1954-07-03  1.25 1954-07-03
3       1954-07-04  1.25 1954-07-04
4       1954-07-05  0.88 1954-07-05
In [11]:
# Remove the timezone from sp500['Date']
sp500 = sp500.reset_index()
sp500['Date'] = sp500['Date'].dt.tz_localize(None)

# Now perform the merge
sp500_with_rates = pd.merge(sp500, federal_rates, on='Date')
sp500_with_rates.drop(columns=['observation_date'],inplace = True)
sp500 = sp500_with_rates
In [12]:
sp500
sp500.set_index('Date', inplace=True)
In [13]:
# Define different time windows for calculating features
days = [2, 7, 30, 90, 365]
features = []

# Create new feature columns based on rolling averages and trends
for day in days:
    # Calculate rolling averages for the specified window
    rolling_averages = sp500.rolling(day).mean()

    # Create a feature for the ratio of the current Close price to its rolling average
    ratio_close_column = f"Close_Ratio_{day}"
    sp500[ratio_close_column] = sp500['Close'] / rolling_averages['Close']

    # Create a feature for the market trend based on the sum of previous Target values
    trend_column = f"Trend_{day}"
    sp500[trend_column] = sp500.shift(1).rolling(day).sum()['Target']

    # Create a feature for the ratio of the current DFF to its rolling average
    ratio_DFF_column = f"DFF_ratio_{day}"
    sp500[ratio_DFF_column] = sp500['DFF'] / rolling_averages['DFF']

    # Add the created features to the list of features
    features += [ratio_close_column, trend_column, ratio_DFF_column]


# Save the last row of the DataFrame before dropping it, this wil be the input for prediction at the end of the code
row_to_predict = sp500.tail(1)

# Drop rows with missing values
sp500.dropna(inplace=True)

sp500
Out[13]:
Open High Low Close Volume Tomorrow Target DFF Close_Ratio_2 Trend_2 ... DFF_ratio_7 Close_Ratio_30 Trend_30 DFF_ratio_30 Close_Ratio_90 Trend_90 DFF_ratio_90 Close_Ratio_365 Trend_365 DFF_ratio_365
Date
1991-06-12 381.049988 381.049988 374.459991 376.649994 166140000 377.630005 1 5.67 0.994193 1.0 ... 0.976144 0.993271 16.0 0.975009 1.005457 46.0 0.940472 1.095258 191.0 0.752889
1991-06-13 376.649994 377.899994 376.079987 377.630005 145650000 382.290009 1 5.79 1.001299 1.0 ... 1.000988 0.996088 16.0 0.996272 1.007198 46.0 0.960925 1.097942 192.0 0.769546
1991-06-14 377.630005 382.299988 377.630005 382.290009 167950000 380.130005 0 5.71 1.006132 2.0 ... 0.991319 1.008223 16.0 0.983352 1.018691 46.0 0.947264 1.111256 193.0 0.759627
1991-06-17 382.299988 382.309998 380.130005 380.130005 134230000 378.589996 0 5.94 0.997167 1.0 ... 1.028190 1.002586 15.0 1.021906 1.012274 45.0 0.984748 1.104731 193.0 0.790876
1991-06-18 380.130005 381.829987 377.989990 378.589996 155200000 375.089996 0 5.82 0.997970 0.0 ... 1.004686 0.998655 15.0 1.001262 1.007515 45.0 0.965850 1.100038 192.0 0.775592
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2026-04-06 6587.660156 6618.129883 6579.720215 6611.830078 3906440000 6616.850098 1 3.64 1.002209 2.0 ... 1.000000 0.988666 13.0 1.000000 0.969743 47.0 0.991016 1.052501 204.0 0.873430
2026-04-07 6601.930176 6618.259766 6534.549805 6616.850098 4555680000 6782.810059 1 3.64 1.000379 2.0 ... 1.000000 0.990508 14.0 1.000000 0.970619 47.0 0.991736 1.052955 204.0 0.874113
2026-04-08 6754.359863 6793.500000 6740.279785 6782.810059 5904880000 6824.660156 1 3.64 1.012385 2.0 ... 1.000000 1.015895 14.0 1.000000 0.994936 47.0 0.992457 1.078927 205.0 0.874798
2026-04-09 6783.689941 6835.310059 6761.549805 6824.660156 4912410000 6816.890137 0 3.64 1.003076 2.0 ... 1.000000 1.022783 14.0 1.000000 1.001055 47.0 0.993179 1.085124 206.0 0.875484
2026-04-10 6839.240234 6845.770020 6808.459961 6816.890137 4393220000 6886.240234 1 3.64 0.999430 1.0 ... 1.000000 1.022088 14.0 1.000000 0.999968 46.0 0.993932 1.083408 206.0 0.876171

8770 rows × 23 columns

MODEL¶

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,accuracy_score, classification_report


# Define and train the model
model = RandomForestClassifier(n_estimators=150, min_samples_split=50, max_depth = 15 , random_state=1)

train = sp500.iloc[:-550]
test = sp500.iloc[-550:]

# Train the model
model.fit(train[features], train['Target'])

# Predict probabilities for the test set, and only the rows greater than 0.6 will be 1
probs = model.predict_proba(test[features])[:, 1]
threshold = 0.60
preds = (probs >= threshold).astype(int)

RESULTS AND METRICS¶

In [15]:
# ── Threshold Comparison ──
# Paste this right after: preds = (probs >= threshold).astype(int)
 
from sklearn.metrics import precision_score, recall_score, f1_score
 
thresholds = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]
rows = []
 
for t in thresholds:
    p = (probs >= t).astype(int)
    rows.append({
        'Threshold':    t,
        'Precision':    round(precision_score(test['Target'], p, zero_division=0), 4),
        'Recall':       round(recall_score(test['Target'], p, zero_division=0), 4),
        'F1 Score':     round(f1_score(test['Target'], p, zero_division=0), 4),
        'Predicted Up': int(p.sum()),
        'Total Test':   len(p)
    })
 
results = pd.DataFrame(rows)
print(results.to_string(index=False))
 Threshold  Precision  Recall  F1 Score  Predicted Up  Total Test
      0.50     0.5729  0.9199    0.7060           501         550
      0.55     0.5810  0.5865    0.5837           315         550
      0.60     0.6452  0.0641    0.1166            31         550
      0.65     0.5000  0.0064    0.0127             4         550
      0.70     0.0000  0.0000    0.0000             0         550
      0.75     0.0000  0.0000    0.0000             0         550
      0.80     0.0000  0.0000    0.0000             0         550

PREDICTING TOMORROW OUTCOME¶

In [16]:
# calling again the previously saved last row
row_to_predict[features]
Out[16]:
Close_Ratio_2 Trend_2 DFF_ratio_2 Close_Ratio_7 Trend_7 DFF_ratio_7 Close_Ratio_30 Trend_30 DFF_ratio_30 Close_Ratio_90 Trend_90 DFF_ratio_90 Close_Ratio_365 Trend_365 DFF_ratio_365
Date
2026-04-13 1.005061 1.0 1.0 1.022956 6.0 1.0 1.032448 15.0 1.0 1.01002 47.0 0.994687 1.093917 206.0 0.87686
In [17]:
# Make a prediction for the next day's market movement
prediction = model.predict(row_to_predict[features])

# Display the prediction
print(f"Predicted market movement for tomorrow: {'It will INCREASE' if prediction[0] == 1 else 'It will DECREASE'}")
Predicted market movement for tomorrow: It will DECREASE

── DCA vs. Model-Guided DCA Strategy ──¶

In [ ]:
# Paste this after your model predictions cell

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

# -- Recalculate preds at 0.55 if needed --
threshold = 0.50
preds = (probs >= threshold).astype(int)

test_data = test.copy()
test_data['Preds'] = preds

daily_amount = 50  # dollars per day

# ── Scenario 1: Buy $50 every day (standard DCA) ──
dca_shares = 0
dca_invested = 0
dca_values = []

for i, (date, row) in enumerate(test_data.iterrows()):
    price = row['Close']
    dca_shares += daily_amount / price
    dca_invested += daily_amount
    dca_values.append(dca_shares * price)

test_data['DCA_Value'] = dca_values

# ── Scenario 2: Model-guided DCA ──
# Buy only when model predicts up.
# When model predicts down, accumulate the $50.
# Next time model predicts up, invest all accumulated cash.
model_shares = 0
model_cash_reserve = 0
model_invested = 0
model_values = []

for i, (date, row) in enumerate(test_data.iterrows()):
    price = row['Close']
    model_cash_reserve += daily_amount  # add daily $50 to reserve

    if row['Preds'] == 1:
        # Model says up — invest everything in reserve
        model_shares += model_cash_reserve / price
        model_invested += model_cash_reserve
        model_cash_reserve = 0

    # Portfolio value = shares * current price + uninvested cash
    model_values.append(model_shares * price + model_cash_reserve)

test_data['Model_DCA_Value'] = model_values

# ── Total invested (same for both: $50 * n_days) ──
total_invested = daily_amount * len(test_data)

# ── Final values ──
final_dca = test_data['DCA_Value'].iloc[-1]
final_model = test_data['Model_DCA_Value'].iloc[-1]

# ── Plot ──
fig, ax = plt.subplots(figsize=(14, 7))

# Total invested line (baseline)
invested_line = [daily_amount * (i + 1) for i in range(len(test_data))]
ax.plot(test_data.index, invested_line,
        color='#AAAAAA', linewidth=1.5, linestyle=':', alpha=0.7, label='Total Invested')

ax.plot(test_data.index, test_data['DCA_Value'],
        color='#888888', linewidth=2, alpha=0.8, label='DCA ($50/day)')
ax.plot(test_data.index, test_data['Model_DCA_Value'],
        color='#1C69D4', linewidth=2.5, label='Model-Guided DCA')

# Fill between strategies
ax.fill_between(test_data.index,
                test_data['Model_DCA_Value'],
                test_data['DCA_Value'],
                where=test_data['Model_DCA_Value'] >= test_data['DCA_Value'],
                alpha=0.15, color='#1C69D4', interpolate=True)
ax.fill_between(test_data.index,
                test_data['Model_DCA_Value'],
                test_data['DCA_Value'],
                where=test_data['Model_DCA_Value'] < test_data['DCA_Value'],
                alpha=0.15, color='#E74C3C', interpolate=True)

# Annotate final values
ax.annotate(f'${final_model:,.0f}',
            xy=(test_data.index[-1], final_model),
            xytext=(15, 10), textcoords='offset points',
            fontsize=13, fontweight='bold', color='#1C69D4')
ax.annotate(f'${final_dca:,.0f}',
            xy=(test_data.index[-1], final_dca),
            xytext=(15, -15), textcoords='offset points',
            fontsize=13, fontweight='bold', color='#888888')
ax.annotate(f'${total_invested:,.0f}',
            xy=(test_data.index[-1], invested_line[-1]),
            xytext=(15, -10), textcoords='offset points',
            fontsize=11, color='#AAAAAA')

# Formatting
ax.set_title('Dollar-Cost Averaging: Daily vs. Model-Guided',
             fontsize=18, fontweight='bold', pad=15, color='#1A1A2E')
ax.set_xlabel('Date', fontsize=12, color='#4A4A5A')
ax.set_ylabel('Portfolio Value ($)', fontsize=12, color='#4A4A5A')
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, p: f'${x:,.0f}'))
ax.legend(fontsize=12, loc='upper left', framealpha=0.9)
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_facecolor('white')
fig.patch.set_facecolor('white')

plt.tight_layout()
plt.savefig('dca_comparison.png', dpi=150, bbox_inches='tight', facecolor='white')
plt.show()

print(f"\nTotal Invested:              ${total_invested:,.2f}")
print(f"DCA Final Value:             ${final_dca:,.2f}  ({(final_dca/total_invested - 1)*100:+.2f}%)")
print(f"Model-Guided DCA Final Value: ${final_model:,.2f}  ({(final_model/total_invested - 1)*100:+.2f}%)")
print(f"Difference:                   ${final_model - final_dca:,.2f}")
No description has been provided for this image
Total Invested:              $27,500.00
DCA Final Value:             $31,668.73  (+15.16%)
Model-Guided DCA Final Value: $31,670.63  (+15.17%)
Difference:                   $1.90

Conclusion¶

The Random Forest model achieves 57% precision at a 0.50 threshold — meaning when it predicts the S&P 500 will increase, it's right more often than not. To test real-world viability, I compared two dollar-cost averaging strategies over the 550-day test period: investing $50 every day vs. only investing when the model predicts an increase (accumulating cash on predicted down days). Both strategies returned ~15.17% on $27,500 invested, ending within $2 of each other. While the model doesn't generate a clear profit edge, it matches passive investing with less market exposure — suggesting the directional signals have value, but further feature engineering or alternative model architectures would be needed to translate that into a consistent trading advantage.