import pandas as pd
import yfinance as yf
import requests
from requests_cache import CachedSession
sp500 = yf.Ticker("^GSPC").history(period='max')
sp500
| Open | High | Low | Close | Volume | Dividends | Stock Splits | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 1927-12-30 00:00:00-05:00 | 17.660000 | 17.660000 | 17.660000 | 17.660000 | 0 | 0.0 | 0.0 |
| 1928-01-03 00:00:00-05:00 | 17.760000 | 17.760000 | 17.760000 | 17.760000 | 0 | 0.0 | 0.0 |
| 1928-01-04 00:00:00-05:00 | 17.719999 | 17.719999 | 17.719999 | 17.719999 | 0 | 0.0 | 0.0 |
| 1928-01-05 00:00:00-05:00 | 17.549999 | 17.549999 | 17.549999 | 17.549999 | 0 | 0.0 | 0.0 |
| 1928-01-06 00:00:00-05:00 | 17.660000 | 17.660000 | 17.660000 | 17.660000 | 0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2026-04-07 00:00:00-04:00 | 6601.930176 | 6618.259766 | 6534.549805 | 6616.850098 | 4555680000 | 0.0 | 0.0 |
| 2026-04-08 00:00:00-04:00 | 6754.359863 | 6793.500000 | 6740.279785 | 6782.810059 | 5904880000 | 0.0 | 0.0 |
| 2026-04-09 00:00:00-04:00 | 6783.689941 | 6835.310059 | 6761.549805 | 6824.660156 | 4912410000 | 0.0 | 0.0 |
| 2026-04-10 00:00:00-04:00 | 6839.240234 | 6845.770020 | 6808.459961 | 6816.890137 | 4393220000 | 0.0 | 0.0 |
| 2026-04-13 00:00:00-04:00 | 6806.470215 | 6887.000000 | 6790.020020 | 6886.240234 | 2881481000 | 0.0 | 0.0 |
24686 rows × 7 columns
sp500.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 24686 entries, 1927-12-30 00:00:00-05:00 to 2026-04-13 00:00:00-04:00 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Open 24686 non-null float64 1 High 24686 non-null float64 2 Low 24686 non-null float64 3 Close 24686 non-null float64 4 Volume 24686 non-null int64 5 Dividends 24686 non-null float64 6 Stock Splits 24686 non-null float64 dtypes: float64(6), int64(1) memory usage: 1.5 MB
DATA PREPROCESSING¶
#These columns only have a 0 value
sp500['Stock Splits'].value_counts()
sp500['Dividends'].value_counts()
#So we can delete them
sp500= sp500.drop(columns = ['Dividends', 'Stock Splits'])
sp500.plot.line(y = 'Close', use_index=True)
<Axes: xlabel='Date'>
# nothing is going on until around 1990 so we keep everything after it
sp500 = sp500.loc['1990-01-01':].copy()
sp500
| Open | High | Low | Close | Volume | |
|---|---|---|---|---|---|
| Date | |||||
| 1990-01-02 00:00:00-05:00 | 353.399994 | 359.690002 | 351.980011 | 359.690002 | 162070000 |
| 1990-01-03 00:00:00-05:00 | 359.690002 | 360.589996 | 357.890015 | 358.760010 | 192330000 |
| 1990-01-04 00:00:00-05:00 | 358.760010 | 358.760010 | 352.890015 | 355.670013 | 177000000 |
| 1990-01-05 00:00:00-05:00 | 355.670013 | 355.670013 | 351.350006 | 352.200012 | 158530000 |
| 1990-01-08 00:00:00-05:00 | 352.200012 | 354.239990 | 350.540009 | 353.790009 | 140110000 |
| ... | ... | ... | ... | ... | ... |
| 2026-04-07 00:00:00-04:00 | 6601.930176 | 6618.259766 | 6534.549805 | 6616.850098 | 4555680000 |
| 2026-04-08 00:00:00-04:00 | 6754.359863 | 6793.500000 | 6740.279785 | 6782.810059 | 5904880000 |
| 2026-04-09 00:00:00-04:00 | 6783.689941 | 6835.310059 | 6761.549805 | 6824.660156 | 4912410000 |
| 2026-04-10 00:00:00-04:00 | 6839.240234 | 6845.770020 | 6808.459961 | 6816.890137 | 4393220000 |
| 2026-04-13 00:00:00-04:00 | 6806.470215 | 6887.000000 | 6790.020020 | 6886.240234 | 2881481000 |
9136 rows × 5 columns
# Create a new column 'Tomorrow' that contains the closing price of the next day
# by shifting the 'Close' column up by one row (-1).
sp500['Tomorrow'] = sp500['Close'].shift(-1)
#create a target, 1 if the price tomorrow is higher than today
sp500['Target'] = (sp500['Tomorrow']>sp500['Close']).astype(int)
sp500
| Open | High | Low | Close | Volume | Tomorrow | Target | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 1990-01-02 00:00:00-05:00 | 353.399994 | 359.690002 | 351.980011 | 359.690002 | 162070000 | 358.760010 | 0 |
| 1990-01-03 00:00:00-05:00 | 359.690002 | 360.589996 | 357.890015 | 358.760010 | 192330000 | 355.670013 | 0 |
| 1990-01-04 00:00:00-05:00 | 358.760010 | 358.760010 | 352.890015 | 355.670013 | 177000000 | 352.200012 | 0 |
| 1990-01-05 00:00:00-05:00 | 355.670013 | 355.670013 | 351.350006 | 352.200012 | 158530000 | 353.790009 | 1 |
| 1990-01-08 00:00:00-05:00 | 352.200012 | 354.239990 | 350.540009 | 353.790009 | 140110000 | 349.619995 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2026-04-07 00:00:00-04:00 | 6601.930176 | 6618.259766 | 6534.549805 | 6616.850098 | 4555680000 | 6782.810059 | 1 |
| 2026-04-08 00:00:00-04:00 | 6754.359863 | 6793.500000 | 6740.279785 | 6782.810059 | 5904880000 | 6824.660156 | 1 |
| 2026-04-09 00:00:00-04:00 | 6783.689941 | 6835.310059 | 6761.549805 | 6824.660156 | 4912410000 | 6816.890137 | 0 |
| 2026-04-10 00:00:00-04:00 | 6839.240234 | 6845.770020 | 6808.459961 | 6816.890137 | 4393220000 | 6886.240234 | 1 |
| 2026-04-13 00:00:00-04:00 | 6806.470215 | 6887.000000 | 6790.020020 | 6886.240234 | 2881481000 | NaN | 0 |
9136 rows × 7 columns
import pandas as pd
import requests
from io import StringIO
url = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=DFF"
response = requests.get(url, verify=True) # Ensure SSL verification
if response.status_code == 200:
data = StringIO(response.text)
federal_rates = pd.read_csv(data)
federal_rates['Date'] = pd.to_datetime(federal_rates['observation_date'], format='%Y-%m-%d')
print(federal_rates.info())
print(federal_rates.head(10))
else:
print(f"Failed to download data: {response.status_code}")
<class 'pandas.core.frame.DataFrame'> RangeIndex: 26217 entries, 0 to 26216 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 observation_date 26217 non-null object 1 DFF 26217 non-null float64 2 Date 26217 non-null datetime64[ns] dtypes: datetime64[ns](1), float64(1), object(1) memory usage: 614.6+ KB None observation_date DFF Date 0 1954-07-01 1.13 1954-07-01 1 1954-07-02 1.25 1954-07-02 2 1954-07-03 1.25 1954-07-03 3 1954-07-04 1.25 1954-07-04 4 1954-07-05 0.88 1954-07-05 5 1954-07-06 0.25 1954-07-06 6 1954-07-07 1.00 1954-07-07 7 1954-07-08 1.25 1954-07-08 8 1954-07-09 1.25 1954-07-09 9 1954-07-10 1.25 1954-07-10
# Average of the last 4 rows for the 'DFF' column
average_dff = federal_rates['DFF'].tail(4).mean()
# Get the last date and add one day
next_day_date = federal_rates['Date'].iloc[-1] + pd.Timedelta(days=1)
# Check if the next day is Saturday or Sunday, and adjust to Monday if needed
if next_day_date.weekday() == 5: # Saturday
next_day_date += pd.Timedelta(days=2) # Move to Monday
elif next_day_date.weekday() == 6: # Sunday
next_day_date += pd.Timedelta(days=1) # Move to Monday
# Create a new row dictionary
next_day_row = {
'Date': next_day_date,
'DFF': average_dff # Use the computed average for the 'DFF' column
}
# Append the new row to the DataFrame
federal_rates = pd.concat([federal_rates, pd.DataFrame([next_day_row]) ], ignore_index=True)
# Show the updated DataFrame
print(federal_rates.tail())
print(federal_rates.head())
observation_date DFF Date 26213 2026-04-07 3.64 2026-04-07 26214 2026-04-08 3.64 2026-04-08 26215 2026-04-09 3.64 2026-04-09 26216 2026-04-10 3.64 2026-04-10 26217 NaN 3.64 2026-04-13 observation_date DFF Date 0 1954-07-01 1.13 1954-07-01 1 1954-07-02 1.25 1954-07-02 2 1954-07-03 1.25 1954-07-03 3 1954-07-04 1.25 1954-07-04 4 1954-07-05 0.88 1954-07-05
# Remove the timezone from sp500['Date']
sp500 = sp500.reset_index()
sp500['Date'] = sp500['Date'].dt.tz_localize(None)
# Now perform the merge
sp500_with_rates = pd.merge(sp500, federal_rates, on='Date')
sp500_with_rates.drop(columns=['observation_date'],inplace = True)
sp500 = sp500_with_rates
sp500
sp500.set_index('Date', inplace=True)
# Define different time windows for calculating features
days = [2, 7, 30, 90, 365]
features = []
# Create new feature columns based on rolling averages and trends
for day in days:
# Calculate rolling averages for the specified window
rolling_averages = sp500.rolling(day).mean()
# Create a feature for the ratio of the current Close price to its rolling average
ratio_close_column = f"Close_Ratio_{day}"
sp500[ratio_close_column] = sp500['Close'] / rolling_averages['Close']
# Create a feature for the market trend based on the sum of previous Target values
trend_column = f"Trend_{day}"
sp500[trend_column] = sp500.shift(1).rolling(day).sum()['Target']
# Create a feature for the ratio of the current DFF to its rolling average
ratio_DFF_column = f"DFF_ratio_{day}"
sp500[ratio_DFF_column] = sp500['DFF'] / rolling_averages['DFF']
# Add the created features to the list of features
features += [ratio_close_column, trend_column, ratio_DFF_column]
# Save the last row of the DataFrame before dropping it, this wil be the input for prediction at the end of the code
row_to_predict = sp500.tail(1)
# Drop rows with missing values
sp500.dropna(inplace=True)
sp500
| Open | High | Low | Close | Volume | Tomorrow | Target | DFF | Close_Ratio_2 | Trend_2 | ... | DFF_ratio_7 | Close_Ratio_30 | Trend_30 | DFF_ratio_30 | Close_Ratio_90 | Trend_90 | DFF_ratio_90 | Close_Ratio_365 | Trend_365 | DFF_ratio_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 1991-06-12 | 381.049988 | 381.049988 | 374.459991 | 376.649994 | 166140000 | 377.630005 | 1 | 5.67 | 0.994193 | 1.0 | ... | 0.976144 | 0.993271 | 16.0 | 0.975009 | 1.005457 | 46.0 | 0.940472 | 1.095258 | 191.0 | 0.752889 |
| 1991-06-13 | 376.649994 | 377.899994 | 376.079987 | 377.630005 | 145650000 | 382.290009 | 1 | 5.79 | 1.001299 | 1.0 | ... | 1.000988 | 0.996088 | 16.0 | 0.996272 | 1.007198 | 46.0 | 0.960925 | 1.097942 | 192.0 | 0.769546 |
| 1991-06-14 | 377.630005 | 382.299988 | 377.630005 | 382.290009 | 167950000 | 380.130005 | 0 | 5.71 | 1.006132 | 2.0 | ... | 0.991319 | 1.008223 | 16.0 | 0.983352 | 1.018691 | 46.0 | 0.947264 | 1.111256 | 193.0 | 0.759627 |
| 1991-06-17 | 382.299988 | 382.309998 | 380.130005 | 380.130005 | 134230000 | 378.589996 | 0 | 5.94 | 0.997167 | 1.0 | ... | 1.028190 | 1.002586 | 15.0 | 1.021906 | 1.012274 | 45.0 | 0.984748 | 1.104731 | 193.0 | 0.790876 |
| 1991-06-18 | 380.130005 | 381.829987 | 377.989990 | 378.589996 | 155200000 | 375.089996 | 0 | 5.82 | 0.997970 | 0.0 | ... | 1.004686 | 0.998655 | 15.0 | 1.001262 | 1.007515 | 45.0 | 0.965850 | 1.100038 | 192.0 | 0.775592 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2026-04-06 | 6587.660156 | 6618.129883 | 6579.720215 | 6611.830078 | 3906440000 | 6616.850098 | 1 | 3.64 | 1.002209 | 2.0 | ... | 1.000000 | 0.988666 | 13.0 | 1.000000 | 0.969743 | 47.0 | 0.991016 | 1.052501 | 204.0 | 0.873430 |
| 2026-04-07 | 6601.930176 | 6618.259766 | 6534.549805 | 6616.850098 | 4555680000 | 6782.810059 | 1 | 3.64 | 1.000379 | 2.0 | ... | 1.000000 | 0.990508 | 14.0 | 1.000000 | 0.970619 | 47.0 | 0.991736 | 1.052955 | 204.0 | 0.874113 |
| 2026-04-08 | 6754.359863 | 6793.500000 | 6740.279785 | 6782.810059 | 5904880000 | 6824.660156 | 1 | 3.64 | 1.012385 | 2.0 | ... | 1.000000 | 1.015895 | 14.0 | 1.000000 | 0.994936 | 47.0 | 0.992457 | 1.078927 | 205.0 | 0.874798 |
| 2026-04-09 | 6783.689941 | 6835.310059 | 6761.549805 | 6824.660156 | 4912410000 | 6816.890137 | 0 | 3.64 | 1.003076 | 2.0 | ... | 1.000000 | 1.022783 | 14.0 | 1.000000 | 1.001055 | 47.0 | 0.993179 | 1.085124 | 206.0 | 0.875484 |
| 2026-04-10 | 6839.240234 | 6845.770020 | 6808.459961 | 6816.890137 | 4393220000 | 6886.240234 | 1 | 3.64 | 0.999430 | 1.0 | ... | 1.000000 | 1.022088 | 14.0 | 1.000000 | 0.999968 | 46.0 | 0.993932 | 1.083408 | 206.0 | 0.876171 |
8770 rows × 23 columns
MODEL¶
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,accuracy_score, classification_report
# Define and train the model
model = RandomForestClassifier(n_estimators=150, min_samples_split=50, max_depth = 15 , random_state=1)
train = sp500.iloc[:-550]
test = sp500.iloc[-550:]
# Train the model
model.fit(train[features], train['Target'])
# Predict probabilities for the test set, and only the rows greater than 0.6 will be 1
probs = model.predict_proba(test[features])[:, 1]
threshold = 0.60
preds = (probs >= threshold).astype(int)
RESULTS AND METRICS¶
# ── Threshold Comparison ──
# Paste this right after: preds = (probs >= threshold).astype(int)
from sklearn.metrics import precision_score, recall_score, f1_score
thresholds = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]
rows = []
for t in thresholds:
p = (probs >= t).astype(int)
rows.append({
'Threshold': t,
'Precision': round(precision_score(test['Target'], p, zero_division=0), 4),
'Recall': round(recall_score(test['Target'], p, zero_division=0), 4),
'F1 Score': round(f1_score(test['Target'], p, zero_division=0), 4),
'Predicted Up': int(p.sum()),
'Total Test': len(p)
})
results = pd.DataFrame(rows)
print(results.to_string(index=False))
Threshold Precision Recall F1 Score Predicted Up Total Test
0.50 0.5729 0.9199 0.7060 501 550
0.55 0.5810 0.5865 0.5837 315 550
0.60 0.6452 0.0641 0.1166 31 550
0.65 0.5000 0.0064 0.0127 4 550
0.70 0.0000 0.0000 0.0000 0 550
0.75 0.0000 0.0000 0.0000 0 550
0.80 0.0000 0.0000 0.0000 0 550
PREDICTING TOMORROW OUTCOME¶
# calling again the previously saved last row
row_to_predict[features]
| Close_Ratio_2 | Trend_2 | DFF_ratio_2 | Close_Ratio_7 | Trend_7 | DFF_ratio_7 | Close_Ratio_30 | Trend_30 | DFF_ratio_30 | Close_Ratio_90 | Trend_90 | DFF_ratio_90 | Close_Ratio_365 | Trend_365 | DFF_ratio_365 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||
| 2026-04-13 | 1.005061 | 1.0 | 1.0 | 1.022956 | 6.0 | 1.0 | 1.032448 | 15.0 | 1.0 | 1.01002 | 47.0 | 0.994687 | 1.093917 | 206.0 | 0.87686 |
# Make a prediction for the next day's market movement
prediction = model.predict(row_to_predict[features])
# Display the prediction
print(f"Predicted market movement for tomorrow: {'It will INCREASE' if prediction[0] == 1 else 'It will DECREASE'}")
Predicted market movement for tomorrow: It will DECREASE
── DCA vs. Model-Guided DCA Strategy ──¶
# Paste this after your model predictions cell
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
# -- Recalculate preds at 0.55 if needed --
threshold = 0.50
preds = (probs >= threshold).astype(int)
test_data = test.copy()
test_data['Preds'] = preds
daily_amount = 50 # dollars per day
# ── Scenario 1: Buy $50 every day (standard DCA) ──
dca_shares = 0
dca_invested = 0
dca_values = []
for i, (date, row) in enumerate(test_data.iterrows()):
price = row['Close']
dca_shares += daily_amount / price
dca_invested += daily_amount
dca_values.append(dca_shares * price)
test_data['DCA_Value'] = dca_values
# ── Scenario 2: Model-guided DCA ──
# Buy only when model predicts up.
# When model predicts down, accumulate the $50.
# Next time model predicts up, invest all accumulated cash.
model_shares = 0
model_cash_reserve = 0
model_invested = 0
model_values = []
for i, (date, row) in enumerate(test_data.iterrows()):
price = row['Close']
model_cash_reserve += daily_amount # add daily $50 to reserve
if row['Preds'] == 1:
# Model says up — invest everything in reserve
model_shares += model_cash_reserve / price
model_invested += model_cash_reserve
model_cash_reserve = 0
# Portfolio value = shares * current price + uninvested cash
model_values.append(model_shares * price + model_cash_reserve)
test_data['Model_DCA_Value'] = model_values
# ── Total invested (same for both: $50 * n_days) ──
total_invested = daily_amount * len(test_data)
# ── Final values ──
final_dca = test_data['DCA_Value'].iloc[-1]
final_model = test_data['Model_DCA_Value'].iloc[-1]
# ── Plot ──
fig, ax = plt.subplots(figsize=(14, 7))
# Total invested line (baseline)
invested_line = [daily_amount * (i + 1) for i in range(len(test_data))]
ax.plot(test_data.index, invested_line,
color='#AAAAAA', linewidth=1.5, linestyle=':', alpha=0.7, label='Total Invested')
ax.plot(test_data.index, test_data['DCA_Value'],
color='#888888', linewidth=2, alpha=0.8, label='DCA ($50/day)')
ax.plot(test_data.index, test_data['Model_DCA_Value'],
color='#1C69D4', linewidth=2.5, label='Model-Guided DCA')
# Fill between strategies
ax.fill_between(test_data.index,
test_data['Model_DCA_Value'],
test_data['DCA_Value'],
where=test_data['Model_DCA_Value'] >= test_data['DCA_Value'],
alpha=0.15, color='#1C69D4', interpolate=True)
ax.fill_between(test_data.index,
test_data['Model_DCA_Value'],
test_data['DCA_Value'],
where=test_data['Model_DCA_Value'] < test_data['DCA_Value'],
alpha=0.15, color='#E74C3C', interpolate=True)
# Annotate final values
ax.annotate(f'${final_model:,.0f}',
xy=(test_data.index[-1], final_model),
xytext=(15, 10), textcoords='offset points',
fontsize=13, fontweight='bold', color='#1C69D4')
ax.annotate(f'${final_dca:,.0f}',
xy=(test_data.index[-1], final_dca),
xytext=(15, -15), textcoords='offset points',
fontsize=13, fontweight='bold', color='#888888')
ax.annotate(f'${total_invested:,.0f}',
xy=(test_data.index[-1], invested_line[-1]),
xytext=(15, -10), textcoords='offset points',
fontsize=11, color='#AAAAAA')
# Formatting
ax.set_title('Dollar-Cost Averaging: Daily vs. Model-Guided',
fontsize=18, fontweight='bold', pad=15, color='#1A1A2E')
ax.set_xlabel('Date', fontsize=12, color='#4A4A5A')
ax.set_ylabel('Portfolio Value ($)', fontsize=12, color='#4A4A5A')
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, p: f'${x:,.0f}'))
ax.legend(fontsize=12, loc='upper left', framealpha=0.9)
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_facecolor('white')
fig.patch.set_facecolor('white')
plt.tight_layout()
plt.savefig('dca_comparison.png', dpi=150, bbox_inches='tight', facecolor='white')
plt.show()
print(f"\nTotal Invested: ${total_invested:,.2f}")
print(f"DCA Final Value: ${final_dca:,.2f} ({(final_dca/total_invested - 1)*100:+.2f}%)")
print(f"Model-Guided DCA Final Value: ${final_model:,.2f} ({(final_model/total_invested - 1)*100:+.2f}%)")
print(f"Difference: ${final_model - final_dca:,.2f}")
Total Invested: $27,500.00 DCA Final Value: $31,668.73 (+15.16%) Model-Guided DCA Final Value: $31,670.63 (+15.17%) Difference: $1.90
Conclusion¶
The Random Forest model achieves 57% precision at a 0.50 threshold — meaning when it predicts the S&P 500 will increase, it's right more often than not. To test real-world viability, I compared two dollar-cost averaging strategies over the 550-day test period: investing $50 every day vs. only investing when the model predicts an increase (accumulating cash on predicted down days). Both strategies returned ~15.17% on $27,500 invested, ending within $2 of each other. While the model doesn't generate a clear profit edge, it matches passive investing with less market exposure — suggesting the directional signals have value, but further feature engineering or alternative model architectures would be needed to translate that into a consistent trading advantage.