import pandas as pd
import yfinance as yf
import requests
from requests_cache import CachedSession

sp500 = yf.Ticker("^GSPC").history(period='max')
sp500

sp500.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 24686 entries, 1927-12-30 00:00:00-05:00 to 2026-04-13 00:00:00-04:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          24686 non-null  float64
 1   High          24686 non-null  float64
 2   Low           24686 non-null  float64
 3   Close         24686 non-null  float64
 4   Volume        24686 non-null  int64  
 5   Dividends     24686 non-null  float64
 6   Stock Splits  24686 non-null  float64
dtypes: float64(6), int64(1)
memory usage: 1.5 MB

#These columns only have a 0 value
sp500['Stock Splits'].value_counts()
sp500['Dividends'].value_counts()
#So we can delete them
sp500= sp500.drop(columns = ['Dividends', 'Stock Splits'])

sp500.plot.line(y = 'Close', use_index=True)

<Axes: xlabel='Date'>

# nothing is going on until around 1990 so we keep everything after it
sp500 = sp500.loc['1990-01-01':].copy()
sp500

# Create a new column 'Tomorrow' that contains the closing price of the next day
# by shifting the 'Close' column up by one row (-1).
sp500['Tomorrow'] = sp500['Close'].shift(-1)
#create a target, 1 if the price tomorrow is higher than today
sp500['Target'] = (sp500['Tomorrow']>sp500['Close']).astype(int)
sp500

import pandas as pd
import requests
from io import StringIO

url = "https://fred.stlouisfed.org/graph/fredgraph.csv?id=DFF"
response = requests.get(url, verify=True)  # Ensure SSL verification

if response.status_code == 200:
    data = StringIO(response.text)
    federal_rates = pd.read_csv(data)
    federal_rates['Date'] = pd.to_datetime(federal_rates['observation_date'], format='%Y-%m-%d')
    print(federal_rates.info())
    print(federal_rates.head(10))
else:
    print(f"Failed to download data: {response.status_code}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26217 entries, 0 to 26216
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   observation_date  26217 non-null  object        
 1   DFF               26217 non-null  float64       
 2   Date              26217 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 614.6+ KB
None
  observation_date   DFF       Date
0       1954-07-01  1.13 1954-07-01
1       1954-07-02  1.25 1954-07-02
2       1954-07-03  1.25 1954-07-03
3       1954-07-04  1.25 1954-07-04
4       1954-07-05  0.88 1954-07-05
5       1954-07-06  0.25 1954-07-06
6       1954-07-07  1.00 1954-07-07
7       1954-07-08  1.25 1954-07-08
8       1954-07-09  1.25 1954-07-09
9       1954-07-10  1.25 1954-07-10

# Average of the last 4 rows for the 'DFF' column
average_dff = federal_rates['DFF'].tail(4).mean()

# Get the last date and add one day
next_day_date = federal_rates['Date'].iloc[-1] + pd.Timedelta(days=1)

# Check if the next day is Saturday or Sunday, and adjust to Monday if needed
if next_day_date.weekday() == 5:  # Saturday
    next_day_date += pd.Timedelta(days=2)  # Move to Monday
elif next_day_date.weekday() == 6:  # Sunday
    next_day_date += pd.Timedelta(days=1)  # Move to Monday

# Create a new row dictionary
next_day_row = {
    'Date': next_day_date,
    'DFF': average_dff  # Use the computed average for the 'DFF' column
}

# Append the new row to the DataFrame
federal_rates = pd.concat([federal_rates, pd.DataFrame([next_day_row]) ], ignore_index=True)

# Show the updated DataFrame
print(federal_rates.tail())
print(federal_rates.head())

      observation_date   DFF       Date
26213       2026-04-07  3.64 2026-04-07
26214       2026-04-08  3.64 2026-04-08
26215       2026-04-09  3.64 2026-04-09
26216       2026-04-10  3.64 2026-04-10
26217              NaN  3.64 2026-04-13
  observation_date   DFF       Date
0       1954-07-01  1.13 1954-07-01
1       1954-07-02  1.25 1954-07-02
2       1954-07-03  1.25 1954-07-03
3       1954-07-04  1.25 1954-07-04
4       1954-07-05  0.88 1954-07-05

# Remove the timezone from sp500['Date']
sp500 = sp500.reset_index()
sp500['Date'] = sp500['Date'].dt.tz_localize(None)

# Now perform the merge
sp500_with_rates = pd.merge(sp500, federal_rates, on='Date')
sp500_with_rates.drop(columns=['observation_date'],inplace = True)
sp500 = sp500_with_rates

sp500
sp500.set_index('Date', inplace=True)

# Define different time windows for calculating features
days = [2, 7, 30, 90, 365]
features = []

# Create new feature columns based on rolling averages and trends
for day in days:
    # Calculate rolling averages for the specified window
    rolling_averages = sp500.rolling(day).mean()

    # Create a feature for the ratio of the current Close price to its rolling average
    ratio_close_column = f"Close_Ratio_{day}"
    sp500[ratio_close_column] = sp500['Close'] / rolling_averages['Close']

    # Create a feature for the market trend based on the sum of previous Target values
    trend_column = f"Trend_{day}"
    sp500[trend_column] = sp500.shift(1).rolling(day).sum()['Target']

    # Create a feature for the ratio of the current DFF to its rolling average
    ratio_DFF_column = f"DFF_ratio_{day}"
    sp500[ratio_DFF_column] = sp500['DFF'] / rolling_averages['DFF']

    # Add the created features to the list of features
    features += [ratio_close_column, trend_column, ratio_DFF_column]


# Save the last row of the DataFrame before dropping it, this wil be the input for prediction at the end of the code
row_to_predict = sp500.tail(1)

# Drop rows with missing values
sp500.dropna(inplace=True)

sp500

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score,accuracy_score, classification_report


# Define and train the model
model = RandomForestClassifier(n_estimators=150, min_samples_split=50, max_depth = 15 , random_state=1)

train = sp500.iloc[:-550]
test = sp500.iloc[-550:]

# Train the model
model.fit(train[features], train['Target'])

# Predict probabilities for the test set, and only the rows greater than 0.6 will be 1
probs = model.predict_proba(test[features])[:, 1]
threshold = 0.60
preds = (probs >= threshold).astype(int)

# ── Threshold Comparison ──
# Paste this right after: preds = (probs >= threshold).astype(int)
 
from sklearn.metrics import precision_score, recall_score, f1_score
 
thresholds = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80]
rows = []
 
for t in thresholds:
    p = (probs >= t).astype(int)
    rows.append({
        'Threshold':    t,
        'Precision':    round(precision_score(test['Target'], p, zero_division=0), 4),
        'Recall':       round(recall_score(test['Target'], p, zero_division=0), 4),
        'F1 Score':     round(f1_score(test['Target'], p, zero_division=0), 4),
        'Predicted Up': int(p.sum()),
        'Total Test':   len(p)
    })
 
results = pd.DataFrame(rows)
print(results.to_string(index=False))

 Threshold  Precision  Recall  F1 Score  Predicted Up  Total Test
      0.50     0.5729  0.9199    0.7060           501         550
      0.55     0.5810  0.5865    0.5837           315         550
      0.60     0.6452  0.0641    0.1166            31         550
      0.65     0.5000  0.0064    0.0127             4         550
      0.70     0.0000  0.0000    0.0000             0         550
      0.75     0.0000  0.0000    0.0000             0         550
      0.80     0.0000  0.0000    0.0000             0         550

# calling again the previously saved last row
row_to_predict[features]

# Make a prediction for the next day's market movement
prediction = model.predict(row_to_predict[features])

# Display the prediction
print(f"Predicted market movement for tomorrow: {'It will INCREASE' if prediction[0] == 1 else 'It will DECREASE'}")

Predicted market movement for tomorrow: It will DECREASE

# Paste this after your model predictions cell

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np

# -- Recalculate preds at 0.55 if needed --
threshold = 0.50
preds = (probs >= threshold).astype(int)

test_data = test.copy()
test_data['Preds'] = preds

daily_amount = 50  # dollars per day

# ── Scenario 1: Buy $50 every day (standard DCA) ──
dca_shares = 0
dca_invested = 0
dca_values = []

for i, (date, row) in enumerate(test_data.iterrows()):
    price = row['Close']
    dca_shares += daily_amount / price
    dca_invested += daily_amount
    dca_values.append(dca_shares * price)

test_data['DCA_Value'] = dca_values

# ── Scenario 2: Model-guided DCA ──
# Buy only when model predicts up.
# When model predicts down, accumulate the $50.
# Next time model predicts up, invest all accumulated cash.
model_shares = 0
model_cash_reserve = 0
model_invested = 0
model_values = []

for i, (date, row) in enumerate(test_data.iterrows()):
    price = row['Close']
    model_cash_reserve += daily_amount  # add daily $50 to reserve

    if row['Preds'] == 1:
        # Model says up — invest everything in reserve
        model_shares += model_cash_reserve / price
        model_invested += model_cash_reserve
        model_cash_reserve = 0

    # Portfolio value = shares * current price + uninvested cash
    model_values.append(model_shares * price + model_cash_reserve)

test_data['Model_DCA_Value'] = model_values

# ── Total invested (same for both: $50 * n_days) ──
total_invested = daily_amount * len(test_data)

# ── Final values ──
final_dca = test_data['DCA_Value'].iloc[-1]
final_model = test_data['Model_DCA_Value'].iloc[-1]

# ── Plot ──
fig, ax = plt.subplots(figsize=(14, 7))

# Total invested line (baseline)
invested_line = [daily_amount * (i + 1) for i in range(len(test_data))]
ax.plot(test_data.index, invested_line,
        color='#AAAAAA', linewidth=1.5, linestyle=':', alpha=0.7, label='Total Invested')

ax.plot(test_data.index, test_data['DCA_Value'],
        color='#888888', linewidth=2, alpha=0.8, label='DCA ($50/day)')
ax.plot(test_data.index, test_data['Model_DCA_Value'],
        color='#1C69D4', linewidth=2.5, label='Model-Guided DCA')

# Fill between strategies
ax.fill_between(test_data.index,
                test_data['Model_DCA_Value'],
                test_data['DCA_Value'],
                where=test_data['Model_DCA_Value'] >= test_data['DCA_Value'],
                alpha=0.15, color='#1C69D4', interpolate=True)
ax.fill_between(test_data.index,
                test_data['Model_DCA_Value'],
                test_data['DCA_Value'],
                where=test_data['Model_DCA_Value'] < test_data['DCA_Value'],
                alpha=0.15, color='#E74C3C', interpolate=True)

# Annotate final values
ax.annotate(f'${final_model:,.0f}',
            xy=(test_data.index[-1], final_model),
            xytext=(15, 10), textcoords='offset points',
            fontsize=13, fontweight='bold', color='#1C69D4')
ax.annotate(f'${final_dca:,.0f}',
            xy=(test_data.index[-1], final_dca),
            xytext=(15, -15), textcoords='offset points',
            fontsize=13, fontweight='bold', color='#888888')
ax.annotate(f'${total_invested:,.0f}',
            xy=(test_data.index[-1], invested_line[-1]),
            xytext=(15, -10), textcoords='offset points',
            fontsize=11, color='#AAAAAA')

# Formatting
ax.set_title('Dollar-Cost Averaging: Daily vs. Model-Guided',
             fontsize=18, fontweight='bold', pad=15, color='#1A1A2E')
ax.set_xlabel('Date', fontsize=12, color='#4A4A5A')
ax.set_ylabel('Portfolio Value ($)', fontsize=12, color='#4A4A5A')
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, p: f'${x:,.0f}'))
ax.legend(fontsize=12, loc='upper left', framealpha=0.9)
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_facecolor('white')
fig.patch.set_facecolor('white')

plt.tight_layout()
plt.savefig('dca_comparison.png', dpi=150, bbox_inches='tight', facecolor='white')
plt.show()

print(f"\nTotal Invested:              ${total_invested:,.2f}")
print(f"DCA Final Value:             ${final_dca:,.2f}  ({(final_dca/total_invested - 1)*100:+.2f}%)")
print(f"Model-Guided DCA Final Value: ${final_model:,.2f}  ({(final_model/total_invested - 1)*100:+.2f}%)")
print(f"Difference:                   ${final_model - final_dca:,.2f}")

Total Invested:              $27,500.00
DCA Final Value:             $31,668.73  (+15.16%)
Model-Guided DCA Final Value: $31,670.63  (+15.17%)
Difference:                   $1.90

	Open	High	Low	Close	Volume	Dividends	Stock Splits
Date
1927-12-30 00:00:00-05:00	17.660000	17.660000	17.660000	17.660000	0	0.0	0.0
1928-01-03 00:00:00-05:00	17.760000	17.760000	17.760000	17.760000	0	0.0	0.0
1928-01-04 00:00:00-05:00	17.719999	17.719999	17.719999	17.719999	0	0.0	0.0
1928-01-05 00:00:00-05:00	17.549999	17.549999	17.549999	17.549999	0	0.0	0.0
1928-01-06 00:00:00-05:00	17.660000	17.660000	17.660000	17.660000	0	0.0	0.0
...	...	...	...	...	...	...	...
2026-04-07 00:00:00-04:00	6601.930176	6618.259766	6534.549805	6616.850098	4555680000	0.0	0.0
2026-04-08 00:00:00-04:00	6754.359863	6793.500000	6740.279785	6782.810059	5904880000	0.0	0.0
2026-04-09 00:00:00-04:00	6783.689941	6835.310059	6761.549805	6824.660156	4912410000	0.0	0.0
2026-04-10 00:00:00-04:00	6839.240234	6845.770020	6808.459961	6816.890137	4393220000	0.0	0.0
2026-04-13 00:00:00-04:00	6806.470215	6887.000000	6790.020020	6886.240234	2881481000	0.0	0.0

	Open	High	Low	Close	Volume
Date
1990-01-02 00:00:00-05:00	353.399994	359.690002	351.980011	359.690002	162070000
1990-01-03 00:00:00-05:00	359.690002	360.589996	357.890015	358.760010	192330000
1990-01-04 00:00:00-05:00	358.760010	358.760010	352.890015	355.670013	177000000
1990-01-05 00:00:00-05:00	355.670013	355.670013	351.350006	352.200012	158530000
1990-01-08 00:00:00-05:00	352.200012	354.239990	350.540009	353.790009	140110000
...	...	...	...	...	...
2026-04-07 00:00:00-04:00	6601.930176	6618.259766	6534.549805	6616.850098	4555680000
2026-04-08 00:00:00-04:00	6754.359863	6793.500000	6740.279785	6782.810059	5904880000
2026-04-09 00:00:00-04:00	6783.689941	6835.310059	6761.549805	6824.660156	4912410000
2026-04-10 00:00:00-04:00	6839.240234	6845.770020	6808.459961	6816.890137	4393220000
2026-04-13 00:00:00-04:00	6806.470215	6887.000000	6790.020020	6886.240234	2881481000

	Open	High	Low	Close	Volume	Tomorrow	Target
Date
1990-01-02 00:00:00-05:00	353.399994	359.690002	351.980011	359.690002	162070000	358.760010	0
1990-01-03 00:00:00-05:00	359.690002	360.589996	357.890015	358.760010	192330000	355.670013	0
1990-01-04 00:00:00-05:00	358.760010	358.760010	352.890015	355.670013	177000000	352.200012	0
1990-01-05 00:00:00-05:00	355.670013	355.670013	351.350006	352.200012	158530000	353.790009	1
1990-01-08 00:00:00-05:00	352.200012	354.239990	350.540009	353.790009	140110000	349.619995	0
...	...	...	...	...	...	...	...
2026-04-07 00:00:00-04:00	6601.930176	6618.259766	6534.549805	6616.850098	4555680000	6782.810059	1
2026-04-08 00:00:00-04:00	6754.359863	6793.500000	6740.279785	6782.810059	5904880000	6824.660156	1
2026-04-09 00:00:00-04:00	6783.689941	6835.310059	6761.549805	6824.660156	4912410000	6816.890137	0
2026-04-10 00:00:00-04:00	6839.240234	6845.770020	6808.459961	6816.890137	4393220000	6886.240234	1
2026-04-13 00:00:00-04:00	6806.470215	6887.000000	6790.020020	6886.240234	2881481000	NaN	0

	Open	High	Low	Close	Volume	Tomorrow	Target	DFF	Close_Ratio_2	Trend_2	...	DFF_ratio_7	Close_Ratio_30	Trend_30	DFF_ratio_30	Close_Ratio_90	Trend_90	DFF_ratio_90	Close_Ratio_365	Trend_365	DFF_ratio_365
Date
1991-06-12	381.049988	381.049988	374.459991	376.649994	166140000	377.630005	1	5.67	0.994193	1.0	...	0.976144	0.993271	16.0	0.975009	1.005457	46.0	0.940472	1.095258	191.0	0.752889
1991-06-13	376.649994	377.899994	376.079987	377.630005	145650000	382.290009	1	5.79	1.001299	1.0	...	1.000988	0.996088	16.0	0.996272	1.007198	46.0	0.960925	1.097942	192.0	0.769546
1991-06-14	377.630005	382.299988	377.630005	382.290009	167950000	380.130005	0	5.71	1.006132	2.0	...	0.991319	1.008223	16.0	0.983352	1.018691	46.0	0.947264	1.111256	193.0	0.759627
1991-06-17	382.299988	382.309998	380.130005	380.130005	134230000	378.589996	0	5.94	0.997167	1.0	...	1.028190	1.002586	15.0	1.021906	1.012274	45.0	0.984748	1.104731	193.0	0.790876
1991-06-18	380.130005	381.829987	377.989990	378.589996	155200000	375.089996	0	5.82	0.997970	0.0	...	1.004686	0.998655	15.0	1.001262	1.007515	45.0	0.965850	1.100038	192.0	0.775592
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2026-04-06	6587.660156	6618.129883	6579.720215	6611.830078	3906440000	6616.850098	1	3.64	1.002209	2.0	...	1.000000	0.988666	13.0	1.000000	0.969743	47.0	0.991016	1.052501	204.0	0.873430
2026-04-07	6601.930176	6618.259766	6534.549805	6616.850098	4555680000	6782.810059	1	3.64	1.000379	2.0	...	1.000000	0.990508	14.0	1.000000	0.970619	47.0	0.991736	1.052955	204.0	0.874113
2026-04-08	6754.359863	6793.500000	6740.279785	6782.810059	5904880000	6824.660156	1	3.64	1.012385	2.0	...	1.000000	1.015895	14.0	1.000000	0.994936	47.0	0.992457	1.078927	205.0	0.874798
2026-04-09	6783.689941	6835.310059	6761.549805	6824.660156	4912410000	6816.890137	0	3.64	1.003076	2.0	...	1.000000	1.022783	14.0	1.000000	1.001055	47.0	0.993179	1.085124	206.0	0.875484
2026-04-10	6839.240234	6845.770020	6808.459961	6816.890137	4393220000	6886.240234	1	3.64	0.999430	1.0	...	1.000000	1.022088	14.0	1.000000	0.999968	46.0	0.993932	1.083408	206.0	0.876171

	Close_Ratio_2	Trend_2	DFF_ratio_2	Close_Ratio_7	Trend_7	DFF_ratio_7	Close_Ratio_30	Trend_30	DFF_ratio_30	Close_Ratio_90	Trend_90	DFF_ratio_90	Close_Ratio_365	Trend_365	DFF_ratio_365
Date
2026-04-13	1.005061	1.0	1.0	1.022956	6.0	1.0	1.032448	15.0	1.0	1.01002	47.0	0.994687	1.093917	206.0	0.87686

DATA PREPROCESSING¶

MODEL¶

RESULTS AND METRICS¶

PREDICTING TOMORROW OUTCOME¶

── DCA vs. Model-Guided DCA Strategy ──¶

Conclusion¶