Skip to content

Commit b0f9077

Browse files
authored
Merge pull request #884 from Lumiwealth/feature/thetadata-cache-layout
Refactor ThetaData cache directory layout
2 parents 3b53181 + f1a791e commit b0f9077

File tree

8 files changed

+89
-28
lines changed

8 files changed

+89
-28
lines changed

docs/remote_cache.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,18 +48,19 @@ Notes:
4848
Example ThetaData quote cache on macOS:
4949

5050
```
51-
/Users/<user>/Library/Caches/lumibot/1.0/thetadata/stock_SPY_minute_ohlc.parquet
51+
/Users/<user>/Library/Caches/lumibot/1.0/thetadata/stock/minute/ohlc/stock_SPY_minute_ohlc.parquet
5252
```
5353

5454
With the example configuration above the remote key becomes:
5555

5656
```
57-
prod/cache/v1/thetadata/stock_SPY_minute_ohlc.parquet
57+
prod/cache/v1/thetadata/stock/minute/ohlc/stock_SPY_minute_ohlc.parquet
5858
```
5959

60-
This format aligns with the intended IAM policy layout (provider/asset class
61-
segments) and keeps migration straightforward for other data sources such as
62-
Polygon or DataBento.
60+
This format aligns with the intended IAM policy layout (provider → asset class →
61+
timespan → datastyle) and keeps migration straightforward for other data sources
62+
such as Polygon or DataBento. Option-chain caches now live at
63+
`thetadata/option/option_chains/<symbol>_<date>.parquet`.
6364

6465
## Implementation Overview
6566

lumibot/backtesting/thetadata_backtesting_pandas.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ def _finalize_day_frame(
288288
)
289289

290290
expected_last_dt = self.to_default_timezone(current_dt).replace(hour=0, minute=0, second=0, microsecond=0) - timedelta(days=1)
291-
target_index = pd.date_range(end=expected_last_dt, periods=requested_length, freq="D", tz=self.tzinfo)
291+
expected_last_dt_utc = expected_last_dt.astimezone(pytz.UTC)
292+
target_index = pd.date_range(end=expected_last_dt_utc, periods=requested_length, freq="D", tz=pytz.UTC).tz_convert(self.tzinfo)
292293

293294
# DEBUG-LOG: Target index details
294295
logger.debug(

lumibot/tools/thetadata_helper.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,17 @@
2626
BOOT_GRACE_PERIOD = 5.0
2727
MAX_RESTART_ATTEMPTS = 3
2828

29+
30+
def _resolve_asset_folder(asset_obj: Asset) -> str:
31+
asset_type = getattr(asset_obj, "asset_type", None) or "stock"
32+
asset_key = str(asset_type).strip().lower()
33+
return asset_key
34+
35+
36+
def _normalize_folder_component(value: str, fallback: str) -> str:
37+
normalized = str(value or "").strip().lower().replace(" ", "_")
38+
return normalized or fallback
39+
2940
# Global process tracking for ThetaTerminal
3041
THETA_DATA_PROCESS = None
3142
THETA_DATA_PID = None
@@ -785,7 +796,11 @@ def get_trading_dates(asset: Asset, start: datetime, end: datetime):
785796
def build_cache_filename(asset: Asset, timespan: str, datastyle: str = "ohlc"):
786797
"""Helper function to create the cache filename for a given asset and timespan"""
787798

788-
lumibot_cache_folder = Path(LUMIBOT_CACHE_FOLDER) / CACHE_SUBFOLDER
799+
provider_root = Path(LUMIBOT_CACHE_FOLDER) / CACHE_SUBFOLDER
800+
asset_folder = _resolve_asset_folder(asset)
801+
timespan_folder = _normalize_folder_component(timespan, "unknown")
802+
datastyle_folder = _normalize_folder_component(datastyle, "default")
803+
base_folder = provider_root / asset_folder / timespan_folder / datastyle_folder
789804

790805
# If it's an option then also add the expiration date, strike price and right to the filename
791806
if asset.asset_type == "option":
@@ -799,7 +814,7 @@ def build_cache_filename(asset: Asset, timespan: str, datastyle: str = "ohlc"):
799814
uniq_str = asset.symbol
800815

801816
cache_filename = f"{asset.asset_type}_{uniq_str}_{timespan}_{datastyle}.parquet"
802-
cache_file = lumibot_cache_folder / cache_filename
817+
cache_file = base_folder / cache_filename
803818
return cache_file
804819

805820

@@ -1969,7 +1984,7 @@ def get_chains_cached(
19691984
Retrieve option chain with caching (MATCHES POLYGON PATTERN).
19701985
19711986
This function follows the EXACT same caching strategy as Polygon:
1972-
1. Check cache: LUMIBOT_CACHE_FOLDER/thetadata/option_chains/{symbol}_{date}.parquet
1987+
1. Check cache: LUMIBOT_CACHE_FOLDER/thetadata/<asset-type>/option_chains/{symbol}_{date}.parquet
19731988
2. Reuse files within RECENT_FILE_TOLERANCE_DAYS (default 7 days)
19741989
3. If not found, fetch from ThetaData and save to cache
19751990
4. Use pyarrow engine with snappy compression
@@ -2006,7 +2021,7 @@ def get_chains_cached(
20062021
return None
20072022

20082023
# 2) Build cache folder path
2009-
chain_folder = Path(LUMIBOT_CACHE_FOLDER) / "thetadata" / "option_chains"
2024+
chain_folder = Path(LUMIBOT_CACHE_FOLDER) / "thetadata" / _resolve_asset_folder(asset) / "option_chains"
20102025
chain_folder.mkdir(parents=True, exist_ok=True)
20112026

20122027
# 3) Check for recent cached file (within RECENT_FILE_TOLERANCE_DAYS)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name="lumibot",
8-
version="4.2.1",
8+
version="4.2.3",
99
author="Robert Grzesik",
1010
author_email="rob@lumiwealth.com",
1111
description="Backtesting and Trading Library, Made by Lumiwealth",

tests/test_backtest_cache_manager.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def _build_settings(prefix: str = "prod/cache") -> BacktestCacheSettings:
6767
def test_remote_key_uses_relative_cache_path(tmp_path, monkeypatch):
6868
cache_root = tmp_path / "cache"
6969
cache_root.mkdir()
70-
local_file = cache_root / "thetadata" / "bars" / "spy.parquet"
70+
local_file = cache_root / "thetadata" / "stock" / "minute" / "ohlc" / "stock_SPY_minute_ohlc.parquet"
7171
local_file.parent.mkdir(parents=True, exist_ok=True)
7272

7373
monkeypatch.setattr(backtest_cache, "LUMIBOT_CACHE_FOLDER", cache_root)
@@ -76,17 +76,17 @@ def test_remote_key_uses_relative_cache_path(tmp_path, monkeypatch):
7676
manager = BacktestCacheManager(settings, client_factory=lambda settings: StubS3Client())
7777

7878
remote_key = manager.remote_key_for(local_file)
79-
assert remote_key == "stage/cache/v3/thetadata/bars/spy.parquet"
79+
assert remote_key == "stage/cache/v3/thetadata/stock/minute/ohlc/stock_SPY_minute_ohlc.parquet"
8080

8181

8282
def test_ensure_local_file_downloads_from_s3(tmp_path, monkeypatch):
8383
cache_root = tmp_path / "cache"
8484
cache_root.mkdir()
85-
local_file = cache_root / "thetadata" / "bars" / "spy.parquet"
85+
local_file = cache_root / "thetadata" / "stock" / "minute" / "ohlc" / "stock_SPY_minute_ohlc.parquet"
8686

8787
monkeypatch.setattr(backtest_cache, "LUMIBOT_CACHE_FOLDER", cache_root)
8888

89-
remote_key = "stage/cache/v3/thetadata/bars/spy.parquet"
89+
remote_key = "stage/cache/v3/thetadata/stock/minute/ohlc/stock_SPY_minute_ohlc.parquet"
9090
objects = {("test-bucket", remote_key): b"cached-data"}
9191

9292
stub = StubS3Client(objects)
@@ -101,7 +101,7 @@ def test_ensure_local_file_downloads_from_s3(tmp_path, monkeypatch):
101101
def test_ensure_local_file_handles_missing_remote(tmp_path, monkeypatch):
102102
cache_root = tmp_path / "cache"
103103
cache_root.mkdir()
104-
local_file = cache_root / "thetadata" / "bars" / "spy.parquet"
104+
local_file = cache_root / "thetadata" / "stock" / "minute" / "ohlc" / "stock_SPY_minute_ohlc.parquet"
105105

106106
monkeypatch.setattr(backtest_cache, "LUMIBOT_CACHE_FOLDER", cache_root)
107107

@@ -116,13 +116,13 @@ def test_ensure_local_file_handles_missing_remote(tmp_path, monkeypatch):
116116
def test_on_local_update_uploads_file(tmp_path, monkeypatch):
117117
cache_root = tmp_path / "cache"
118118
cache_root.mkdir()
119-
local_file = cache_root / "thetadata" / "bars" / "spy.parquet"
119+
local_file = cache_root / "thetadata" / "stock" / "minute" / "ohlc" / "stock_SPY_minute_ohlc.parquet"
120120
local_file.parent.mkdir(parents=True, exist_ok=True)
121121
local_file.write_bytes(b"new-data")
122122

123123
monkeypatch.setattr(backtest_cache, "LUMIBOT_CACHE_FOLDER", cache_root)
124124

125-
remote_key = "stage/cache/v3/thetadata/bars/spy.parquet"
125+
remote_key = "stage/cache/v3/thetadata/stock/minute/ohlc/stock_SPY_minute_ohlc.parquet"
126126
stub = StubS3Client({("test-bucket", remote_key): b"old"})
127127
manager = BacktestCacheManager(_build_settings(prefix="stage/cache"), client_factory=lambda s: stub)
128128

tests/test_backtesting_data_source_env.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,9 @@ def test_auto_select_polygon_case_insensitive(self, clean_environment, restore_t
7575
# Configure caplog to capture INFO level logs from lumibot.strategies._strategy
7676
import logging
7777
caplog.set_level(logging.INFO, logger='lumibot.strategies._strategy')
78+
polygon_key = os.environ.get("POLYGON_API_KEY")
79+
if not polygon_key:
80+
pytest.skip("Polygon API key not configured")
7881

7982
with patch.dict(os.environ, {'BACKTESTING_DATA_SOURCE': 'polygon'}):
8083
# Re-import credentials to pick up env change

tests/test_thetadata_helper.py

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,13 @@ def test_get_trading_dates():
324324
def test_build_cache_filename(mocker, tmpdir, datastyle):
325325
asset = Asset("SPY")
326326
timespan = "1D"
327-
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", tmpdir)
328-
expected = tmpdir / "thetadata" / f"stock_SPY_1D_{datastyle}.parquet"
327+
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", str(tmpdir))
328+
expected = tmpdir / "thetadata" / "stock" / "1d" / datastyle / f"stock_SPY_1D_{datastyle}.parquet"
329329
assert thetadata_helper.build_cache_filename(asset, timespan, datastyle) == expected
330330

331331
expire_date = datetime.date(2023, 8, 1)
332332
option_asset = Asset("SPY", asset_type="option", expiration=expire_date, strike=100, right="CALL")
333-
expected = tmpdir / "thetadata" / f"option_SPY_230801_100_CALL_1D_{datastyle}.parquet"
333+
expected = tmpdir / "thetadata" / "option" / "1d" / datastyle / f"option_SPY_230801_100_CALL_1D_{datastyle}.parquet"
334334
assert thetadata_helper.build_cache_filename(option_asset, timespan, datastyle) == expected
335335

336336
# Bad option asset with no expiration
@@ -427,8 +427,8 @@ def test_missing_dates():
427427
],
428428
)
429429
def test_update_cache(mocker, tmpdir, df_all, df_cached, datastyle):
430-
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", tmpdir)
431-
cache_file = Path(tmpdir / "thetadata" / f"stock_SPY_1D_{datastyle}.parquet")
430+
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", str(tmpdir))
431+
cache_file = thetadata_helper.build_cache_filename(Asset("SPY"), "1D", datastyle)
432432

433433
# Empty DataFrame of df_all, don't write cache file
434434
thetadata_helper.update_cache(cache_file, df_all, df_cached)
@@ -550,8 +550,9 @@ def on_local_update(self, local_path, payload=None):
550550
)
551551
def test_load_data_from_cache(mocker, tmpdir, df_cached, datastyle):
552552
# Setup some basics
553-
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", tmpdir)
554-
cache_file = Path(tmpdir / "thetadata" / f"stock_SPY_1D_{datastyle}.parquet")
553+
mocker.patch.object(thetadata_helper, "LUMIBOT_CACHE_FOLDER", str(tmpdir))
554+
asset = Asset("SPY")
555+
cache_file = thetadata_helper.build_cache_filename(asset, "1D", datastyle)
555556

556557
# No cache file should return None (not raise)
557558
assert thetadata_helper.load_cache(cache_file) is None
@@ -1371,8 +1372,8 @@ def test_chains_cache_reuse(self):
13711372

13721373
# CLEAR CACHE to ensure first call downloads fresh data
13731374
# This prevents cache pollution from previous tests in the suite
1374-
# Chains are stored in: LUMIBOT_CACHE_FOLDER / "thetadata" / "option_chains"
1375-
chain_folder = Path(LUMIBOT_CACHE_FOLDER) / "thetadata" / "option_chains"
1375+
# Chains are stored in: LUMIBOT_CACHE_FOLDER / "thetadata" / "option" / "option_chains"
1376+
chain_folder = Path(LUMIBOT_CACHE_FOLDER) / "thetadata" / "option" / "option_chains"
13761377
if chain_folder.exists():
13771378
# Delete all AAPL chain cache files
13781379
for cache_file in chain_folder.glob("AAPL_*.parquet"):
@@ -1407,6 +1408,46 @@ def test_chains_cache_reuse(self):
14071408
assert time2 < time1 * 0.1, f"Cache not working: time1={time1:.2f}s, time2={time2:.2f}s (should be 10x faster)"
14081409
print(f"✓ Cache speedup: {time1/time2:.1f}x faster ({time1:.2f}s -> {time2:.4f}s)")
14091410

1411+
1412+
def test_finalize_day_frame_handles_dst_fallback():
1413+
tz = pytz.timezone("America/New_York")
1414+
utc = pytz.UTC
1415+
frame_index = pd.date_range(
1416+
end=tz.localize(datetime.datetime(2024, 10, 31, 16, 0)),
1417+
periods=5,
1418+
freq="D",
1419+
)
1420+
frame = pd.DataFrame(
1421+
{
1422+
"open": [100 + i for i in range(len(frame_index))],
1423+
"high": [101 + i for i in range(len(frame_index))],
1424+
"low": [99 + i for i in range(len(frame_index))],
1425+
"close": [100.5 + i for i in range(len(frame_index))],
1426+
"volume": [1000 + i for i in range(len(frame_index))],
1427+
},
1428+
index=frame_index,
1429+
)
1430+
1431+
data_source = ThetaDataBacktestingPandas(
1432+
datetime_start=utc.localize(datetime.datetime(2024, 10, 1)),
1433+
datetime_end=utc.localize(datetime.datetime(2024, 11, 5)),
1434+
username="user",
1435+
password="pass",
1436+
use_quote_data=False,
1437+
)
1438+
1439+
current_dt = utc.localize(datetime.datetime(2024, 11, 4, 13, 30))
1440+
result = data_source._finalize_day_frame(
1441+
frame,
1442+
current_dt,
1443+
requested_length=len(frame_index),
1444+
timeshift=None,
1445+
asset=Asset("TSLA"),
1446+
)
1447+
1448+
assert result is not None
1449+
assert len(result) == len(frame_index)
1450+
14101451
def test_chains_strike_format(self):
14111452
"""Test strikes are floats (not integers) and properly converted."""
14121453
username = os.environ.get("THETADATA_USERNAME")

tests/test_thetadata_pandas_verification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def count_cache_files():
4444
cache_dir = get_cache_dir()
4545
if not cache_dir.exists():
4646
return 0
47-
return len(list(cache_dir.glob("*.parquet")))
47+
return sum(1 for _ in cache_dir.rglob("*.parquet"))
4848

4949

5050
class WeeklyMomentumOptionsStrategy(Strategy):

0 commit comments

Comments
 (0)