Skip to content

Commit 80982f8

Browse files
authored
feat: check lowercase naming for qlib features directories (#2087)
* feat: check lowercase naming for qlib features directories * docs: add background reference for lowercase features dir check
1 parent 477160e commit 80982f8

File tree

1 file changed

+47
-2
lines changed

1 file changed

+47
-2
lines changed

scripts/check_data_health.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
from loguru import logger
21
import os
32
from typing import Optional
43

54
import fire
65
import pandas as pd
7-
import qlib
6+
from loguru import logger
87
from tqdm import tqdm
98

9+
import qlib
1010
from qlib.data import D
1111

1212

@@ -36,6 +36,7 @@ def __init__(
3636
self.large_step_threshold_price = large_step_threshold_price
3737
self.large_step_threshold_volume = large_step_threshold_volume
3838
self.missing_data_num = missing_data_num
39+
self.qlib_dir = os.path.abspath(os.path.expanduser(qlib_dir))
3940

4041
if csv_path:
4142
assert os.path.isdir(csv_path), f"{csv_path} should be a directory."
@@ -68,6 +69,43 @@ def load_qlib_data(self):
6869
self.data[instrument] = df
6970
print(df)
7071

72+
# NOTE:
73+
# This check is added due to a known issue in Qlib where feature paths
74+
# are constructed using lowercased instrument names. On case-sensitive
75+
# file systems (e.g. Linux), uppercase directory names under `features/`
76+
# will cause data loading failures.
77+
#
78+
# See: https://github.com/microsoft/qlib/issues/2053
79+
def check_features_dir_lowercase(self) -> Optional[pd.DataFrame]:
    """
    Check whether all subdirectories under `<qlib_dir>/features` are named in lowercase.

    This validation helps prevent data loading issues on case-sensitive
    file systems caused by uppercase instrument directory names.

    Returns
    -------
    Optional[pd.DataFrame]
        A DataFrame with a single ``non_lowercase_dir`` column listing (in
        sorted order) every subdirectory whose name is not fully lowercase,
        or ``None`` when ``qlib_dir`` is empty, the ``features`` directory
        does not exist, or every subdirectory is already lowercase.
    """
    if not self.qlib_dir:
        return None

    features_dir = os.path.join(self.qlib_dir, "features")
    if not os.path.isdir(features_dir):
        logger.warning(f"`features` directory not found under {self.qlib_dir}")
        return None

    # `os.scandir` yields the entry type together with the name, avoiding an
    # extra stat per entry. The directory listing order is arbitrary, so the
    # names are sorted to keep the report stable between runs.
    with os.scandir(features_dir) as entries:
        bad_dirs = sorted(
            entry.name for entry in entries if entry.is_dir() and entry.name != entry.name.lower()
        )

    if bad_dirs:
        return pd.DataFrame({"non_lowercase_dir": bad_dirs})

    logger.info(f"✅ All subdirectories under `{features_dir}` are named in lowercase.")
    return None
108+
71109
def check_missing_data(self) -> Optional[pd.DataFrame]:
72110
"""Check if any data is missing in the DataFrame."""
73111
result_dict = {
@@ -177,11 +215,13 @@ def check_data(self):
177215
check_large_step_changes_result = self.check_large_step_changes()
178216
check_required_columns_result = self.check_required_columns()
179217
check_missing_factor_result = self.check_missing_factor()
218+
check_features_dir_case_result = self.check_features_dir_lowercase()
180219
if (
181220
check_large_step_changes_result is not None
182221
or check_large_step_changes_result is not None
183222
or check_required_columns_result is not None
184223
or check_missing_factor_result is not None
224+
or check_features_dir_case_result is not None
185225
):
186226
print(f"\nSummary of data health check ({len(self.data)} files checked):")
187227
print("-------------------------------------------------")
@@ -197,6 +237,11 @@ def check_data(self):
197237
if isinstance(check_missing_factor_result, pd.DataFrame):
198238
logger.warning(f"The factor column does not exist or is empty")
199239
print(check_missing_factor_result)
240+
if isinstance(check_features_dir_case_result, pd.DataFrame):
241+
logger.warning(
242+
f"Some subdirectories under `{os.path.join(self.qlib_dir, 'features')}` contain uppercase letters, please rename them to lowercase manually."
243+
)
244+
print(check_features_dir_case_result)
200245

201246

202247
if __name__ == "__main__":

0 commit comments

Comments
 (0)