Skip to content

Commit 7bf0ace

Browse files
committed
修复数据RangeIndex变Index的问题
1 parent bedf7e4 commit 7bf0ace

File tree

6 files changed

+76
-34
lines changed

6 files changed

+76
-34
lines changed

ddump/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.1"
1+
__version__ = "0.3.2"

ddump/api/dump.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,13 @@ def save(self, pre_save=func_pre_save, pre_save_kwargs={}):
191191
192192
"""
193193
dfs = [pre_save(df, **pre_save_kwargs) for key, df in self.dfs.items()]
194-
df = pd.concat(dfs) if len(dfs) > 0 else pd.DataFrame()
194+
if len(dfs) > 0:
195+
if isinstance(dfs[0].index, pd.RangeIndex):
196+
df = pd.concat(dfs, ignore_index=True)
197+
else:
198+
df = pd.concat(dfs)
199+
else:
200+
df = pd.DataFrame()
195201

196202
self.path.mkdir(parents=True, exist_ok=True)
197203

ddump/merge.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99

1010
def merge_files_to_file(path, files,
11-
ignore_index=True,
1211
delete_src=False,
1312
single_overwrite=True):
1413
"""合并文件列表到文件
@@ -19,8 +18,6 @@ def merge_files_to_file(path, files,
1918
目标路径
2019
files: list of Path
2120
源路径列表
22-
ignore_index: bool
23-
合并时是否忽略索引。索引没有意义时忽略能加速
2421
delete_src: bool
2522
是否删除源文件
2623
single_overwrite: bool
@@ -64,14 +61,20 @@ def merge_files_to_file(path, files,
6461
# 合并,希望内存够
6562
logger.info('合并 {} 至 {} 等 {}个文件。是否删除?{}', h.name, t.name, len(files), delete_src)
6663
dfs = [d for d in dfs if not d.empty]
67-
dfs = pd.concat(dfs, ignore_index=ignore_index)
64+
if len(dfs) > 0:
65+
if isinstance(dfs[0].index, pd.RangeIndex):
66+
df = pd.concat(dfs, ignore_index=True)
67+
else:
68+
df = pd.concat(dfs)
69+
else:
70+
df = pd.DataFrame()
6871

6972
file_temp = path.with_suffix('.tmp')
7073
logger.info('写入文件:{}', file_temp)
7174

7275
# 写入临时文件
7376
path.parent.mkdir(parents=True, exist_ok=True)
74-
dfs.to_parquet(file_temp, compression='zstd')
77+
df.to_parquet(file_temp, compression='zstd')
7578

7679
# 全删
7780
if delete_src:
@@ -84,8 +87,7 @@ def merge_files_to_file(path, files,
8487
file_temp.rename(path)
8588

8689

87-
def merge_files_dict(files_dict,
88-
ignore_index=False, delete_src=False):
90+
def merge_files_dict(files_dict, delete_src=False):
8991
"""合并特殊字典。
9092
9193
key为路径
@@ -94,7 +96,7 @@ def merge_files_dict(files_dict,
9496
for i, kv in enumerate(files_dict):
9597
# 最后N个单文件总是试着覆盖
9698
single_overwrite = i >= len(files_dict) - 3
97-
merge_files_to_file(kv['to'], kv['from'], ignore_index, delete_src, single_overwrite)
99+
merge_files_to_file(kv['to'], kv['from'], delete_src, single_overwrite)
98100

99101

100102
def check_include(start1, end1, start2, end2):

examples/fix_index.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
部分老数据很早以前下载,当时没有考虑到Index部分混乱问题
3+
现在将老数据的Index重置一下
4+
5+
需手工修改路径,以后基本就不用动了
6+
"""
7+
import pandas as pd
8+
9+
10+
def update_index(path, update=False):
    """Inspect the index of one parquet file and optionally reset it.

    Parameters
    ----------
    path:
        Location of the parquet file; passed straight to ``pd.read_parquet``
        and, when rewriting, to ``DataFrame.to_parquet``.
    update: bool
        When False (default) the file is only inspected and the findings are
        printed. When True, a file whose index is an unnamed integer index
        (but not a ``RangeIndex``) is rewritten in place with the index
        dropped and rebuilt via ``reset_index(drop=True)``.

    Returns
    -------
    None
    """
    frame = pd.read_parquet(path)
    index = frame.index

    # Already a RangeIndex: report monotonicity and leave the file untouched.
    if isinstance(index, pd.RangeIndex):
        print(index.is_monotonic_increasing, index.is_monotonic_decreasing)
        return

    # Only an unnamed integer index is a candidate for resetting; any other
    # index (named, or non-integer) is left alone.
    needs_reset = pd.api.types.is_integer_dtype(index.dtype) and index.name is None
    if not needs_reset:
        return

    print(frame.head(2))
    print(index.is_monotonic_increasing, index.is_monotonic_decreasing)

    # Dry run: the information above is all the caller asked for.
    if not update:
        return

    rebuilt = frame.reset_index(drop=True)
    # Drop the references to the original data before writing.
    del frame, index
    rebuilt.to_parquet(path, compression='zstd')
22+
23+
24+
import pathlib
25+
26+
# 修改路径观察指定文件夹
27+
files = pathlib.Path(r"F:\data\jqresearch\get_extras_stock_is_st").glob("*.parquet")
28+
# 先观察数据是否要改,然后改成True进行更新
29+
update = False
30+
31+
for i, file in enumerate(files):
32+
print(file)
33+
update_index(file, update=update)
34+
if not update:
35+
if i > 20:
36+
break

examples/preprocessing/merge_jqresearch.py

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,29 +19,28 @@
1919

2020
def get_paths(root):
2121
return [
22-
(rf'{root}\get_extras_stock_is_st', False),
23-
(rf'{root}\get_industry_stock', False),
24-
(rf'{root}\get_price_stock_factor', False),
25-
(rf'{root}\get_price_stock_daily', False),
22+
rf'{root}\get_extras_stock_is_st',
23+
rf'{root}\get_industry_stock',
24+
rf'{root}\get_price_stock_factor',
25+
rf'{root}\get_price_stock_daily',
2626

27-
# ignore_index=True 表示合并时丢弃索引,因为索引不含有效信息
28-
(rf'{root}\get_fundamentals_balance', True),
29-
(rf'{root}\get_fundamentals_cash_flow', True),
30-
(rf'{root}\get_fundamentals_income', True),
31-
(rf'{root}\get_fundamentals_indicator', True),
32-
(rf'{root}\get_fundamentals_valuation', True),
33-
(rf'{root}\get_STK_XR_XD', True),
34-
(rf'{root}\get_STK_BALANCE_SHEET', True),
35-
(rf'{root}\get_STK_CASHFLOW_STATEMENT', True),
36-
(rf'{root}\get_STK_INCOME_STATEMENT', True),
27+
rf'{root}\get_fundamentals_balance',
28+
rf'{root}\get_fundamentals_cash_flow',
29+
rf'{root}\get_fundamentals_income',
30+
rf'{root}\get_fundamentals_indicator',
31+
rf'{root}\get_fundamentals_valuation',
32+
rf'{root}\get_STK_XR_XD',
33+
rf'{root}\get_STK_BALANCE_SHEET',
34+
rf'{root}\get_STK_CASHFLOW_STATEMENT',
35+
rf'{root}\get_STK_INCOME_STATEMENT',
3736

38-
(rf'{root}\get_index_weights\000016.XSHG', False),
39-
(rf'{root}\get_index_weights\000300.XSHG', False),
40-
(rf'{root}\get_index_weights\000852.XSHG', False),
41-
(rf'{root}\get_index_weights\000905.XSHG', False),
37+
rf'{root}\get_index_weights\000016.XSHG',
38+
rf'{root}\get_index_weights\000300.XSHG',
39+
rf'{root}\get_index_weights\000852.XSHG',
40+
rf'{root}\get_index_weights\000905.XSHG',
4241

43-
(rf'{root}\get_price_futures_daily', False),
44-
(rf'{root}\get_dominant_futures', False),
42+
rf'{root}\get_price_futures_daily',
43+
rf'{root}\get_dominant_futures',
4544
]
4645

4746

@@ -53,12 +52,12 @@ def main():
5352

5453
paths1 = get_paths(PATH_INPUT1)
5554
paths2 = get_paths(PATH_OUTPUT)
56-
for (path1, _), (path2, _) in zip(paths1, paths2):
55+
for path1, path2 in zip(paths1, paths2):
5756
logger.info('=' * 60, )
5857
path1 = pathlib.Path(path1)
5958
path2 = pathlib.Path(path2)
6059
files = path_groupby_date(path1, path2)
61-
merge_files_dict(files, ignore_index=_, delete_src=False)
60+
merge_files_dict(files, delete_src=False)
6261
remove_sub_range(path2)
6362

6463

examples/preprocessing/step1.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ def step1(ROOT) -> pl.DataFrame:
2929
df1 = (
3030
pl.read_parquet(PATH_INPUT1, use_pyarrow=True)
3131
# .with_columns(pl.col('paused').cast(pl.Boolean))
32-
.drop(['__index_level_0__'])
3332
.sort(by=['code', 'time'])
3433
)
3534

@@ -56,7 +55,7 @@ def step1(ROOT) -> pl.DataFrame:
5655
df5 = (
5756
pl.read_parquet(PATH_INPUT5, use_pyarrow=True)
5857
.rename({'day': 'time'})
59-
.drop(['id', 'pubDate', '__index_level_0__'])
58+
.drop(['id', 'pubDate'])
6059
.with_columns(pl.col('time').str.strptime(pl.Datetime, "%Y-%m-%d"))
6160
)
6261

0 commit comments

Comments
 (0)