修复数据RangeIndex变Index的问题

wukan1986 · wukan1986 · commit 7bf0ace0ecad · 2025-09-21T23:21:44.000+08:00
diff --git a/ddump/_version.py b/ddump/_version.py
@@ -1 +1 @@
-__version__ = "0.3.1"
+__version__ = "0.3.2"
diff --git a/ddump/api/dump.py b/ddump/api/dump.py
@@ -191,7 +191,13 @@ def save(self, pre_save=func_pre_save, pre_save_kwargs={}):
 
         """
         dfs = [pre_save(df, **pre_save_kwargs) for key, df in self.dfs.items()]
-        df = pd.concat(dfs) if len(dfs) > 0 else pd.DataFrame()
+        if len(dfs) > 0:
+            if isinstance(dfs[0].index, pd.RangeIndex):
+                df = pd.concat(dfs, ignore_index=True)
+            else:
+                df = pd.concat(dfs)
+        else:
+            df = pd.DataFrame()
 
         self.path.mkdir(parents=True, exist_ok=True)
 
diff --git a/ddump/merge.py b/ddump/merge.py
@@ -8,7 +8,6 @@
 
 
 def merge_files_to_file(path, files,
-                        ignore_index=True,
                         delete_src=False,
                         single_overwrite=True):
     """合并件列表到文件
@@ -19,8 +18,6 @@ def merge_files_to_file(path, files,
         目标路径
     files: list of Path
         源路径列表
-    ignore_index: bool
-        合并时是否忽略索引。索引没有意义时忽略能加速
     delete_src: bool
         是否删除源文件
     single_overwrite: bool
@@ -64,14 +61,20 @@ def merge_files_to_file(path, files,
     # 合并,希望内存够
     logger.info('合并 {} 至 {} 等 {}个文件。是否删除?{}', h.name, t.name, len(files), delete_src)
     dfs = [d for d in dfs if not d.empty]
-    dfs = pd.concat(dfs, ignore_index=ignore_index)
+    if len(dfs) > 0:
+        if isinstance(dfs[0].index, pd.RangeIndex):
+            df = pd.concat(dfs, ignore_index=True)
+        else:
+            df = pd.concat(dfs)
+    else:
+        df = pd.DataFrame()
 
     file_temp = path.with_suffix('.tmp')
     logger.info('写入文件：{}', file_temp)
 
     # 写入临时文件
     path.parent.mkdir(parents=True, exist_ok=True)
-    dfs.to_parquet(file_temp, compression='zstd')
+    df.to_parquet(file_temp, compression='zstd')
 
     # 全删
     if delete_src:
@@ -84,8 +87,7 @@ def merge_files_to_file(path, files,
     file_temp.rename(path)
 
 
-def merge_files_dict(files_dict,
-                     ignore_index=False, delete_src=False):
+def merge_files_dict(files_dict, delete_src=False):
     """合并特殊字典。
 
     key为路径
@@ -94,7 +96,7 @@ def merge_files_dict(files_dict,
     for i, kv in enumerate(files_dict):
         # 最后N个单文件总是试着覆盖
         single_overwrite = i >= len(files_dict) - 3
-        merge_files_to_file(kv['to'], kv['from'], ignore_index, delete_src, single_overwrite)
+        merge_files_to_file(kv['to'], kv['from'], delete_src, single_overwrite)
 
 
 def check_include(start1, end1, start2, end2):
diff --git a/examples/fix_index.py b/examples/fix_index.py
@@ -0,0 +1,36 @@
+"""
+部分老数据很早以前下载，当时没有考虑到Index部分混乱问题
+现在将老数据的Index重置一下
+
+需手工修改路径，以后基本就不用动了
+"""
+import pandas as pd
+
+
+def update_index(path, update=False):
+    df1 = pd.read_parquet(path)
+    if isinstance(df1.index, pd.RangeIndex):
+        print(df1.index.is_monotonic_increasing, df1.index.is_monotonic_decreasing)
+        return
+    if pd.api.types.is_integer_dtype(df1.index.dtype) and df1.index.name is None:
+        print(df1.head(2))
+        print(df1.index.is_monotonic_increasing, df1.index.is_monotonic_decreasing)
+        if update:
+            df2 = df1.reset_index(drop=True)
+            del df1
+            df2.to_parquet(path, compression='zstd')
+
+
+import pathlib
+
+# 修改路径观察指定文件夹
+files = pathlib.Path(r"F:\data\jqresearch\get_extras_stock_is_st").glob("*.parquet")
+# 先观察数据是否要改，然后改成True进行更新
+update = False
+
+for i, file in enumerate(files):
+    print(file)
+    update_index(file, update=update)
+    if not update:
+        if i > 20:
+            break
diff --git a/examples/preprocessing/merge_jqresearch.py b/examples/preprocessing/merge_jqresearch.py
@@ -19,29 +19,28 @@
 
 def get_paths(root):
     return [
-        (rf'{root}\get_extras_stock_is_st', False),
-        (rf'{root}\get_industry_stock', False),
-        (rf'{root}\get_price_stock_factor', False),
-        (rf'{root}\get_price_stock_daily', False),
+        rf'{root}\get_extras_stock_is_st',
+        rf'{root}\get_industry_stock',
+        rf'{root}\get_price_stock_factor',
+        rf'{root}\get_price_stock_daily',
 
-        # ignore_index=True 表示合并时丢弃索引，因为索引不含有效信息
-        (rf'{root}\get_fundamentals_balance', True),
-        (rf'{root}\get_fundamentals_cash_flow', True),
-        (rf'{root}\get_fundamentals_income', True),
-        (rf'{root}\get_fundamentals_indicator', True),
-        (rf'{root}\get_fundamentals_valuation', True),
-        (rf'{root}\get_STK_XR_XD', True),
-        (rf'{root}\get_STK_BALANCE_SHEET', True),
-        (rf'{root}\get_STK_CASHFLOW_STATEMENT', True),
-        (rf'{root}\get_STK_INCOME_STATEMENT', True),
+        rf'{root}\get_fundamentals_balance',
+        rf'{root}\get_fundamentals_cash_flow',
+        rf'{root}\get_fundamentals_income',
+        rf'{root}\get_fundamentals_indicator',
+        rf'{root}\get_fundamentals_valuation',
+        rf'{root}\get_STK_XR_XD',
+        rf'{root}\get_STK_BALANCE_SHEET',
+        rf'{root}\get_STK_CASHFLOW_STATEMENT',
+        rf'{root}\get_STK_INCOME_STATEMENT',
 
-        (rf'{root}\get_index_weights\000016.XSHG', False),
-        (rf'{root}\get_index_weights\000300.XSHG', False),
-        (rf'{root}\get_index_weights\000852.XSHG', False),
-        (rf'{root}\get_index_weights\000905.XSHG', False),
+        rf'{root}\get_index_weights\000016.XSHG',
+        rf'{root}\get_index_weights\000300.XSHG',
+        rf'{root}\get_index_weights\000852.XSHG',
+        rf'{root}\get_index_weights\000905.XSHG',
 
-        (rf'{root}\get_price_futures_daily', False),
-        (rf'{root}\get_dominant_futures', False),
+        rf'{root}\get_price_futures_daily',
+        rf'{root}\get_dominant_futures',
     ]
 
 
@@ -53,12 +52,12 @@ def main():
 
     paths1 = get_paths(PATH_INPUT1)
     paths2 = get_paths(PATH_OUTPUT)
-    for (path1, _), (path2, _) in zip(paths1, paths2):
+    for path1, path2 in zip(paths1, paths2):
         logger.info('=' * 60, )
         path1 = pathlib.Path(path1)
         path2 = pathlib.Path(path2)
         files = path_groupby_date(path1, path2)
-        merge_files_dict(files, ignore_index=_, delete_src=False)
+        merge_files_dict(files, delete_src=False)
         remove_sub_range(path2)
 
 
diff --git a/examples/preprocessing/step1.py b/examples/preprocessing/step1.py
@@ -29,7 +29,6 @@ def step1(ROOT) -> pl.DataFrame:
     df1 = (
         pl.read_parquet(PATH_INPUT1, use_pyarrow=True)
         # .with_columns(pl.col('paused').cast(pl.Boolean))
-        .drop(['__index_level_0__'])
         .sort(by=['code', 'time'])
     )
 
@@ -56,7 +55,7 @@ def step1(ROOT) -> pl.DataFrame:
     df5 = (
         pl.read_parquet(PATH_INPUT5, use_pyarrow=True)
         .rename({'day': 'time'})
-        .drop(['id', 'pubDate', '__index_level_0__'])
+        .drop(['id', 'pubDate'])
         .with_columns(pl.col('time').str.strptime(pl.Datetime, "%Y-%m-%d"))
     )
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.3.1"`
	`1`	`+__version__ = "0.3.2"`
Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,6 @@ def step1(ROOT) -> pl.DataFrame:`
`29`	`29`	`df1 = (`
`30`	`30`	`pl.read_parquet(PATH_INPUT1, use_pyarrow=True)`
`31`	`31`	`# .with_columns(pl.col('paused').cast(pl.Boolean))`
`32`		`- .drop(['__index_level_0__'])`
`33`	`32`	`.sort(by=['code', 'time'])`
`34`	`33`	`)`
`35`	`34`
`@@ -56,7 +55,7 @@ def step1(ROOT) -> pl.DataFrame:`
`56`	`55`	`df5 = (`
`57`	`56`	`pl.read_parquet(PATH_INPUT5, use_pyarrow=True)`
`58`	`57`	`.rename({'day': 'time'})`
`59`		`- .drop(['id', 'pubDate', '__index_level_0__'])`
	`58`	`+ .drop(['id', 'pubDate'])`
`60`	`59`	`.with_columns(pl.col('time').str.strptime(pl.Datetime, "%Y-%m-%d"))`
`61`	`60`	`)`
`62`	`61`