core.py
"""
BaoStock数据缓存系统 - 核心模块
包含基础类定义和数据库管理功能
"""
import os
import sqlite3
import threading
import time
import logging
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Callable
import pandas as pd
import numpy as np
import pickle
import gzip
import json
from concurrent.futures import ThreadPoolExecutor
from collections import OrderedDict
import baostock as bs
# Configure logging; create the log directory up front so FileHandler can open the file
os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/baostock_cache.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
class BaoStockDataCache:
    """
    Core class of the BaoStock data cache system.
    """
    def __init__(self, data_root: Optional[str] = None,
                 cache_size: int = 1000,
                 max_workers: int = 4):
        """
        Initialize the BaoStock data cache system.
        Args:
            data_root: root directory for all data
            cache_size: maximum number of entries in the in-memory cache
            max_workers: maximum number of worker threads
        """
        logger.info("Initializing BaoStock data cache system")
        # Fall back to the default path when no data_root is given
        if data_root is None:
            data_root = "e:/code/baostock_data"
        self.data_root = Path(data_root)
        self.cache_size = cache_size
        self.max_workers = max_workers
        # Locks protecting shared mutable state
        self.memory_cache_lock = threading.RLock()
        self.performance_metrics_lock = threading.RLock()
        # Performance counters
        self.performance_metrics = {
            'cache_hits': 0,
            'cache_misses': 0,
            'api_calls': 0,
            'source_timing': {},
            'request_counts': {}
        }
        # Data sources, in lookup priority order
        self.data_sources = ['memory', 'database', 'file', 'baostock_api']
        # Cache expiry times (in days)
        self.cache_expiry = {
            'daily': 1,        # daily bars: 1 day
            'weekly': 7,       # weekly bars: 7 days
            'monthly': 30,     # monthly bars: 30 days
            'minutely': 0.04,  # minute bars: about 1 hour
            'basic': 30        # basic info: 30 days
        }
        # Set up the directory layout
        self._init_directories()
        # Set up the cache layers
        self._init_optimized_cache()
        # Set up the thread pool
        self.thread_pool = ThreadPoolExecutor(max_workers=max_workers)
        logger.info(f"Cache system initialized, cache size: {self.cache_size}")
    def _init_directories(self):
        """
        Create the on-disk directory layout.
        """
        directories = [
            'daily', 'weekly', 'monthly', 'minutely',  # K-line data
            'basic', 'metadata',                       # basic info and metadata
            'cache', 'logs'                            # cache files and logs
        ]
        for dir_name in directories:
            dir_path = self.data_root / dir_name
            dir_path.mkdir(parents=True, exist_ok=True)
            logger.info(f"Initialized directory: {dir_path}")
    def _init_optimized_cache(self):
        """
        Initialize the optimized cache layers.
        """
        # In-memory cache: an OrderedDict used as an LRU cache
        with self.memory_cache_lock:
            self.memory_cache = OrderedDict()
        # SQLite database layer
        self._init_database()
        logger.info(f"Memory and database cache layers ready, cache size: {self.cache_size}")
    def _init_database(self):
        """
        Initialize the SQLite database, creating tables and indexes as needed.
        """
        db_path = self.data_root / 'quant_data.db'
        logger.info(f"Initializing database: {db_path}")
        # The DDL below is idempotent (IF NOT EXISTS), so run it on every
        # startup; this also repairs an existing database file with missing tables.
        conn = sqlite3.connect(str(db_path))
        cursor = conn.cursor()
        # Daily K-line table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock_daily (
                code TEXT,
                date TEXT,
                open REAL,
                high REAL,
                low REAL,
                close REAL,
                volume REAL,
                amount REAL,
                PRIMARY KEY (code, date)
            )
        ''')
        # Basic stock information table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock_basic (
                code TEXT PRIMARY KEY,
                code_name TEXT,
                ipoDate TEXT,
                outDate TEXT,
                type TEXT,
                status TEXT
            )
        ''')
        # Indexes for the common query patterns
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_stock_daily_date ON stock_daily (date)')
        cursor.execute('CREATE INDEX IF NOT EXISTS idx_stock_daily_code ON stock_daily (code)')
        conn.commit()
        conn.close()
        logger.info("Database initialization complete")
    def _get_cache_key(self, data_type, symbol, start_date, end_date, **kwargs):
        """
        Build a cache key.
        Args:
            data_type: data type (e.g. 'daily', 'basic')
            symbol: stock code
            start_date: start date
            end_date: end date
            **kwargs: extra parameters
        Returns:
            The cache key string.
        """
        key_parts = [
            data_type,
            symbol,
            start_date or 'None',
            end_date or 'None'
        ]
        # Append the extra parameters in a deterministic order
        for k, v in sorted(kwargs.items()):
            if v is not None:
                key_parts.append(f"{k}_{v}")
        # Replace colons with underscores so the key is safe to use as a
        # Windows file name (time components in kwargs may contain colons)
        return "_".join(key_parts).replace(':', '_')
    def _get_from_cache(self, cache_key):
        """
        Fetch data from the cache layers (memory, then database, then file).
        Args:
            cache_key: the cache key
        Returns:
            The cached data, or None on a miss.
        """
        # Check the in-memory cache first
        with self.memory_cache_lock:
            if cache_key in self.memory_cache:
                # Move to the front (most recently used)
                self.memory_cache.move_to_end(cache_key, last=False)
                with self.performance_metrics_lock:
                    self.performance_metrics['cache_hits'] += 1
                logger.info(f"Memory cache hit: {cache_key}")
                return self.memory_cache[cache_key]
        # Check the database cache
        db_data = self._get_from_database(cache_key)
        if db_data is not None:
            with self.performance_metrics_lock:
                self.performance_metrics['cache_hits'] += 1
            logger.info(f"Database cache hit: {cache_key}")
            # Promote into the memory cache
            self._set_to_memory_cache(cache_key, db_data)
            return db_data
        # Check the file cache, honouring the per-type expiry policy
        file_data = self._get_from_file(cache_key)
        if file_data is not None and not self._is_cache_expired(cache_key):
            with self.performance_metrics_lock:
                self.performance_metrics['cache_hits'] += 1
            logger.info(f"File cache hit: {cache_key}")
            # Promote into the memory cache and the database
            self._set_to_memory_cache(cache_key, file_data)
            self._set_to_database(cache_key, file_data)
            return file_data
        # Cache miss
        with self.performance_metrics_lock:
            self.performance_metrics['cache_misses'] += 1
        logger.info(f"Cache miss: {cache_key}")
        return None
    def _set_to_memory_cache(self, cache_key, data):
        """
        Store data in the in-memory LRU cache.
        Args:
            cache_key: the cache key
            data: the data to cache
        """
        with self.memory_cache_lock:
            # Evict the least recently used entry when the cache is full
            if len(self.memory_cache) >= self.cache_size:
                self.memory_cache.popitem(last=True)
            # Insert, then move to the front (most recently used)
            self.memory_cache[cache_key] = data
            self.memory_cache.move_to_end(cache_key, last=False)
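    # Eviction order, illustrated with hypothetical keys and cache_size=2:
    # insert 'a', insert 'b', read 'a', insert 'c' -> 'b' is evicted,
    # because reads move a key to the front and eviction pops from the back.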
    def _get_from_database(self, cache_key):
        """
        Fetch data from the database layer.
        Args:
            cache_key: the cache key
        Returns:
            The data, or None.
        """
        try:
            # Parse the cache key (data_type, symbol, start_date, end_date, ...)
            parts = cache_key.split('_')
            if len(parts) < 4:
                return None
            data_type = parts[0]
            symbol = parts[1]
            start_date = parts[2]
            end_date = parts[3]
            if start_date == 'None':
                start_date = ''
            if end_date == 'None':
                end_date = ''
            # Dispatch to the table that matches the data type
            if data_type == 'daily':
                return self._query_daily_from_db(symbol, start_date, end_date)
            elif data_type == 'basic':
                return self._query_basic_from_db(symbol)
        except Exception as e:
            logger.error(f"Failed to fetch data from database: {e}")
        return None
    def _set_to_database(self, cache_key, data):
        """
        Write data to the database layer.
        Args:
            cache_key: the cache key
            data: the data to write
        """
        try:
            # Parse the cache key
            parts = cache_key.split('_')
            if len(parts) < 4:
                return
            data_type = parts[0]
            symbol = parts[1]
            # Dispatch to the table that matches the data type
            if data_type == 'daily' and isinstance(data, pd.DataFrame):
                self._save_daily_to_db(symbol, data)
            elif data_type == 'basic' and isinstance(data, pd.DataFrame):
                self._save_basic_to_db(symbol, data)
        except Exception as e:
            logger.error(f"Failed to write to database: {e}")
    def _query_daily_from_db(self, symbol, start_date, end_date):
        """
        Query daily bars from the database.
        """
        try:
            db_path = self.data_root / 'quant_data.db'
            conn = sqlite3.connect(str(db_path))
            try:
                query = "SELECT * FROM stock_daily WHERE code = ?"
                params = [symbol]
                if start_date:
                    query += " AND date >= ?"
                    params.append(start_date)
                if end_date:
                    query += " AND date <= ?"
                    params.append(end_date)
                query += " ORDER BY date"
                df = pd.read_sql_query(query, conn, params=params)
            finally:
                conn.close()
            if not df.empty:
                return df
        except Exception as e:
            logger.error(f"Database query failed: {e}")
        return None
    def _save_daily_to_db(self, symbol, df):
        """
        Save daily bars to the database.
        """
        try:
            if df is None or df.empty:
                return
            def to_float(value):
                # Values may be strings (possibly empty) depending on the source
                try:
                    return float(value)
                except (TypeError, ValueError):
                    return None
            db_path = self.data_root / 'quant_data.db'
            conn = sqlite3.connect(str(db_path))
            cursor = conn.cursor()
            rows = []
            for _, row in df.iterrows():
                rows.append((
                    symbol,
                    str(row.get('date')),
                    to_float(row.get('open')),
                    to_float(row.get('high')),
                    to_float(row.get('low')),
                    to_float(row.get('close')),
                    to_float(row.get('volume')),
                    to_float(row.get('amount'))
                ))
            cursor.executemany(
                'INSERT OR REPLACE INTO stock_daily (code, date, open, high, low, close, volume, amount) VALUES (?,?,?,?,?,?,?,?)',
                rows
            )
            conn.commit()
            conn.close()
            logger.info(f"Wrote {len(rows)} daily rows for {symbol}")
        except Exception as e:
            logger.error(f"Failed to write to database: {e}")
    def _query_basic_from_db(self, symbol):
        """
        Query basic stock information from the database.
        """
        try:
            db_path = self.data_root / 'quant_data.db'
            conn = sqlite3.connect(str(db_path))
            try:
                query = "SELECT * FROM stock_basic"
                params = []
                if symbol != 'all':
                    query += " WHERE code = ?"
                    params.append(symbol)
                df = pd.read_sql_query(query, conn, params=params)
            finally:
                conn.close()
            if not df.empty:
                return df
        except Exception as e:
            logger.error(f"Failed to query basic info: {e}")
        return None
    def _save_basic_to_db(self, symbol, df):
        """
        Save basic stock information to the database.
        """
        try:
            if df is None or df.empty:
                return
            db_path = self.data_root / 'quant_data.db'
            conn = sqlite3.connect(str(db_path))
            cursor = conn.cursor()
            rows = []
            for _, row in df.iterrows():
                rows.append((
                    str(row.get('code')),
                    str(row.get('code_name')),
                    str(row.get('ipoDate')),
                    str(row.get('outDate')),
                    str(row.get('type')),
                    str(row.get('status'))
                ))
            cursor.executemany(
                'INSERT OR REPLACE INTO stock_basic (code, code_name, ipoDate, outDate, type, status) VALUES (?,?,?,?,?,?)',
                rows
            )
            conn.commit()
            conn.close()
            logger.info(f"Wrote {len(rows)} basic-info rows")
        except Exception as e:
            logger.error(f"Failed to write basic info: {e}")
    def _get_from_file(self, cache_key):
        """
        Fetch data from the file cache.
        """
        try:
            # Build the cache file path
            file_path = self.data_root / 'cache' / f"{cache_key}.pkl.gz"
            if not file_path.exists():
                return None
            # Read and decompress the payload
            with gzip.open(file_path, 'rb') as f:
                data = pickle.load(f)
            return data
        except Exception as e:
            logger.error(f"Failed to read from file cache: {e}")
            return None
    def _set_to_file(self, cache_key, data):
        """
        Write data to the file cache.
        """
        try:
            # Build the cache file path
            file_path = self.data_root / 'cache' / f"{cache_key}.pkl.gz"
            file_path.parent.mkdir(parents=True, exist_ok=True)
            # Compress and write the payload
            with gzip.open(file_path, 'wb') as f:
                pickle.dump(data, f)
        except Exception as e:
            logger.error(f"Failed to write to file cache: {e}")
    def _is_cache_expired(self, cache_key):
        """
        Check whether a cached file has expired.
        """
        try:
            parts = cache_key.split('_')
            if not parts:
                return True
            data_type = parts[0]
            expiry_days = self.cache_expiry.get(data_type, 1)
            # Compare against the file's modification time; use fractional
            # days so sub-day expiries (e.g. minute bars) work correctly
            file_path = self.data_root / 'cache' / f"{cache_key}.pkl.gz"
            if file_path.exists():
                modified_time = datetime.fromtimestamp(file_path.stat().st_mtime)
                age_days = (datetime.now() - modified_time).total_seconds() / 86400
                if age_days < expiry_days:
                    return False
        except Exception as e:
            logger.error(f"Failed to check cache expiry: {e}")
        return True
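    # Example: a 'minutely' cache file written two hours ago has
    # age_days ~= 0.083, which exceeds its 0.04-day (~1 hour) expiry,
    # so it is reported as expired.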
    def clear_cache(self):
        """
        Clear all cache layers.
        """
        # Clear the in-memory cache
        with self.memory_cache_lock:
            self.memory_cache.clear()
        # Clear the file cache
        cache_dir = self.data_root / 'cache'
        if cache_dir.exists():
            for file in cache_dir.glob('*.pkl.gz'):
                try:
                    file.unlink()
                except Exception as e:
                    logger.error(f"Failed to delete cache file: {e}")
        logger.info("Cache cleared")
    def get_performance_metrics(self):
        """
        Return a shallow copy of the performance counters.
        """
        with self.performance_metrics_lock:
            return self.performance_metrics.copy()
    def login(self):
        """
        Log in to BaoStock.
        """
        lg = bs.login()
        logger.info(f"BaoStock login: {lg.error_code} - {lg.error_msg}")
        return lg
    def logout(self):
        """
        Log out of BaoStock.
        """
        lg = bs.logout()
        logger.info(f"BaoStock logout: {lg.error_code} - {lg.error_msg}")
        return lg
    def _check_data_coverage(self, data: pd.DataFrame, start_date: str, end_date: str, date_column: str = 'date'):
        """
        Check whether cached data covers the requested date range.
        Args:
            data: the cached data
            start_date: requested start date
            end_date: requested end date
            date_column: name of the date column (default 'date')
        Returns:
            (fully_covered, missing_dates)
            - fully_covered: whether the request is fully covered
            - missing_dates: list of missing dates (computed over calendar
              days, so non-trading days are reported as missing too)
        """
        if data.empty or date_column not in data.columns:
            return False, []
        try:
            # Work on a copy so the caller's data is untouched
            data_copy = data.copy()
            # Normalize the date column to datetime
            if not pd.api.types.is_datetime64_any_dtype(data_copy[date_column]):
                data_copy[date_column] = pd.to_datetime(data_copy[date_column], errors='coerce')
            # Drop rows with unparseable dates
            data_copy = data_copy[data_copy[date_column].notna()]
            if data_copy.empty:
                return False, []
            # Date range of the cached data
            cached_start = data_copy[date_column].min()
            cached_end = data_copy[date_column].max()
            # Parse the requested dates (either may be empty)
            request_start = pd.to_datetime(start_date, errors='coerce') if start_date else None
            request_end = pd.to_datetime(end_date, errors='coerce') if end_date else None
            if request_start is not None and request_end is not None:
                # Full coverage check
                if cached_start <= request_start and cached_end >= request_end:
                    logger.debug(f"Cache fully covers the requested range: {cached_start} ~ {cached_end}")
                    return True, []
                # Enumerate the requested calendar days
                requested_dates = pd.date_range(start=request_start, end=request_end, freq='D')
                # Collect the days missing from the cache
                cached_dates = set(data_copy[date_column].dt.date)
                missing_dates = []
                for date in requested_dates:
                    if date.date() not in cached_dates:
                        missing_dates.append(date.strftime('%Y-%m-%d'))
                if missing_dates:
                    logger.debug(f"Cache partially covers the request, {len(missing_dates)} dates missing")
                    logger.debug(f"  cached range: {cached_start} ~ {cached_end}")
                    logger.debug(f"  requested range: {request_start} ~ {request_end}")
                    logger.debug(f"  missing dates: {missing_dates[:5]}{'...' if len(missing_dates) > 5 else ''}")
                    return False, missing_dates
            return False, []
        except Exception as e:
            logger.error(f"Failed to check data coverage: {e}")
            return False, []
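    # Example (hypothetical): with cached rows for 2023-01-02..2023-01-06 and
    # a request for 2023-01-01..2023-01-06, the check returns
    # (False, ['2023-01-01']); weekends inside a requested range would also be
    # listed, since coverage is computed over calendar days.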
    def _merge_dataframes(self, df1: pd.DataFrame, df2: pd.DataFrame, date_column: str = 'date'):
        """
        Merge two DataFrames, de-duplicating and sorting by date.
        Args:
            df1: the first DataFrame
            df2: the second DataFrame
            date_column: name of the date column (default 'date')
        Returns:
            The merged DataFrame.
        """
        if df1.empty:
            return df2.copy() if not df2.empty else pd.DataFrame()
        if df2.empty:
            return df1.copy()
        try:
            # Work on copies so the callers' data is untouched
            df1_clean = df1.copy()
            df2_clean = df2.copy()
            # Drop all-NA columns
            df1_clean = df1_clean.dropna(axis=1, how='all')
            df2_clean = df2_clean.dropna(axis=1, how='all')
            # Restrict both frames to their common columns, in the same order
            common_columns = sorted(set(df1_clean.columns) & set(df2_clean.columns))
            if common_columns:
                df1_clean = df1_clean[common_columns]
                df2_clean = df2_clean[common_columns]
            # Concatenate
            merged = pd.concat([df1_clean, df2_clean], ignore_index=True)
            # De-duplicate on the date column, keeping the newer row
            if date_column in merged.columns:
                # Make sure the date column is datetime-typed
                try:
                    if not pd.api.types.is_datetime64_any_dtype(merged[date_column]):
                        merged[date_column] = pd.to_datetime(merged[date_column])
                except Exception as e:
                    logger.debug(f"Failed to convert date column: {e}")
                merged = merged.drop_duplicates(subset=[date_column], keep='last')
                # Sort by date
                merged = merged.sort_values(date_column).reset_index(drop=True)
            logger.debug(f"Merged DataFrames: df1({len(df1)} rows) + df2({len(df2)} rows) = merged({len(merged)} rows)")
            return merged
        except Exception as e:
            logger.error(f"Failed to merge DataFrames: {e}")
            # Fall back to whichever input is non-empty
            return df1 if not df1.empty else df2
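# Minimal usage sketch (an assumption, not part of the original module): it
# exercises only pieces defined above -- construction, cache-key generation,
# a cache lookup, the performance counters, and cache clearing -- against a
# temporary data_root so nothing is written to the default path.
if __name__ == '__main__':
    import tempfile
    with tempfile.TemporaryDirectory() as tmp:
        cache = BaoStockDataCache(data_root=tmp, cache_size=100, max_workers=2)
        key = cache._get_cache_key('daily', 'sh.600000', '2023-01-01', '2023-12-31')
        print(f"cache key: {key}")
        # A lookup on an empty cache records a miss and returns None
        assert cache._get_from_cache(key) is None
        print(f"metrics: {cache.get_performance_metrics()}")
        cache.clear_cache()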