Improve reading the timezone from earnings dates and only fall back to the general symbol timezone when decoding fails

mreiche · mreiche · commit 3e35ae766d02 · 2024-08-18T19:25:15.000+02:00
diff --git a/requirements.txt b/requirements.txt
@@ -9,3 +9,5 @@ frozendict>=2.3.4
 beautifulsoup4>=4.11.1
 html5lib>=1.1
 peewee>=3.16.2
+requests-cache==1.2.1
+requests-ratelimiter==0.7.0
diff --git a/tests/test_ticker.py b/tests/test_ticker.py
@@ -8,18 +8,18 @@
    python -m unittest tests.ticker.TestTicker
 
 """
-import pandas as pd
-
-from .context import yfinance as yf
-from .context import session_gbl
-from yfinance.exceptions import YFChartError, YFInvalidPeriodError, YFNotImplementedError, YFTickerMissingError, YFTzMissingError
-
 
 import unittest
-import requests_cache
 from typing import Union, Any, get_args, _GenericAlias
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
 
+import pandas as pd
+import requests_cache
+
+from yfinance.exceptions import YFChartError, YFInvalidPeriodError, YFNotImplementedError, YFTickerMissingError, YFTzMissingError
+from .context import session_gbl
+from .context import yfinance as yf
+
 ticker_attributes = (
     ("major_holders", pd.DataFrame),
     ("institutional_holders", pd.DataFrame),
@@ -289,6 +289,7 @@ def test_earnings_dates(self):
         data = self.ticker.earnings_dates
         self.assertIsInstance(data, pd.DataFrame, "data has wrong type")
         self.assertFalse(data.empty, "data is empty")
+        self.assertEqual(data.index.tz.zone, "America/New_York")
 
     def test_earnings_dates_with_limit(self):
         # use ticker with lots of historic earnings
@@ -298,6 +299,7 @@ def test_earnings_dates_with_limit(self):
         self.assertIsInstance(data, pd.DataFrame, "data has wrong type")
         self.assertFalse(data.empty, "data is empty")
         self.assertEqual(len(data), limit, "Wrong number or rows")
+        self.assertEqual(data.index[0].tz.zone, "America/New_York")
 
         data_cached = ticker.get_earnings_dates(limit=limit)
         self.assertIs(data, data_cached, "data not cached")
@@ -323,6 +325,15 @@ def test_earnings_dates_with_limit(self):
     #     data_cached = self.ticker.earnings_trend
     #     self.assertIs(data, data_cached, "data not cached")
 
+    def test_ticker_has_tz(self):
+        test_data = {"AMZN": "America/New_York", "LHA.DE": "Europe/Berlin", "6758.T": "Asia/Tokyo"}
+        for symbol, tz in test_data.items():
+            with self.subTest(f"{symbol}-{tz}"):
+                ticker = yf.Ticker(symbol)
+                data = ticker.get_earnings_dates(limit=1)
+                self.assertIsNotNone(data.index.tz)
+                self.assertEqual(data.index.tz.zone, tz)
+
 
 class TestTickerHolders(unittest.TestCase):
     session = None
diff --git a/yfinance/base.py b/yfinance/base.py
@@ -21,25 +21,26 @@
 
 from __future__ import print_function
 
-from io import StringIO
 import json as _json
+import re
 import warnings
+from io import StringIO
 from typing import Optional, Union
 from urllib.parse import quote as urlencode
 
 import pandas as pd
+import pytz
 import requests
 
 from . import utils, cache
+from .const import _BASE_URL_, _ROOT_URL_
 from .data import YfData
 from .exceptions import YFEarningsDateMissing
 from .scrapers.analysis import Analysis
 from .scrapers.fundamentals import Fundamentals
+from .scrapers.history import PriceHistory
 from .scrapers.holders import Holders
 from .scrapers.quote import Quote, FastInfo
-from .scrapers.history import PriceHistory
-
-from .const import _BASE_URL_, _ROOT_URL_
 
 
 class TickerBase:
@@ -534,6 +535,15 @@ def get_earnings_dates(self, limit=12, proxy=None) -> Optional[pd.DataFrame]:
 
         logger = utils.get_yf_logger()
 
+        ticker_tz = ""
+
+        def get_ticker_tz():
+            nonlocal ticker_tz
+            if ticker_tz == "":
+                self._quote.proxy = proxy or self.proxy
+                ticker_tz = self._get_ticker_tz(proxy=proxy, timeout=30)
+            return ticker_tz
+
         page_size = min(limit, 100)  # YF caps at 100, don't go higher
         page_offset = 0
         dates = None
@@ -589,20 +599,20 @@ def get_earnings_dates(self, limit=12, proxy=None) -> Optional[pd.DataFrame]:
 
         # Parse earnings date string
         cn = "Earnings Date"
-        # - remove AM/PM and timezone from date string
-        tzinfo = dates[cn].str.extract('([AP]M[a-zA-Z]*)$')
-        dates[cn] = dates[cn].replace(' [AP]M[a-zA-Z]*$', '', regex=True)
-        # - split AM/PM from timezone
-        tzinfo = tzinfo[0].str.extract('([AP]M)([a-zA-Z]*)', expand=True)
-        tzinfo.columns = ["AM/PM", "TZ"]
-        # - combine and parse
-        dates[cn] = dates[cn] + ' ' + tzinfo["AM/PM"]
-        dates[cn] = pd.to_datetime(dates[cn], format="%b %d, %Y, %I %p")
-        # - instead of attempting decoding of ambiguous timezone abbreviation, just use 'info':
-        self._quote.proxy = proxy or self.proxy
-        tz = self._get_ticker_tz(proxy=proxy, timeout=30)
-        dates[cn] = dates[cn].dt.tz_localize(tz)
 
+        def map_date(time_str: str):
+            tz_match = re.search('([AP]M)([a-zA-Z]*)$', time_str)
+            tz_str = tz_match.group(2).strip()
+            # - remove only the timezone suffix from the date string (AM/PM is kept for %p parsing)
+            time_str = time_str.replace(tz_str, "")
+            try:
+                tz = pytz.timezone(tz_str)
+            except pytz.UnknownTimeZoneError:
+                tz = get_ticker_tz()
+
+            return pd.to_datetime(time_str, format="%b %d, %Y, %I %p").tz_localize(tz)
+
+        dates[cn] = dates[cn].map(map_date)
         dates = dates.set_index("Earnings Date")
 
         self._earnings_dates[limit] = dates