Merge pull request #7 from andrusha/type-hints

andrusha · web-flow · commit 2d87e9d82aea · 2022-11-05T11:14:00.000+01:00
Type hints and comments
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "pandas-maxminddb"
-version = "0.2.0"
+version = "0.2.1"
 authors = ["Andrew Korzhuev <korzhuev@andrusha.me>"]
 edition = "2021"
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "pandas-maxminddb"
-version = "0.2.0"
+version = "0.2.1"
 description = "Fast geolocation library for Pandas Dataframes, built on Numpy C-FFI"
 requires-python = ">=3.8"
 license = {text = "MIT"}
diff --git a/python/pandas_maxminddb/__init__.py b/python/pandas_maxminddb/__init__.py
@@ -4,11 +4,20 @@
 
 from .pandas_maxminddb import Reader, ReaderMem, ReaderMmap, mmdb_geolocate
 
-__all__ = ["open_database", "GeoAccessor", "ReaderMem", "ReaderMmap"]
+__all__ = ["open_database", "GeoAccessor", "Reader", "ReaderMem", "ReaderMmap"]
 
 
 @contextmanager
 def open_database(mmdb_path: str, mmap=False) -> Reader:
+    """
+    To manage the lifetime of the reader object yourself,
+     instantiate ReaderMem / ReaderMmap directly instead
+
+    :param mmdb_path: path to the maxmind db file
+    :param mmap: use memory mapping or not, useful for big files and few lookups
+    :return: corresponding context-managed Reader, which can be used in a `with` statement
+    """
+
     if mmap:
         yield ReaderMmap(mmdb_path)
     else:
@@ -17,13 +26,22 @@ def open_database(mmdb_path: str, mmap=False) -> Reader:
 
 @pd.api.extensions.register_dataframe_accessor("geo")
 class GeoAccessor:
+    """
+    Defines a DataFrame extension, accessible as `some_df.geo.geolocate`
+    """
+
     def __init__(self, pandas_obj: pd.DataFrame):
         self._obj = pandas_obj
 
     def geolocate(
         self, ip_column_name: str, reader: Reader, geo_columns: list = None, parallel=False, parallel_chunk_size=1024
     ) -> pd.DataFrame:
         """
+        :param ip_column_name: name of the dataframe column containing IPs; malformed IPs are ignored
+        :param reader: one of the reader classes
+        :param geo_columns: list of columns to lookup
+        :param parallel: if lookups should be done in parallel (uses all the available cores)
+        :param parallel_chunk_size: size of the chunks into which the ip list is split for parallel processing
         :return: appends geolocation information based on the given IP address column
         """
         if geo_columns is None:
diff --git a/python/pandas_maxminddb/pandas_maxminddb.pyi b/python/pandas_maxminddb/pandas_maxminddb.pyi
@@ -0,0 +1,43 @@
+from typing import Dict, List
+
+import numpy
+
+class Reader:
+    """
+    Abstract superclass of all the readers; it cannot be instantiated
+    and should only be used as a type hint
+    """
+
+class ReaderMem(Reader):
+    """
+    Loads the MMDB into memory; required when parallel processing is used
+    """
+
+    def __init__(self, mmdb_path: str) -> None:
+        """
+        :param mmdb_path: path to maxmind db file
+        """
+
+class ReaderMmap(Reader):
+    """
+    Uses memory map to read the db, so only the records you're accessing are read from disk.
+    Useful when memory is limited and few lookups are made
+    """
+
+    def __init__(self, mmdb_path: str) -> None:
+        """
+        :param mmdb_path: path to maxmind db file
+        """
+
+def mmdb_geolocate(
+    ips: numpy.ndarray, reader: Reader, columns: List[str], parallel: bool, parallel_chunk_size: int
+) -> Dict[str, numpy.ndarray]:
+    """
+
+    :param ips: ndarray of ip strings
+    :param reader: one of the reader subclasses
+    :param columns: list of columns to fetch
+    :param parallel: if processing should be done in parallel
+    :param parallel_chunk_size: chunk size for ips to be split for parallel processing
+    :return: dict mapping each requested column to an ndarray of lookup results
+    """
diff --git a/python/pandas_maxminddb/py.typed b/python/pandas_maxminddb/py.typed