Closed
Changes from 26 commits
Commits
31 commits
8a57bc8
V0.1.1 (#58)
tusharchou Oct 30, 2024
d1c0894
0.1.1 BigQuery Release (#46)
tusharchou Oct 30, 2024
72ab669
pypi release through github actions (#60)
tusharchou Oct 30, 2024
0757da8
Create manual.yml
tusharchou Oct 30, 2024
cd2cbbe
Update publish.yml (#62)
tusharchou Oct 30, 2024
80c8286
Update publish.yml (#63)
tusharchou Oct 30, 2024
48efe67
Update pyproject.toml (#64)
tusharchou Oct 30, 2024
d1dbf62
Update publish.yml (#65)
tusharchou Oct 30, 2024
7464490
Brmhastra patch 1 (#67)
brmhastra Oct 30, 2024
773058f
Brmhastra patch 1 (#68)
brmhastra Oct 30, 2024
d3e26b7
Brmhastra patch 1 (#69)
brmhastra Oct 30, 2024
840223e
GitHub workflow for pypi release (#70)
mrutunjay-kinagi Oct 30, 2024
e39db41
GitHub workflow for pypi release (#71)
mrutunjay-kinagi Oct 30, 2024
8ab9414
Update publish.yml (#72)
mrutunjay-kinagi Oct 30, 2024
11143b4
Update publish.yml (#73)
mrutunjay-kinagi Oct 30, 2024
c9d1101
Release v1.1 changes (#74)
mrutunjay-kinagi Oct 31, 2024
c3f2aa1
Github action setup for release v1.1 (#76)
mrutunjay-kinagi Oct 31, 2024
bf783b4
Update publish.yml
redpheonixx Nov 1, 2024
9cb0189
Merge pull request #77 from tusharchou/redpheonixx-patch-1
redpheonixx Nov 1, 2024
81eab88
Merge pull request #61 from tusharchou/tusharchou-patch-4
redpheonixx Nov 1, 2024
00f5009
Update publish.yml (#78)
redpheonixx Nov 2, 2024
28d5787
Update publish.yml
redpheonixx Nov 2, 2024
5735908
Merge pull request #79 from tusharchou/redpheonixx-patch-3
redpheonixx Nov 2, 2024
66b31b0
Update publish.yml (#80)
redpheonixx Nov 3, 2024
6716cfd
Update pyproject.toml (#81)
redpheonixx Nov 4, 2024
f8c994a
Release v1.1 fix (#84)
mrutunjay-kinagi Nov 5, 2024
4980033
0.1.1 Pytest Added for BigQuery Source (#86)
mrutunjay-kinagi Nov 9, 2024
0e7e61d
fix(deps): Stabilize build dependencies and configuration
tusharchou Jul 17, 2025
ea64269
Merge pull request #95 from tusharchou/fix/stabilize-dependencies
redpheonixx Jul 17, 2025
0c4c40f
Add restaurant_data_mart_PRD.md for managing restaurant data with LDP…
sankalpmodi94 Jul 20, 2025
09755aa
feat(dx): Add Makefile and setup guide (#96)
tusharchou Jul 27, 2025
32 changes: 32 additions & 0 deletions .github/workflows/manual.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# This is a basic workflow that is manually triggered

name: Manual workflow

# Controls when the action will run. Workflow runs when manually triggered using the UI
# or API.
on:
workflow_dispatch:
# Inputs the workflow accepts.
inputs:
name:
# Friendly description to be shown in the UI instead of 'name'
description: 'Person to greet'
# Default value if no value is explicitly provided
default: 'World'
# Input has to be provided for the workflow to run
required: true
# The data type of the input
type: string

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "greet"
greet:
# The type of runner that the job will run on
runs-on: ubuntu-latest

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Runs a single command using the runners shell
- name: Send greeting
run: echo "Hello ${{ inputs.name }}"
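The `workflow_dispatch` trigger above can also be fired from the GitHub REST API rather than the UI, by POSTing to the workflow's `/dispatches` endpoint. A minimal sketch, assuming the repo slug and a token in `GITHUB_TOKEN` (the helper names here are illustrative):

```python
import json
import urllib.request


def dispatch_payload(name: str = "World") -> dict:
    # Request body for POST /repos/{owner}/{repo}/actions/workflows/{file}/dispatches:
    # the branch to run on, plus the workflow's declared inputs.
    return {"ref": "main", "inputs": {"name": name}}


def trigger_manual_workflow(owner: str, repo: str, token: str, name: str = "World") -> None:
    url = f"https://api.github.com/repos/{owner}/{repo}/actions/workflows/manual.yml/dispatches"
    req = urllib.request.Request(
        url,
        data=json.dumps(dispatch_payload(name)).encode(),
        headers={
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github+json",
        },
        method="POST",
    )
    urllib.request.urlopen(req)  # the API returns 204 No Content on success
```

The same payload shape works for the `publish.yml` workflow below, since it also declares `workflow_dispatch`.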
42 changes: 42 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,42 @@
name: Upload Python Package to PyPI when a Release is Created

on:
release:
types: [created]
workflow_dispatch:

jobs:
pypi-publish:
name: Publish release to PyPI
runs-on: ubuntu-latest
environment:
name: production
url: https://pypi.org/p/local-data-platform
permissions:
id-token: write

steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.x"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel poetry==1.8
cd local-data-platform
poetry install

- name: Build package
run: |
cd local-data-platform
poetry build

- name: Publish package to PyPI
uses: pypa/gh-action-pypi-publish@v1.11.0
with:
packages-dir: local-data-platform/dist/

3 changes: 3 additions & 0 deletions local-data-platform/README.md
@@ -0,0 +1,3 @@
# Local Data Platform
Local Data Platform is a Python library that uses open source tools to orchestrate data platform operations locally for development and testing. <br/>
It provides solutions for every stage from ingestion to reporting, so you can build a data pipeline locally, test it, and easily scale it up to the cloud.
2 changes: 1 addition & 1 deletion local-data-platform/local_data_platform/__init__.py
@@ -39,7 +39,7 @@ class Table(Base):

def __init__(self, name: str, path: Path = os.getcwd()):
self.name = name
self.path = path
self.path = os.getcwd()+path
Review comment (Collaborator):
Why is os.getcwd() used twice, once as the default argument and again in the concatenation? Please check.
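The reviewer's concern is easy to reproduce: when `os.getcwd()` is both the argument default and prepended again in the body, the working directory appears twice in the resulting path. A minimal sketch with hypothetical stand-in classes (not the library's actual `Table`):

```python
import os


class BuggyTable:
    def __init__(self, name: str, path: str = None):
        if path is None:
            path = os.getcwd()          # default is already an absolute path
        self.name = name
        self.path = os.getcwd() + path  # prepends the cwd a second time


class FixedTable:
    def __init__(self, name: str, path: str = None):
        self.name = name
        # os.path.join handles a relative path correctly and avoids doubling
        self.path = os.path.join(os.getcwd(), path or "")
```

With no argument, `BuggyTable("t").path` is the working directory concatenated with itself, while `FixedTable("t", "data")` yields the expected `<cwd>/data`.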


def get(self):
raise TableNotFound(
@@ -30,10 +30,10 @@ class LocalIcebergCatalog(SqlCatalog):

def __init__(self, name: str, path: str, *args, **kwargs):
self.name = name
self.uri = f"sqlite:///{path}/{name}.db"
self.uri = f"sqlite:///{path}/{name}_catalog.db"
self.warehouse = f"file://{path}"
try:
logger.error(f"Initializing LocalIcebergCatalog with {self.uri}")
logger.info(f"Initializing LocalIcebergCatalog with {self.uri}")
super().__init__(*args, **kwargs, **self.__dict__)
except Exception as e:
logger.error(f"Failed to initialize LocalIcebergCatalog {e}")
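The rename to `{name}_catalog.db` keeps the catalog's SQLite file visibly distinct from anything else written under the warehouse path. A sketch of just the URI construction, assuming `path` is a warehouse directory (helper names are illustrative):

```python
def catalog_uri(name: str, path: str) -> str:
    # '<name>_catalog.db' marks the file as catalog metadata, not table data
    return f"sqlite:///{path}/{name}_catalog.db"


def warehouse_uri(path: str) -> str:
    # pyiceberg's SqlCatalog takes the warehouse as a file:// URI
    return f"file://{path}"
```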
@@ -17,17 +17,18 @@ def __init__(self, *args, **kwargs):

def get(self) -> Table:
if not os.path.isfile(self.path):
logger.error(f"This path {self.path} is invalid")
raise FileNotFoundError

logger.info(
f"""
reading CSV from {self.path}
"""
)
df = csv.read_table(self.path)
df = csv.read_csv(self.path)
logger.info(
f"""
df type {type(df)}
df type {type(df)} len {len(df)}
"""
)
if df is not None:
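The guard added above (log the bad path, then raise `FileNotFoundError`) is the standard pattern for a file-backed source. A stdlib sketch of the same behaviour, using Python's built-in `csv` module in place of `pyarrow.csv` purely for illustration:

```python
import csv
import logging
import os

logger = logging.getLogger(__name__)


def read_csv_rows(path: str) -> list:
    # Fail fast with a logged error when the path does not exist
    if not os.path.isfile(path):
        logger.error("This path %s is invalid", path)
        raise FileNotFoundError(path)
    with open(path, newline="") as f:
        rows = list(csv.reader(f))
    logger.info("read %d rows from %s", len(rows), path)
    return rows
```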
28 changes: 19 additions & 9 deletions local-data-platform/local_data_platform/format/iceberg/__init__.py
@@ -4,6 +4,10 @@
from pyiceberg.typedef import Identifier
from pyarrow import Table
from local_data_platform.logger import log
import os

os.environ['PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE'] = 'true'


logger = log()

@@ -21,35 +25,41 @@ class Iceberg(Format):
Methods:
__init__(catalog: str, *args, **kwargs):
Initializes the Iceberg instance with the given catalog and metadata.

put(df: Table) -> Table:
Writes the given data frame to the Iceberg table.

get():
Fetches data from the Iceberg table and returns it as an Arrow table.
"""
def __init__(self, catalog: str, *args, **kwargs):
logger.info(f"Iceberg catalog : {catalog}")
self.catalog_identifier = catalog["identifier"]
def __init__(self, config: str, *args, **kwargs):
logger.info(f"Iceberg catalog : {config}")
self.catalog_identifier = config["identifier"]
self.catalog = LocalIcebergCatalog(
self.catalog_identifier, path=catalog["warehouse_path"]
self.catalog_identifier, path=config["warehouse_path"]
)
self.catalog.create_namespace(self.catalog_identifier)
if not self.catalog._namespace_exists(self.catalog_identifier):
self.catalog.create_namespace(self.catalog_identifier)
self.identifier = f"{self.catalog_identifier}.{kwargs['name']}"
self.metadata = kwargs
logger.info(f"Iceberg created with catalog namespace {self.catalog_identifier}")
logger.info(f"Iceberg initialised with identifier {self.identifier}")
super().__init__(*args, **kwargs)

def put(self, df: Table) -> Table:
if not df:
logger.error("While doing put in Iceberg Format we got df as None")
raise Exception("Got Table as None")
logger.info(f"self.identifier {self.identifier}")
logger.info(
f"""
Writing {len(df)} to Iceberg Table {self.identifier}
Writing type {type(df)} of length {len(df)} to Iceberg Table {self.identifier}
"""
)
table = self.catalog.create_table_if_not_exists(
identifier=self.identifier, schema=df.schema
identifier=self.identifier, schema=df.schema, properties={
"downcast-ns-timestamp-to-us-on-write": True # Set property for downcasting
}
)
table.append(df)
return table
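Both the environment variable and the new table property address the same mismatch: pyarrow timestamps default to nanosecond precision, while Iceberg stores microseconds. The downcast itself is integer division; a pure-Python sketch (no pyiceberg required):

```python
NS_PER_US = 1_000


def downcast_ns_to_us(ts_ns: int) -> int:
    # Iceberg timestamps are microsecond precision; drop sub-microsecond digits
    return ts_ns // NS_PER_US


# 2023-11-14T22:13:20.123456789 UTC as epoch nanoseconds
ns = 1_700_000_000_123_456_789
us = downcast_ns_to_us(ns)  # 1_700_000_000_123_456
```

The write fails without the property because the downcast is lossy, so pyarrow asks for explicit opt-in.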
@@ -16,14 +16,14 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def get(self) -> Table:
if not os.path.isfile(self.path):
raise FileNotFoundError

logger.info(
f"""
reading parquet from {self.path}
"""
)
if not os.path.isfile(self.path):
raise FileNotFoundError

df = parquet.read_table(self.path)
logger.info(
f"""
5 changes: 1 addition & 4 deletions local-data-platform/local_data_platform/logger.py
@@ -8,10 +8,7 @@
def log():
basicConfig(level=INFO, format=
"""
%(filename)s - %(funcName)s
- %(asctime)s - %(name)s
- %(levelname)s
- message : %(message)s
%(message)s
"""
)

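The `log()` change above trims the format string down to the bare message. The effect of a format string can be checked directly with `logging.Formatter`; a sketch comparing the old and new styles (record fields are illustrative):

```python
import logging


def format_record(fmt: str, msg: str) -> str:
    # Build a LogRecord by hand and render it with the given format string
    record = logging.LogRecord(
        name="local_data_platform", level=logging.INFO,
        pathname="logger.py", lineno=1, msg=msg, args=None, exc_info=None,
    )
    return logging.Formatter(fmt).format(record)


# old, verbose style vs the new, message-only style
verbose = format_record("%(name)s - %(levelname)s - message : %(message)s", "hello")
quiet = format_record("%(message)s", "hello")
# verbose → 'local_data_platform - INFO - message : hello'; quiet → 'hello'
```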
2 changes: 0 additions & 2 deletions local-data-platform/local_data_platform/pipeline/__init__.py
@@ -11,7 +11,5 @@ class Pipeline(Flow):

def __init__(self, config: Config, *args, **kwargs):
self.config = config
# self.source = Source(**config.metadata['source'])
# self.target = Target(**config.metadata['target'])
super().__init__(*args, **kwargs)

@@ -36,7 +36,7 @@ def __init__(self, config: Config, *args, **kwargs):
self.source = config.metadata["source"]
self.target = config.metadata["target"]
self.target = CSV(name=self.target["name"], path=self.target["path"])
self.source = Iceberg(name=self.source["name"], catalog=self.source["catalog"])
self.source = Iceberg(name=self.source["name"], config=self.source["catalog"])
logger.info(
f"""
IcebergToCSV initialised with
@@ -1,11 +1,16 @@
from local_data_platform.pipeline import Pipeline
from local_data_platform.logger import log

logger = log()

class Ingestion(Pipeline):

class Ingestion(Pipeline):

def extract(self):
self.source.get()
logger.info("Extracting Source in ingestion pipeline")
return self.source.get()

def load(self):
self.target.put(self.extract())
df = self.extract()
logger.info(f"Loading Source {len(df)} in ingestion pipeline")
self.target.put(df)
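The corrected `Ingestion` above is a thin extract/load pair: the key fix is that `extract()` now returns the source data instead of discarding it. A minimal self-contained sketch with hypothetical stand-in source and target classes:

```python
class ListSource:
    """Stand-in for a real source; get() returns its rows."""
    def __init__(self, rows):
        self.rows = rows

    def get(self):
        return self.rows


class ListTarget:
    """Stand-in for a real target; put() stores what it receives."""
    def __init__(self):
        self.rows = None

    def put(self, df):
        self.rows = df


class Ingestion:
    def __init__(self, source, target):
        self.source = source
        self.target = target

    def extract(self):
        # mirrors the PR fix: return the source data rather than dropping it
        return self.source.get()

    def load(self):
        df = self.extract()
        self.target.put(df)


pipeline = Ingestion(ListSource([1, 2, 3]), ListTarget())
pipeline.load()
# pipeline.target.rows is now [1, 2, 3]
```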
@@ -5,6 +5,7 @@

logger = log()


class CSVToIceberg(Ingestion):
"""
CSVToIceberg is a class responsible for ingesting data from a CSV source and
@@ -39,7 +40,7 @@ def __init__(self, config, *args, **kwargs):
)
self.target = Iceberg(
name=self.target['name'],
catalog=self.target['catalog']
config=self.target['catalog']
)
logger.info(
f"""
@@ -34,7 +34,7 @@ def __init__(self, config: Config, *args, **kwargs):
)
self.target = Iceberg(
name=self.target['name'],
catalog=self.target['catalog']
config=self.target['catalog']
)
logger.info(
f"""
@@ -2,7 +2,7 @@
from pathlib import Path
import json
from local_data_platform import Credentials
from local_data_platform import logger
from local_data_platform.logger import log

logger = log()

4 changes: 2 additions & 2 deletions local-data-platform/pyproject.toml
@@ -1,7 +1,7 @@
[tool.poetry]
name = "local-data-platform"
version = "0.1.0"
description = ""
version = "0.1.1"
description = "Python library for iceberg lake house on your local"
authors = ["Tushar Choudhary <151359025+tusharchou@users.noreply.github.com>"]
readme = "README.md"

@@ -0,0 +1,60 @@
from local_data_platform.pipeline.ingestion.bigquery_to_csv import BigQueryToCSV
from local_data_platform import Config, SupportedFormat, SupportedEngine
from local_data_platform.store.source.json import Json
from local_data_platform.exceptions import PipelineNotFound
import os
from local_data_platform.logger import log


logger = log()


def get_near_transaction_dataset(
dataset="near_transactions",
config_path="/real_world_use_cases/near_data_lake/config/ingestion.json",
):
"""
Retrieves and processes the near transaction dataset based on the provided configuration.

Args:
dataset (str): The name of the dataset to be processed. Defaults to "near_transactions".
config_path (str): The path to the configuration file. Defaults to "/real_world_use_cases/near_data_lake/config/ingestion.json".

Raises:
PipelineNotFound: If the source and target formats specified in the configuration are not supported.

Returns:
None
"""

config = Config(
**Json(
name=dataset,
path=config_path,
).get()
)
print(config)
logger.info(
f"""
We are using the following dictionary as the configuration to generate a monthly trust metric
{config}
"""
)
if (
config.metadata["source"]["format"] == SupportedFormat.JSON.value
and config.metadata["target"]["format"] == SupportedFormat.CSV.value
and config.metadata["source"]["engine"] == SupportedEngine.BIGQUERY.value
):
data_loader = BigQueryToCSV(config=config)
data_loader.load()
else:
raise PipelineNotFound(
f"""
source {config.metadata['source']['format']}
to target {config.metadata['target']['format']}
pipeline is not supported yet
"""
)


get_near_transaction_dataset()
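The source/target/engine check above is a simple dispatch guard: only the JSON-over-BigQuery to CSV combination is wired up, and anything else raises `PipelineNotFound`. A condensed sketch with stand-in enums (names mirror the snippet; the enum values are assumptions):

```python
from enum import Enum


class SupportedFormat(Enum):
    JSON = "json"
    CSV = "csv"


class SupportedEngine(Enum):
    BIGQUERY = "bigquery"


class PipelineNotFound(Exception):
    pass


def select_pipeline(metadata: dict) -> str:
    src, tgt = metadata["source"], metadata["target"]
    # Only one source/target/engine combination is currently supported
    if (
        src["format"] == SupportedFormat.JSON.value
        and tgt["format"] == SupportedFormat.CSV.value
        and src["engine"] == SupportedEngine.BIGQUERY.value
    ):
        return "BigQueryToCSV"
    raise PipelineNotFound(
        f"source {src['format']} to target {tgt['format']} pipeline is not supported yet"
    )
```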