
Commit f625172

add export proto to parquet example (#110)
* add export proto to parquet example
* address comments
* lower python to 3.8
* set up own poetry
* address comments
* modify dependencies
* update mypy to 0.981
* remove redundant comma
1 parent 4303a9b commit f625172

10 files changed: +1143 −555 lines

README.md (+1 −1)

```diff
@@ -52,6 +52,7 @@
 * [hello_signal](hello/hello_signal.py) - Send signals to a workflow.
 <!-- Keep this list in alphabetical order -->
 * [activity_worker](activity_worker) - Use Python activities from a workflow in another language.
+* [cloud_export_to_parquet](cloud_export_to_parquet) - Set up a schedule workflow to process exported files on an hourly basis.
 * [custom_converter](custom_converter) - Use a custom payload converter to handle custom types.
 * [custom_decorator](custom_decorator) - Custom decorator to auto-heartbeat a long-running activity.
 * [dsl](dsl) - DSL workflow that executes steps defined in a YAML file.
@@ -68,7 +69,6 @@
 * [worker_specific_task_queues](worker_specific_task_queues) - Use unique task queues to ensure activities run on specific workers.
 * [worker_versioning](worker_versioning) - Use the Worker Versioning feature to more easily version your workflows & other code.
 
-
 ## Test
 
 Running the tests requires `poe` to be installed.
```

cloud_export_to_parquet/README.md (new file, +23 lines)

# Cloud Export to Parquet sample

This sample workflow converts exported workflow history files from proto to Parquet format. The workflow runs on an hourly schedule.

Please make sure your Python version is 3.9 or above. For this sample, run:

```bash
poetry install --with cloud_export_to_parquet
```

Before you start, please modify the workflow input in `create_schedule.py` with your S3 bucket and namespace. Also make sure you have the right AWS permissions set up in your environment to allow this workflow to read from and write to your S3 bucket.

To run, first see [README.md](../README.md) for prerequisites. Then, run the following from this directory to start the worker:

```bash
poetry run python run_worker.py
```

This will start the worker. Then, in another terminal, run the following to create the schedule:

```bash
poetry run python create_schedule.py
```

The workflow should convert the exported files in your input S3 bucket to Parquet files in your specified output location.
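
The only values you need to touch are the fields of `ProtoToParquetWorkflowInput` in `create_schedule.py`. A minimal sketch of what that edit looks like; the bucket names and namespace below are placeholders, not real resources:

```python
# Placeholder values for illustration only; substitute your own buckets and namespace.
from cloud_export_to_parquet.workflows import ProtoToParquetWorkflowInput

wf_input = ProtoToParquetWorkflowInput(
    num_delay_hour=2,                      # only read files at least 2 hours old
    export_s3_bucket="my-export-bucket",   # bucket the cloud export writes proto files to
    namespace="my-namespace.example",      # namespace, used as the export path prefix
    output_s3_bucket="my-parquet-bucket",  # bucket to receive the Parquet output
)
```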

cloud_export_to_parquet/__init__.py

Whitespace-only changes.
cloud_export_to_parquet/create_schedule.py (new file, +64 lines)

```python
import asyncio
import traceback
from datetime import datetime, timedelta

from temporalio.client import (
    Client,
    Schedule,
    ScheduleActionStartWorkflow,
    ScheduleIntervalSpec,
    ScheduleSpec,
    WorkflowFailureError,
)

from cloud_export_to_parquet.workflows import (
    ProtoToParquet,
    ProtoToParquetWorkflowInput,
)


async def main() -> None:
    """Create an hourly schedule that runs the proto-to-parquet workflow."""
    # Create client connected to server at the given address
    client = await Client.connect("localhost:7233")
    # TODO: update s3_bucket and namespace to the actual use case
    wf_input = ProtoToParquetWorkflowInput(
        num_delay_hour=2,
        export_s3_bucket="test-input-bucket",
        namespace="test.namespace",
        output_s3_bucket="test-output-bucket",
    )

    # Alternative: start the workflow once instead of on a schedule
    # try:
    #     await client.start_workflow(
    #         ProtoToParquet.run,
    #         wf_input,
    #         id=f"proto-to-parquet-{datetime.now()}",
    #         task_queue="DATA_TRANSFORMATION_TASK_QUEUE",
    #     )
    # except WorkflowFailureError:
    #     print("Got exception: ", traceback.format_exc())

    # Create the schedule
    try:
        await client.create_schedule(
            "hourly-proto-to-parquet-wf-schedule",
            Schedule(
                action=ScheduleActionStartWorkflow(
                    ProtoToParquet.run,
                    wf_input,
                    id=f"proto-to-parquet-{datetime.now()}",
                    task_queue="DATA_TRANSFORMATION_TASK_QUEUE",
                ),
                spec=ScheduleSpec(
                    intervals=[ScheduleIntervalSpec(every=timedelta(hours=1))]
                ),
            ),
        )
    except WorkflowFailureError:
        print("Got exception: ", traceback.format_exc())


if __name__ == "__main__":
    asyncio.run(main())
```
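
Note: once created, the schedule can also be managed from Python through a schedule handle. A minimal sketch, assuming the schedule ID above and a local server; `manage_schedule` is just an illustrative helper name, not part of this commit:

```python
import asyncio

from temporalio.client import Client


async def manage_schedule() -> None:
    client = await Client.connect("localhost:7233")
    handle = client.get_schedule_handle("hourly-proto-to-parquet-wf-schedule")

    # Kick off one run immediately instead of waiting for the next hour.
    await handle.trigger()

    # Pause the schedule, or delete it entirely when done experimenting.
    await handle.pause()
    # await handle.delete()


if __name__ == "__main__":
    asyncio.run(manage_schedule())
```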
cloud_export_to_parquet/data_trans_activities.py (new file, +123 lines)

```python
import json
import uuid
from dataclasses import dataclass
from typing import List

import boto3
import pandas as pd
import temporalio.api.export.v1 as export
from google.protobuf.json_format import MessageToJson
from temporalio import activity


@dataclass
class GetObjectKeysActivityInput:
    bucket: str
    path: str


@dataclass
class DataTransAndLandActivityInput:
    export_s3_bucket: str
    object_key: str
    output_s3_bucket: str
    write_path: str


@activity.defn
def get_object_keys(activity_input: GetObjectKeysActivityInput) -> List[str]:
    """List object keys under the given bucket and path."""
    object_keys = []
    s3 = boto3.client("s3")
    response = s3.list_objects_v2(
        Bucket=activity_input.bucket, Prefix=activity_input.path
    )
    for obj in response.get("Contents", []):
        object_keys.append(obj["Key"])
    if len(object_keys) == 0:
        raise FileNotFoundError(
            f"No files found in {activity_input.bucket}/{activity_input.path}"
        )

    return object_keys


@activity.defn
def data_trans_and_land(activity_input: DataTransAndLandActivityInput) -> str:
    """Convert a proto export file to Parquet and save it to S3."""
    key = activity_input.object_key
    data = get_data_from_object_key(activity_input.export_s3_bucket, key)
    activity.logger.info("Convert proto to parquet for file: %s", key)
    parquet_data = convert_proto_to_parquet_flatten(data)
    activity.logger.info("Finish transformation for file: %s", key)
    return save_to_sink(
        parquet_data, activity_input.output_s3_bucket, activity_input.write_path
    )


def get_data_from_object_key(
    bucket_name: str, object_key: str
) -> export.WorkflowExecutions:
    """Fetch an object by key and parse it as a WorkflowExecutions proto."""
    v = export.WorkflowExecutions()

    s3 = boto3.client("s3")
    try:
        data = s3.get_object(Bucket=bucket_name, Key=object_key)["Body"].read()
    except Exception as e:
        activity.logger.error(f"Error reading object: {e}")
        raise e
    v.ParseFromString(data)
    return v


def convert_proto_to_parquet_flatten(wfs: export.WorkflowExecutions) -> pd.DataFrame:
    """Convert the proto data to a flattened DataFrame ready for Parquet."""
    dfs = []
    for wf in wfs.items:
        start_attributes = wf.history.events[
            0
        ].workflow_execution_started_event_attributes
        histories = wf.history
        json_str = MessageToJson(histories)
        row = {
            "WorkflowID": start_attributes.workflow_id,
            "RunID": start_attributes.original_execution_run_id,
            "Histories": json.loads(json_str),
        }
        dfs.append(pd.DataFrame([row]))
    df = pd.concat(dfs, ignore_index=True)
    rows_flatten = []
    for _, row in df.iterrows():
        wf_histories_raw = row["Histories"]["events"]
        workflow_id = row["WorkflowID"]
        run_id = row["RunID"]
        for history_event in wf_histories_raw:
            row_flatten = pd.json_normalize(history_event, sep="_")
            skip_name = ["payloads", "."]
            columns_to_drop = [
                col for col in row_flatten.columns for skip in skip_name if skip in col
            ]
            row_flatten.drop(columns_to_drop, axis=1, inplace=True)
            row_flatten.insert(0, "WorkflowId", workflow_id)
            row_flatten.insert(1, "RunId", run_id)
            rows_flatten.append(row_flatten)
    df_flatten = pd.concat(rows_flatten, ignore_index=True)
    return df_flatten


def save_to_sink(data: pd.DataFrame, s3_bucket: str, write_path: str) -> str:
    """Write the DataFrame to the S3 bucket as a Snappy-compressed Parquet file."""
    write_bytes = data.to_parquet(None, compression="snappy", index=False)
    uuid_name = uuid.uuid1()
    file_name = f"{uuid_name}.parquet"
    activity.logger.info("Writing to S3 bucket: %s", file_name)

    s3 = boto3.client("s3")
    try:
        key = f"{write_path}/{file_name}"
        s3.put_object(Bucket=s3_bucket, Key=key, Body=write_bytes)
        return key
    except Exception as e:
        activity.logger.error(f"Error saving to sink: {e}")
        raise e
```
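
Note: because these activities are plain synchronous functions, they can be unit tested without a Temporal server by stubbing the boto3 client. A minimal sketch using `unittest.mock`; the test name and fake keys are illustrative, not part of this commit:

```python
from unittest import mock

from cloud_export_to_parquet.data_trans_activities import (
    GetObjectKeysActivityInput,
    get_object_keys,
)


def test_get_object_keys_returns_matching_keys() -> None:
    fake_s3 = mock.Mock()
    fake_s3.list_objects_v2.return_value = {
        "Contents": [{"Key": "export/file1.proto"}, {"Key": "export/file2.proto"}]
    }
    # Patch boto3.client as referenced inside the activities module.
    with mock.patch(
        "cloud_export_to_parquet.data_trans_activities.boto3.client",
        return_value=fake_s3,
    ):
        keys = get_object_keys(GetObjectKeysActivityInput(bucket="b", path="export/"))
    assert keys == ["export/file1.proto", "export/file2.proto"]
```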

cloud_export_to_parquet/run_worker.py (new file, +38 lines)

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor

from temporalio.client import Client
from temporalio.worker import Worker
from temporalio.worker.workflow_sandbox import (
    SandboxedWorkflowRunner,
    SandboxRestrictions,
)

from cloud_export_to_parquet.data_trans_activities import (
    data_trans_and_land,
    get_object_keys,
)
from cloud_export_to_parquet.workflows import ProtoToParquet


async def main() -> None:
    """Main worker function."""
    # Create client connected to server at the given address
    client = await Client.connect("localhost:7233")

    # Run the worker
    worker: Worker = Worker(
        client,
        task_queue="DATA_TRANSFORMATION_TASK_QUEUE",
        workflows=[ProtoToParquet],
        activities=[get_object_keys, data_trans_and_land],
        workflow_runner=SandboxedWorkflowRunner(
            restrictions=SandboxRestrictions.default.with_passthrough_modules("boto3")
        ),
        # Synchronous activities need an executor to run on
        activity_executor=ThreadPoolExecutor(100),
    )
    await worker.run()


if __name__ == "__main__":
    asyncio.run(main())
```

cloud_export_to_parquet/workflows.py (new file, +74 lines)

```python
from datetime import timedelta

from temporalio import workflow
from temporalio.common import RetryPolicy
from temporalio.exceptions import ActivityError

with workflow.unsafe.imports_passed_through():
    from cloud_export_to_parquet.data_trans_activities import (
        DataTransAndLandActivityInput,
        data_trans_and_land,
        get_object_keys,
        GetObjectKeysActivityInput,
    )
    from dataclasses import dataclass


@dataclass
class ProtoToParquetWorkflowInput:
    num_delay_hour: int
    export_s3_bucket: str
    namespace: str
    output_s3_bucket: str


@workflow.defn
class ProtoToParquet:
    """Proto to parquet workflow."""

    @workflow.run
    async def run(self, workflow_input: ProtoToParquetWorkflowInput) -> str:
        """Run proto to parquet workflow."""
        retry_policy = RetryPolicy(
            maximum_attempts=10, maximum_interval=timedelta(seconds=5)
        )

        # Read from the export S3 bucket, giving at least a 2 hour delay to ensure the file has been uploaded
        read_time = workflow.now() - timedelta(hours=workflow_input.num_delay_hour)
        common_path = f"{workflow_input.namespace}/{read_time.year}/{read_time.month:02}/{read_time.day:02}/{read_time.hour:02}/00"
        path = f"temporal-workflow-history/export/{common_path}"
        get_object_keys_input = GetObjectKeysActivityInput(
            workflow_input.export_s3_bucket, path
        )

        # Read input file keys
        object_keys_output = await workflow.execute_activity(
            get_object_keys,
            get_object_keys_input,
            start_to_close_timeout=timedelta(minutes=5),
            retry_policy=retry_policy,
        )

        write_path = f"temporal-workflow-history/parquet/{common_path}"

        try:
            # Could create a list of coroutine objects to process files in parallel
            for key in object_keys_output:
                data_trans_and_land_input = DataTransAndLandActivityInput(
                    workflow_input.export_s3_bucket,
                    key,
                    workflow_input.output_s3_bucket,
                    write_path,
                )
                # Convert proto to parquet and save to S3
                await workflow.execute_activity(
                    data_trans_and_land,
                    data_trans_and_land_input,
                    start_to_close_timeout=timedelta(minutes=15),
                    retry_policy=retry_policy,
                )
        except ActivityError as output_err:
            workflow.logger.error(f"Data transformation failed: {output_err}")
            raise output_err

        return write_path
```
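
Note: the workflow itself can be exercised end to end without S3 by registering stub activities in the SDK's time-skipping test environment. A minimal sketch, assuming an async test runner such as `pytest-asyncio`; the stub activities and test name are illustrative, not part of this commit:

```python
import uuid
from typing import List

from temporalio import activity
from temporalio.testing import WorkflowEnvironment
from temporalio.worker import Worker

from cloud_export_to_parquet.data_trans_activities import (
    DataTransAndLandActivityInput,
    GetObjectKeysActivityInput,
)
from cloud_export_to_parquet.workflows import ProtoToParquet, ProtoToParquetWorkflowInput


# Stub activities registered under the real activity names, so no S3 access is needed.
@activity.defn(name="get_object_keys")
async def get_object_keys_stub(_: GetObjectKeysActivityInput) -> List[str]:
    return ["fake-key.proto"]


@activity.defn(name="data_trans_and_land")
async def data_trans_and_land_stub(_: DataTransAndLandActivityInput) -> str:
    return "fake/write/path"


async def test_proto_to_parquet_workflow() -> None:
    async with await WorkflowEnvironment.start_time_skipping() as env:
        task_queue = f"tq-{uuid.uuid4()}"
        async with Worker(
            env.client,
            task_queue=task_queue,
            workflows=[ProtoToParquet],
            activities=[get_object_keys_stub, data_trans_and_land_stub],
        ):
            result = await env.client.execute_workflow(
                ProtoToParquet.run,
                ProtoToParquetWorkflowInput(
                    num_delay_hour=2,
                    export_s3_bucket="in-bucket",
                    namespace="test.namespace",
                    output_s3_bucket="out-bucket",
                ),
                id=f"wf-{uuid.uuid4()}",
                task_queue=task_queue,
            )
            # The workflow returns the Parquet write path it computed.
            assert result.startswith("temporal-workflow-history/parquet/")
```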
