Fix id bug hotfix

ty-kim7 · ty-kim7 · commit cf2cb36a88c1 · 2025-04-03T15:09:45.000+09:00
diff --git a/collector/spot-dataset/aws/lambda/post_processing_data/lambda_function.py b/collector/spot-dataset/aws/lambda/post_processing_data/lambda_function.py
@@ -1,7 +1,9 @@
 # ------ import module ------
 from datetime import datetime, timezone, timedelta
 import boto3
-import pickle, gzip, json
+import pickle
+import gzip
+import json
 import pandas as pd
 import numpy as np
 import os
@@ -11,9 +13,11 @@
 from upload_data import upload_timestream, update_latest, save_raw, update_query_selector, update_config
 from compare_data import compare, compare_max_instance
 
+
 class FirstRunError(Exception):
     pass
 
+
 def main():
     print("Start Lambda Function")
     start_time = datetime.now(timezone.utc)
@@ -25,15 +29,15 @@ def main():
     TIMESTAMP = start_time.replace(minute=((start_time.minute // 10) * 10), second=0) - timedelta(minutes=10)
     S3_DIR_NAME = TIMESTAMP.strftime('%Y/%m/%d')
     S3_OBJECT_PREFIX = TIMESTAMP.strftime('%H-%M')
-    
+
     SPS_FILE_PREFIX = f"{BUCKET_FILE_PATH}/sps/{S3_DIR_NAME}"
     SPOTIF_FILE_NAME = f"{BUCKET_FILE_PATH}/spot_if/{S3_DIR_NAME}/{S3_OBJECT_PREFIX}_spot_if.pkl.gz"
     ONDEMAND_PRICE_FILE_NAME = f"{BUCKET_FILE_PATH}/ondemand_price/{S3_DIR_NAME}/ondemand_price.pkl.gz"
     SPOTPRICE_FILE_NAME = f"{BUCKET_FILE_PATH}/spot_price/{S3_DIR_NAME}/{S3_OBJECT_PREFIX}_spot_price.pkl.gz"
 
     # ------ Set time data ------
     time_value = TIMESTAMP.strftime("%Y-%m-%d %H:%M:%S")
-    
+
     try:
         # ------ Create Boto3 Session ------
         s3 = boto3.resource("s3")
@@ -61,7 +65,7 @@ def main():
         spotinfo_df = spotinfo_df[['InstanceType', 'Region', 'IF']]
         ondemand_price_df = ondemand_price_df[['InstanceType', 'Region', 'OndemandPrice']]
         spot_price_df = spot_price_df[['InstanceType', 'AZ', 'SpotPrice']]
-        
+
         # ------ Formatting Data ------
         spot_price_df['SpotPrice'] = spot_price_df['SpotPrice'].astype('float').round(5)
         ondemand_price_df['OndemandPrice'] = ondemand_price_df['OndemandPrice'].astype('float').round(5)
@@ -84,7 +88,7 @@ def main():
         merge_df['T2'] = merge_df['T2'].fillna(0).astype('int')
 
         merge_df = merge_df.drop(merge_df[(merge_df['AZ'].isna()) | (merge_df['Region'].isna()) | (merge_df['InstanceType'].isna())].index)
-        
+
         merge_df.reset_index(drop=True, inplace=True)
         merge_df['Time'] = time_value
 
@@ -101,11 +105,11 @@ def main():
             # Verify that the data is in the old format
             columns_to_check = ["T3", "T2"]
             existing_columns = [col for col in columns_to_check if col in previous_df.columns]
-            
+
             if len(existing_columns) == 0:
                 raise FirstRunError("Can't load the previous df from s3 bucket or First run since changing the collector")
             else:
-                previous_df = previous_df.drop(columns=['Id'])
+                previous_df = previous_df.drop(columns=['id'])
         except FirstRunError as e:
             # If system is first time uploading data, make a new one and upload it to TSDB
             update_latest(merge_df, TIMESTAMP)
@@ -117,21 +121,21 @@ def main():
 
         end_time = datetime.now(timezone.utc)
         print(f"Checking time of previous json file is {(end_time - start_time).total_seconds() * 1000 / 60000:.2f} min")
-        
+
         start_time = datetime.now(timezone.utc)
-    
+
         # ------ Compare T3 and T2 Data ------
         current_df = compare_max_instance(previous_df, merge_df, target_capacity)
 
         # ------ Upload Merge DF to s3 Bucket ------
         update_latest(current_df, TIMESTAMP)
         save_raw(current_df, TIMESTAMP)
-            
+
         # ------ Compare All Data ------
         workload_cols = ['InstanceType', 'Region', 'AZ']
         feature_cols = ['SPS', 'T3', 'T2', 'IF', 'SpotPrice', 'OndemandPrice']
 
-        changed_df, removed_df = compare(previous_df, current_df, workload_cols, feature_cols) # compare previous_df and current_df to extract changed rows)
+        changed_df, removed_df = compare(previous_df, current_df, workload_cols, feature_cols)  # compare previous_df and current_df to extract changed rows)
         end_time = datetime.now(timezone.utc)
         print(f"Compare time is {(end_time - start_time).total_seconds() * 1000 / 60000:.2f} min")
 
@@ -152,13 +156,14 @@ def main():
         print(e)
         raise
 
+
 def lambda_handler(event, context):
     start_time = datetime.now(timezone.utc)
     main()
     end_time = datetime.now(timezone.utc)
     print(f"Running time is {(end_time - start_time).total_seconds() * 1000 / 60000:.2f} min")
     return "Process completed successfully"
 
+
 if __name__ == "__main__":
     lambda_handler({}, {})
-