1+ # ------ import module ------
2+ from datetime import datetime , timezone , timedelta
3+ import boto3
4+ import pickle , gzip , json
5+ import pandas as pd
6+ import numpy as np
7+ import os
8+ import sys
9+
10+ # ------ import user module ------
11+ sys .path .append ("/home/ubuntu/spotlake/utility" )
12+ from slack_msg_sender import send_slack_message
13+ from upload_data import upload_timestream , update_latest , save_raw , update_query_selector , update_config
14+ from compare_data import compare , compare_max_instance
15+
def _elapsed_minutes(started_at):
    """Return the minutes elapsed since *started_at* (a UTC-aware datetime)."""
    return (datetime.now(timezone.utc) - started_at).total_seconds() / 60


def _find_sps_key(s3_client, bucket_name, key_prefix):
    """Return the first S3 key that starts with *key_prefix*.

    The timestamp is part of the prefix so that S3 does the filtering; listing
    the whole day and filtering locally would miss keys beyond the first page
    (list_objects_v2 returns at most 1000 keys per call).

    Raises:
        FileNotFoundError: if no object matches the prefix.
    """
    response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=key_prefix)
    # 'Contents' is absent from the response when nothing matches.
    matches = response.get('Contents', [])
    if not matches:
        raise FileNotFoundError(f"No SPS file found at s3://{bucket_name}/{key_prefix}*")
    return matches[0]['Key']


def _load_pickle_gz(s3, bucket_name, key):
    """Download *key* from the bucket and unpickle its gzip-compressed body."""
    # NOTE(review): pickle.load executes arbitrary code embedded in the payload;
    # this is only acceptable while the bucket contents are fully trusted.
    body = s3.Object(bucket_name, key).get()["Body"]
    with gzip.open(body) as unzipped:
        return pickle.load(unzipped)


def _merge_sources(sps_df, spotinfo_df, ondemand_price_df, spot_price_df, time_value):
    """Outer-join the four source frames and normalise the merged columns."""
    # ------ Create a DF by Selecting Only The Columns Required ------
    # .copy() so the column assignments below never write into a view of the
    # loaded frames (avoids SettingWithCopyWarning).
    sps_df = sps_df[['InstanceType', 'Region', 'AZ', 'SPS', 'T3', 'T2']].copy()
    spotinfo_df = spotinfo_df[['InstanceType', 'Region', 'IF']].copy()
    ondemand_price_df = ondemand_price_df[['InstanceType', 'Region', 'OndemandPrice']].copy()
    spot_price_df = spot_price_df[['InstanceType', 'AZ', 'SpotPrice']].copy()

    # ------ Formatting Data ------
    spot_price_df['SpotPrice'] = spot_price_df['SpotPrice'].astype('float').round(5)
    ondemand_price_df['OndemandPrice'] = ondemand_price_df['OndemandPrice'].astype('float').round(5)

    # ------ Outer Join on the shared columns ------
    merge_df = pd.merge(sps_df, spotinfo_df, how="outer")
    merge_df = pd.merge(merge_df, ondemand_price_df, how="outer")
    merge_df = pd.merge(merge_df, spot_price_df, how="outer")

    # Savings is computed before the -1 placeholders are filled in, so a
    # missing price yields NaN here and then becomes -1 as well.
    merge_df['Savings'] = 100.0 - (merge_df['SpotPrice'] * 100 / merge_df['OndemandPrice'])
    for column in ('Savings', 'SPS', 'SpotPrice', 'OndemandPrice', 'IF'):
        merge_df[column] = merge_df[column].fillna(-1)

    merge_df['Savings'] = merge_df['Savings'].astype('int')
    merge_df['SPS'] = merge_df['SPS'].astype('int')
    merge_df['T3'] = merge_df['T3'].fillna(0).astype('int')
    merge_df['T2'] = merge_df['T2'].fillna(0).astype('int')

    # Rows missing any part of the (InstanceType, Region, AZ) key are dropped.
    incomplete_key = (
        merge_df['AZ'].isna()
        | merge_df['Region'].isna()
        | merge_df['InstanceType'].isna()
    )
    merge_df = merge_df.drop(merge_df[incomplete_key].index)
    merge_df.reset_index(drop=True, inplace=True)
    merge_df['Time'] = time_value
    return merge_df


def process_timestamp(TIMESTAMP, BUCKET_NAME, BUCKET_FILE_PATH):
    """Merge the raw AWS data stored in S3 for one timestamp and publish it.

    Loads the SPS, spot-info, on-demand price and spot price pickles for
    TIMESTAMP, merges them, compares the result with the locally stored
    latest_aws.json snapshot, then calls update_latest and save_raw.

    Args:
        TIMESTAMP: datetime whose date selects the S3 directory (%Y/%m/%d)
            and whose time selects the object prefix (%H-%M).
        BUCKET_NAME: name of the S3 bucket holding the raw data.
        BUCKET_FILE_PATH: key prefix inside the bucket.

    Any exception raised while processing is reported through
    send_slack_message and printed; it is not re-raised, so a caller looping
    over many timestamps keeps going.
    """
    S3_DIR_NAME = TIMESTAMP.strftime('%Y/%m/%d')
    S3_OBJECT_PREFIX = TIMESTAMP.strftime('%H-%M')

    SPS_FILE_PREFIX = f"{BUCKET_FILE_PATH}/sps/{S3_DIR_NAME}"
    SPOTIF_FILE_NAME = f"{BUCKET_FILE_PATH}/spot_if/{S3_DIR_NAME}/{S3_OBJECT_PREFIX}_spot_if.pkl.gz"
    ONDEMAND_PRICE_FILE_NAME = f"{BUCKET_FILE_PATH}/ondemand_price/{S3_DIR_NAME}/ondemand_price.pkl.gz"
    SPOTPRICE_FILE_NAME = f"{BUCKET_FILE_PATH}/spot_price/{S3_DIR_NAME}/{S3_OBJECT_PREFIX}_spot_price.pkl.gz"

    # ------ Set time data ------
    time_value = TIMESTAMP.strftime("%Y-%m-%d %H:%M:%S")
    print(f"Processing timestamp: {time_value}")
    try:
        start_time = datetime.now(timezone.utc)
        # ------ Create Boto3 Session ------
        s3 = boto3.resource("s3")
        s3_client = boto3.client('s3')

        # ------ Find Sps File in S3 ------
        sps_file_name = _find_sps_key(s3_client, BUCKET_NAME, f"{SPS_FILE_PREFIX}/{S3_OBJECT_PREFIX}")
        print(sps_file_name)
        # The third '_'-separated field of the file name (extension removed)
        # is used as the target capacity.
        target_capacity = int(sps_file_name.split('/')[-1].split('_')[2].split('.')[0])

        # ------ Load Data from PKL File in S3 ------
        sps_df = _load_pickle_gz(s3, BUCKET_NAME, sps_file_name)
        spotinfo_df = _load_pickle_gz(s3, BUCKET_NAME, SPOTIF_FILE_NAME.strip())
        ondemand_price_df = _load_pickle_gz(s3, BUCKET_NAME, ONDEMAND_PRICE_FILE_NAME.strip())
        spot_price_df = _load_pickle_gz(s3, BUCKET_NAME, SPOTPRICE_FILE_NAME.strip())

        merge_df = _merge_sources(sps_df, spotinfo_df, ondemand_price_df, spot_price_df, time_value)
        print(f"Merging time is {_elapsed_minutes(start_time):.2f} min")

        # ------ Check The Previous DF File in Local ------
        start_time = datetime.now(timezone.utc)
        filename = '/home/ubuntu/spotlake/utility/manual_merge_aws_rawdata/latest_aws.json'
        with open(filename, 'r') as previous_file:
            previous_df = pd.DataFrame(json.load(previous_file))

        previous_df = previous_df.drop(columns=['id'])
        print(previous_df)
        print(f"Checking time of previous json file is {_elapsed_minutes(start_time):.2f} min")

        start_time = datetime.now(timezone.utc)

        # ------ Compare T3 and T2 Data ------
        current_df = compare_max_instance(previous_df, merge_df, target_capacity)

        # ------ Upload Merge DF to s3 Bucket ------
        update_latest(current_df, TIMESTAMP)
        save_raw(current_df, TIMESTAMP)

        # ------ Compare All Data ------
        workload_cols = ['InstanceType', 'Region', 'AZ']
        feature_cols = ['SPS', 'T3', 'T2', 'IF', 'SpotPrice', 'OndemandPrice']

        # Compare previous_df and current_df to extract changed/removed rows.
        changed_df, removed_df = compare(previous_df, current_df, workload_cols, feature_cols)
        print(f"Compare time is {_elapsed_minutes(start_time):.2f} min")

        # ------ Upload TSDB (disabled in this script) ------
        # upload_timestream(changed_df, TIMESTAMP)
        # upload_timestream(removed_df, TIMESTAMP)

    except Exception as e:
        # Best-effort per timestamp: report and continue. The timestamp and
        # exception type are included so a failure can be traced to its run.
        error_message = f"[{time_value}] {type(e).__name__}: {e}"
        send_slack_message(error_message)
        print(error_message)
126+
def main():
    """Run the manual merge for every 10-minute tick of the fixed date range.

    Ticks run from 2025-02-15 00:10 UTC through 2025-04-04 00:00 UTC
    (inclusive). Each tick is floored to a 10-minute boundary and shifted back
    by 10 minutes before being passed to process_timestamp.
    """
    print("Start Lambda Function")
    # Slack notice (Korean): the manual data CSV merge has started.
    send_slack_message("수동 데이터 CSV 병합이 시작되었습니다!")
    started_at = datetime.now(timezone.utc)

    # ------ Set Constants ------
    bucket = "spotlake"
    raw_prefix = "rawdata/aws"
    step = timedelta(minutes=10)
    first_tick = datetime(2025, 2, 15, 0, 10, 0, tzinfo=timezone.utc)
    last_tick = datetime(2025, 4, 4, 0, 0, 0, tzinfo=timezone.utc)

    tick = first_tick
    while not tick > last_tick:
        floored_minute = (tick.minute // 10) * 10
        floored_tick = tick.replace(minute=floored_minute, second=0)
        process_timestamp(floored_tick - step, bucket, raw_prefix)
        tick = tick + step

    elapsed_seconds = (datetime.now(timezone.utc) - started_at).total_seconds()
    print(f"Total running time is {elapsed_seconds * 1000 / 60000:.2f} min")
147+
def lambda_handler(event, context):
    """Entry point with a Lambda-style signature: run main() and time it.

    Args:
        event: not read by this function.
        context: not read by this function.

    Returns:
        The fixed string "Process completed successfully".
    """
    invoked_at = datetime.now(timezone.utc)
    print("Lambda handler invoked")

    main()

    elapsed_seconds = (datetime.now(timezone.utc) - invoked_at).total_seconds()
    print(f"Running time is {elapsed_seconds * 1000 / 60000:.2f} min")
    return "Process completed successfully"
155+
# Script entry point: run the handler without an event or context, then send
# the completion notice (Korean: the manual data CSV merge has finished).
if __name__ == "__main__":
    lambda_handler(event=None, context=None)
    send_slack_message("수동 데이터 CSV 병합이 완료되었습니다!")
0 commit comments