Skip to content

Commit 223a269

Browse files
authored
Merge pull request #609 from ddps-lab/azure-collector
Force first optimization with DesiredCount==1
2 parents a2fd0a3 + a213d2b commit 223a269

2 files changed

Lines changed: 35 additions & 10 deletions

File tree

collector/spot-dataset/azure/lambda/current_collector/lambda_function_sps.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,26 +71,44 @@ def lambda_handler(event, context):
7171
desired_count_index = metadata["desired_count_index"]
7272
current_desired_count = DESIRED_COUNTS[desired_count_index]
7373

74-
workload_date = metadata.get("workload_date")
74+
# 2. Determine Execution Parameters (Date Check & Index Rotation)
75+
is_first_time_optimization = False
7576

76-
# 2. Check Date for First Time Optimization
77+
# Check Workload Date
7778
if workload_date != current_date:
78-
Logger.info(f"Workload date changed: {workload_date} -> {current_date}. Running First Time Optimization with Count: {current_desired_count}")
79-
# Use the rotated desired count for the optimization run too
80-
sps_df = load_sps.collect_spot_placement_score_first_time(desired_counts=[current_desired_count])
79+
Logger.info(f"Workload date changed: {workload_date} -> {current_date}. Prepared First Time Optimization.")
80+
is_first_time_optimization = True
81+
82+
# Update Metadata: Date
8183
metadata["workload_date"] = current_date
84+
85+
# Force Desired Count to 1 for First Time Optimization execution
86+
# Note: We do NOT reset the index here. Rotation continues below, so the count at the current index is skipped for this run.
87+
current_execution_desired_count = 1
8288
else:
83-
Logger.info(f"Running Regular Collection. Desired Count: {current_desired_count} (Index: {desired_count_index})")
84-
sps_df = load_sps.collect_spot_placement_score(desired_counts=[current_desired_count])
89+
current_execution_desired_count = current_desired_count
8590

86-
# 3. Update Index for Next Run
91+
# Update Metadata: Next Index (Always rotate to prevent stuck loops)
8792
next_index = (desired_count_index + 1) % len(DESIRED_COUNTS)
8893
metadata["desired_count_index"] = next_index
94+
95+
# 3. Save Metadata (State Commit BEFORE Execution)
8996
try:
9097
write_metadata(metadata)
9198
except Exception as e:
9299
Logger.error(f"Failed to write metadata: {e}")
93-
raise
100+
# Log but proceed. If the write failed, the next run may retry the same index,
101+
# but if execution succeeds, at least the data is collected.
102+
# If execution also fails, we risk a retry loop, but S3 failures are rare compared to API timeouts.
103+
104+
# 4. Execute Logic
105+
if is_first_time_optimization:
106+
Logger.info(f"Executing First Time Optimization with Count: {current_execution_desired_count} (Forced)")
107+
sps_df = load_sps.collect_spot_placement_score_first_time(desired_counts=[current_execution_desired_count])
108+
else:
109+
Logger.info(f"Executing Regular Collection. Desired Count: {current_execution_desired_count} (Index: {desired_count_index})")
110+
sps_df = load_sps.collect_spot_placement_score(desired_counts=[current_execution_desired_count])
111+
94112

95113
else:
96114
# --- Legacy Fallback Logic: S3 Metadata Missing ---

collector/spot-dataset/azure/lambda/current_collector/load_sps.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,14 @@ def execute_spot_placement_score_api(region_chunk, instance_type_chunk, desired_
320320
retries = handle_retry("InvalidInstanceType", retries, max_retries)
321321

322322
if "BadGatewayConnection" in error_message:
323-
print(f"HTTP error occurred: {error_message}")
323+
print(f"[DEBUG_ERROR] BadGatewayConnection occurred!")
324+
print(f"URL: {url}")
325+
print(f"Data - DesiredCount: {desired_count}, AvailabilityZones: {availability_zones}")
326+
print(f"Data - Region Chunk: {region_chunk}")
327+
print(f"Data - Instance Chunk: {instance_type_chunk}")
328+
print(f"Response Headers: {http_err.response.headers}")
329+
print(f"Full Error Message: {error_message}")
330+
324331
retries = handle_retry("BadGatewayConnection", retries, max_retries)
325332

326333
elif "InvalidParameter" in error_message:

0 commit comments

Comments
 (0)