@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List
+from typing import List, Optional

 import logging
 import multiprocessing
@@ -8,6 +8,7 @@
 from datetime import datetime, timedelta, timezone

 from temporalio import workflow
+from temporalio.common import SearchAttributeKey
 from temporalio.worker import Worker, SharedStateManager
 from temporalio.client import (
     Client as TemporalClient,
@@ -18,6 +19,11 @@
     ScheduleState,
 )

+from oonipipeline.temporal.activities.common import (
+    optimize_all_tables,
+    ClickhouseParams,
+)
+

 # Handle temporal sandbox violations related to calls to self.processName =
 # mp.current_process().name in logger, see:
@@ -36,7 +42,7 @@
     make_analysis_in_a_day,
     make_cc_batches,
 )
-from oonipipeline.temporal.common import get_obs_count_by_cc, optimize_all_tables
+from oonipipeline.temporal.common import get_obs_count_by_cc
 from oonipipeline.temporal.activities.observations import (
     MakeObservationsParams,
     make_observation_in_day,
@@ -67,6 +73,7 @@ def make_worker(client: TemporalClient, parallelism: int) -> Worker:
             make_observation_in_day,
             make_ground_truths_in_day,
             make_analysis_in_a_day,
+            optimize_all_tables,
         ],
         activity_executor=concurrent.futures.ProcessPoolExecutor(parallelism + 2),
         max_concurrent_activities=parallelism,
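With `optimize_all_tables` registered on the worker, table optimization runs as a regular activity in the `ProcessPoolExecutor` (sized `parallelism + 2` to leave headroom beyond the per-day activities) instead of being called inline from workflow code. A minimal sketch of what the activity side presumably looks like, assuming the `@activity.defn` decorator and the `ClickhouseParams` dataclass imported above; the body is elided:

```python
from dataclasses import dataclass

from temporalio import activity


@dataclass
class ClickhouseParams:
    clickhouse_url: str


@activity.defn
def optimize_all_tables(params: ClickhouseParams) -> None:
    # Synchronous activities must be module-level (picklable) functions
    # when executed in a ProcessPoolExecutor-backed worker.
    ...  # issue OPTIMIZE TABLE statements against params.clickhouse_url
```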
@@ -76,6 +83,14 @@ def make_worker(client: TemporalClient, parallelism: int) -> Worker:
     )


+def get_workflow_start_time() -> datetime:
+    workflow_start_time = workflow.info().typed_search_attributes.get(
+        SearchAttributeKey.for_datetime("TemporalScheduledStartTime")
+    )
+    assert workflow_start_time is not None, "TemporalScheduledStartTime not set"
+    return workflow_start_time
+
+
 @dataclass
 class ObservationsWorkflowParams:
     probe_cc: List[str]
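`TemporalScheduledStartTime` is a search attribute the Temporal server sets on workflow runs started by a Schedule; on a manually started run `typed_search_attributes.get` returns `None`, which is what the assert guards against. A hedged sketch of a more forgiving variant (not what this diff does), falling back to the deterministic workflow clock:

```python
def get_workflow_start_time_or_now() -> datetime:
    # Hypothetical fallback variant: use the schedule's start time when
    # present, otherwise the deterministic workflow-safe clock.
    start_time = workflow.info().typed_search_attributes.get(
        SearchAttributeKey.for_datetime("TemporalScheduledStartTime")
    )
    return start_time if start_time is not None else workflow.now()
```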
@@ -84,37 +99,27 @@ class ObservationsWorkflowParams:
     data_dir: str
     fast_fail: bool
     log_level: int = logging.INFO
+    bucket_date: Optional[str] = None


 @workflow.defn
 class ObservationsWorkflow:
     @workflow.run
     async def run(self, params: ObservationsWorkflowParams) -> dict:
-        # TODO(art): wrap this a coroutine call
-        optimize_all_tables(params.clickhouse)
-
-        workflow_id = workflow.info().workflow_id
-
-        # TODO(art): this is quite sketchy. Waiting on temporal slack question:
-        # https://temporalio.slack.com/archives/CTT84RS0P/p1714040382186429
-        run_ts = datetime.strptime(
-            "-".join(workflow_id.split("-")[-3:]),
-            "%Y-%m-%dT%H:%M:%SZ",
+        if params.bucket_date is None:
+            params.bucket_date = (
+                get_workflow_start_time() - timedelta(days=1)
+            ).strftime("%Y-%m-%d")
+
+        await workflow.execute_activity(
+            optimize_all_tables,
+            ClickhouseParams(clickhouse_url=params.clickhouse),
+            start_to_close_timeout=timedelta(minutes=5),
         )
-        bucket_date = (run_ts - timedelta(days=1)).strftime("%Y-%m-%d")
-
-        # read_time = workflow_info.start_time - timedelta(days=1)
-        # log.info(f"workflow.info().start_time={workflow.info().start_time} ")
-        # log.info(f"workflow.info().cron_schedule={workflow.info().cron_schedule} ")
-        # log.info(f"workflow_info.workflow_id={workflow_info.workflow_id} ")
-        # log.info(f"workflow_info.run_id={workflow_info.run_id} ")
-        # log.info(f"workflow.now()={workflow.now()}")
-        # print(workflow)
-        # bucket_date = f"{read_time.year}-{read_time.month:02}-{read_time.day:02}"

         t = PerfTimer()
         log.info(
-            f"Starting observation making with probe_cc={params.probe_cc},test_name={params.test_name} bucket_date={bucket_date}"
+            f"Starting observation making with probe_cc={params.probe_cc},test_name={params.test_name} bucket_date={params.bucket_date}"
         )

         res = await workflow.execute_activity(
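`bucket_date` now defaults to the day before the scheduled start time, so a schedule firing shortly after midnight UTC processes the day that just closed; the fragile parsing of a timestamp out of the workflow id is gone. The arithmetic, for a concrete (made-up) firing time:

```python
from datetime import datetime, timedelta, timezone

scheduled = datetime(2024, 5, 2, 0, 30, tzinfo=timezone.utc)  # fires 00:30 UTC
assert (scheduled - timedelta(days=1)).strftime("%Y-%m-%d") == "2024-05-01"
```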
@@ -125,19 +130,17 @@ async def run(self, params: ObservationsWorkflowParams) -> dict:
                 clickhouse=params.clickhouse,
                 data_dir=params.data_dir,
                 fast_fail=params.fast_fail,
-                bucket_date=bucket_date,
+                bucket_date=params.bucket_date,
             ),
             start_to_close_timeout=timedelta(minutes=30),
         )

         total_size = res["size"]
         total_measurement_count = res["measurement_count"]
-
-        # This needs to be adjusted once we get the the per entry concurrency working
         mb_per_sec = round(total_size / t.s / 10**6, 1)
         msmt_per_sec = round(total_measurement_count / t.s)
         log.info(
-            f"finished processing {bucket_date} speed: {mb_per_sec} MB/s ({msmt_per_sec} msmt/s)"
+            f"finished processing {params.bucket_date} speed: {mb_per_sec} MB/s ({msmt_per_sec} msmt/s)"
         )

         # with ClickhouseConnection(params.clickhouse) as db:
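Because `bucket_date` is now a parameter with a `None` default, a specific day can be reprocessed without a schedule. A hypothetical backfill invocation; the field values, workflow id, and task queue name are illustrative, not taken from this repository:

```python
async def backfill_one_day(client: TemporalClient) -> dict:
    # Passing bucket_date explicitly skips the TemporalScheduledStartTime
    # lookup, so this also works for runs not started by a Schedule.
    return await client.execute_workflow(
        ObservationsWorkflow.run,
        ObservationsWorkflowParams(
            probe_cc=["IT"],
            test_name=["web_connectivity"],
            clickhouse="clickhouse://localhost",
            data_dir="/data/oonipipeline",
            fast_fail=False,
            bucket_date="2024-05-01",
        ),
        id="oonipipeline-observations-2024-05-01",
        task_queue="oonipipeline-task-queue",
    )
```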