Skip to content

Commit aa94d06

Browse files
committed
Add big collection test for GBK and BagState
1 parent 41828ba commit aa94d06

File tree

5 files changed

+155
-0
lines changed

5 files changed

+155
-0
lines changed

big-collection/python/bigbag.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import logging
2+
from typing import Tuple
3+
4+
from apache_beam import Create
5+
from apache_beam import DoFn
6+
from apache_beam import FlatMap
7+
from apache_beam import ParDo
8+
from apache_beam import Pipeline
9+
from apache_beam import TimeDomain
10+
from apache_beam import WithKeys
11+
from apache_beam import typehints
12+
from apache_beam.coders import BytesCoder
13+
from apache_beam.options.pipeline_options import PipelineOptions
14+
from apache_beam.options.pipeline_options import SetupOptions
15+
from apache_beam.transforms.userstate import BagStateSpec
16+
from apache_beam.transforms.userstate import TimerSpec
17+
from apache_beam.transforms.userstate import on_timer
18+
19+
# Workload sizing. Total bytes pushed through the pipeline is
# NUM_SHARDS * NUM_ELEMENTS_PER_SHARD * ELEMENT_BYTES ~= 3 GiB.
NUM_SHARDS = 100
NUM_ELEMENTS_PER_SHARD = 10
ELEMENT_BYTES = 3 * 1024 * 1024  # 3 MiB per element
23+
24+
25+
@typehints.with_input_types(Tuple[str, bytes])
@typehints.with_output_types(None)
class BigBagDoFn(DoFn):
    """Stateful DoFn that buffers every incoming value in bag state.

    Each element's bytes payload is appended to the bag, and a watermark
    timer is armed for the end of the element's window. When the timer
    fires, the whole bag is read back and the element count and total
    byte size are logged.
    """

    VALUES_STATE = BagStateSpec('values', BytesCoder())
    END_OF_WINDOW_TIMER = TimerSpec('end_of_window', TimeDomain.WATERMARK)

    def process(self, element: Tuple[str, bytes], window=DoFn.WindowParam,
                values_state=DoFn.StateParam(VALUES_STATE),
                end_of_window_timer=DoFn.TimerParam(END_OF_WINDOW_TIMER)):
        logging.info('start process.')
        _key, payload = element
        # Arm the flush timer for this window, then buffer the payload.
        end_of_window_timer.set(window.end)
        values_state.add(payload)
        logging.info('end process.')

    @on_timer(END_OF_WINDOW_TIMER)
    def end_of_window(self, values_state=DoFn.StateParam(VALUES_STATE)):
        logging.info('start end_of_window.')

        # Stream through the bag once, tallying element count and bytes.
        read_count = 0
        read_bytes = 0
        for payload in values_state.read():
            read_count += 1
            read_bytes += len(payload)

        logging.info('read_count: %s, read_bytes: %s', read_count, read_bytes)
        logging.info('end end_of_window.')
53+
54+
55+
def main():
    """Build and run the pipeline.

    Fans out NUM_SHARDS shards into NUM_ELEMENTS_PER_SHARD zero-filled
    blobs of ELEMENT_BYTES each, keys everything onto a single empty-string
    key, and hands the stream to the stateful BigBagDoFn.
    """
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=options)
    _ = (
        pipeline
        | Create(list(range(NUM_SHARDS)))
        | FlatMap(lambda _:
                  (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
        | WithKeys('')
        | ParDo(BigBagDoFn()))

    pipeline.run()
68+
69+
70+
if __name__ == '__main__':
    # INFO level is required to see the progress logs emitted by BigBagDoFn.
    logging.getLogger().setLevel(logging.INFO)
    main()

big-collection/python/biggbk.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import logging
2+
from typing import Iterable
3+
from typing import Tuple
4+
5+
from apache_beam import Create
6+
from apache_beam import FlatMap
7+
from apache_beam import GroupByKey
8+
from apache_beam import ParDo
9+
from apache_beam import Pipeline
10+
from apache_beam import WithKeys
11+
from apache_beam.options.pipeline_options import PipelineOptions
12+
from apache_beam.options.pipeline_options import SetupOptions
13+
14+
# Workload sizing. Total bytes pushed through the pipeline is
# NUM_SHARDS * NUM_ELEMENTS_PER_SHARD * ELEMENT_BYTES ~= 3 GiB.

NUM_SHARDS = 100
NUM_ELEMENTS_PER_SHARD = 10
ELEMENT_BYTES = 3 * 1024 * 1024  # 3 MiB per element
19+
20+
def print_bytes(element: Tuple[str, Iterable[bytes]]) -> None:
    """Log how many values a grouped element holds and their total size.

    Args:
        element: A (key, values) pair as produced by GroupByKey; the key
            itself is not used.
    """
    logging.info('start print_bytes.')
    _key, grouped_values = element

    # Single pass over the (possibly lazily materialized) value iterable.
    total_count = 0
    total_bytes = 0
    for blob in grouped_values:
        total_count += 1
        total_bytes += len(blob)

    logging.info('read_count: %s, read_bytes: %s', total_count, total_bytes)
    logging.info('end print_bytes.')
32+
33+
34+
def main():
    """Build and run the pipeline.

    Fans out NUM_SHARDS shards into NUM_ELEMENTS_PER_SHARD zero-filled
    blobs of ELEMENT_BYTES each, keys everything onto a single empty-string
    key, groups the whole collection with GroupByKey, and logs the result
    sizes via print_bytes.
    """
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=options)
    _ = (
        pipeline
        | Create(list(range(NUM_SHARDS)))
        | FlatMap(lambda _:
                  (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
        | WithKeys('')
        | GroupByKey()
        | ParDo(print_bytes))

    pipeline.run()
48+
49+
50+
if __name__ == '__main__':
    # INFO level is required to see the progress logs emitted by print_bytes.
    logging.getLogger().setLevel(logging.INFO)
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
apache-beam[gcp]==2.35.0
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# Launch the BagState big-collection test (bigbag.py) on Dataflow.
# Uses the currently configured gcloud project.

PROJECT=$(gcloud config get-value project)
PIPELINE=bigbag.py

# NOTE: Job has no progress if --number_of_worker_harness_threads=1
# Quote expansions so an empty/space-containing project id fails loudly
# instead of silently mangling the argument list (shellcheck SC2086).
python "$PIPELINE" --runner=DataflowRunner \
  --project="$PROJECT" \
  --region=us-central1 \
  --streaming \
  --worker_machine_type=custom-1-102400-ext \
  --experiments=use_runner_v2 \
  --experiments=no_use_multiple_sdk_containers \
  --max_num_workers=1 \
  --number_of_worker_harness_threads=2
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
# Launch the GroupByKey big-collection test (biggbk.py) on Dataflow.
# Uses the currently configured gcloud project.

PROJECT=$(gcloud config get-value project)
PIPELINE=biggbk.py

# NOTE: Job has no progress if --number_of_worker_harness_threads=1
# Quote expansions so an empty/space-containing project id fails loudly
# instead of silently mangling the argument list (shellcheck SC2086).
python "$PIPELINE" --runner=DataflowRunner \
  --project="$PROJECT" \
  --region=us-central1 \
  --streaming \
  --worker_machine_type=custom-1-102400-ext \
  --experiments=use_runner_v2 \
  --experiments=no_use_multiple_sdk_containers \
  --max_num_workers=1 \
  --number_of_worker_harness_threads=2

0 commit comments

Comments
 (0)