Conversation
| actions = (event.data for event in events) | ||
|
|
||
| index = 0 | ||
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): # type: ignore | ||
| if index >= len(events): | ||
| break | ||
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): | ||
|
|
||
| event = events[index] | ||
| event.state.current_state = EventStateType.STORING_IN_OUTPUT | ||
| # This should not be possible! | ||
| assert index < len(events) |
There was a problem hiding this comment.
bulk_id = uuid.uuid4()
actions = ({**event.data, "_id": f"{bulk_id}_{index}"} for index, event in enumerate(events))
index = 0
async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs):
# This should not be possible!
assert index < len(events)
    assert index == int(item["create"]["_id"][37:])
This proves that helpers.async_streaming_bulk keeps the order of the actions and the yielded results the same.
There was a problem hiding this comment.
This could possibly stay in the code, but I don't like generating a UUID here and setting it as an ID when OpenSearch can most likely do that more performantly itself.
mhoff
left a comment
There was a problem hiding this comment.
Many thanks for your work. Here are the few comments we already discussed.
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): # type: ignore | ||
| if index >= len(events): | ||
| break | ||
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): |
There was a problem hiding this comment.
# "queue_size": self.config.queue_size,
# "thread_count": self.config.thread_count,
Please remove and describe that it's not used in the docs
| # parallel_bulk often returned item that allowed item.get("_op_type") | ||
| # streaming_bulk usually returns {"index": {...}} / {"create": {...}} | ||
| op_type = item.get("_op_type") if isinstance(item, dict) else None | ||
| if not op_type and isinstance(item, dict) and item: | ||
| op_type = self.config.default_op_type | ||
| if "_op_type" in item: | ||
| op_type = item["_op_type"] | ||
| elif isinstance(item, dict): | ||
| op_type = next(iter(item.keys())) |
There was a problem hiding this comment.
Please simplify this code, as we are only using the async_streaming_bulk interface right now and don't need the backwards compatibility.
| error_info = ( | ||
| item.get(op_type, {}) if isinstance(item.get(op_type), dict) else {} | ||
| ) | ||
| if op_type in item and isinstance(item[op_type], dict): |
There was a problem hiding this comment.
We can statically assume item to be a dict which can simplify this code quite a bit
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): # type: ignore | ||
| if index >= len(events): | ||
| break | ||
| async for success, item in helpers.async_streaming_bulk(client, actions, **kwargs): |
There was a problem hiding this comment.
Please add a follow-up ticket for us that we might want to send the chunks concurrently in the future, depending on where we identify actual performance bottlenecks.
Description
Cleanup and optimize Opensearch async output
Assignee
Documentation
Code Quality
How did you verify that the changes work in practice?
Reviewer
The rendered docs for this PR can be found here.