-
Notifications
You must be signed in to change notification settings - Fork 192
Expand file tree
/
Copy pathaudio_event.py
More file actions
112 lines (95 loc) · 5.06 KB
/
audio_event.py
File metadata and controls
112 lines (95 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# ==============================================================================
# Copyright (C) 2018-2026 Intel Corporation
#
# SPDX-License-Identifier: MIT
# ==============================================================================
## @file audio_event.py
# @brief This file contains gstgva.audio_event.AudioEvent class to control audio events for particular gstgva.audio_frame.AudioFrame with gstgva.tensor.Tensor instances attached
import ctypes
from typing import List
from collections import namedtuple
from ..tensor import Tensor
from ..util import libgst, libgobject, GLIST_POINTER
from .audio_event_meta import AudioEventMeta
import gi
gi.require_version('GstAudio', '1.0')
gi.require_version('GLib', '2.0')
gi.require_version('Gst', '1.0')
# pylint: disable=no-name-in-module
from gi.repository import GstAudio, GLib, GObject, Gst
# pylint: enable=no-name-in-module
# Lightweight value type describing an event's time span; both fields are
# taken verbatim from AudioEventMeta.start_timestamp / end_timestamp.
Segment = namedtuple("Segment", "start_time end_time")
## @brief This class represents an audio event - an object describing a detection result (audio segment) and containing multiple
# Tensor objects (inference results) attached by multiple models. For example, it can be an audio event with detected
# speech converted to text. It can be produced by a pipeline with a gvaaudiodetect element with a detection model and
# a gvaspeechtotext element with a speech-to-text model. Such an AudioEvent will have start and end timestamps filled and will
# have 2 Tensor objects attached - one Tensor object with the detection result, the other with the speech-to-text result
class AudioEvent(object):
    ## @brief Get clip of AudioEvent as start and end time stamps
    # @return Segment namedtuple (start_time, end_time) copied from the underlying AudioEventMeta
    def segment(self):
        return Segment(start_time = self.__event_meta.start_timestamp,
                       end_time = self.__event_meta.end_timestamp)

    ## @brief Get AudioEvent label. The label is stored in the meta as a GLib quark
    # (event_type) and resolved back to a string here.
    # @return AudioEvent label
    def label(self) -> str:
        return GLib.quark_to_string(self.__event_meta.event_type)

    ## @brief Get AudioEvent detection confidence (set by gvaaudiodetect)
    # @return detection Tensor confidence if a detection Tensor exists, otherwise None
    def confidence(self) -> float:
        detection = self.detection()
        # May return None despite the float annotation when no detection Tensor is attached.
        return detection.confidence() if detection else None

    ## @brief Get all Tensor instances added to this AudioEvent
    # @return generator yielding one Tensor per GstStructure in the meta's params GList
    def tensors(self):
        # _params is a ctypes GLIST_POINTER; walk the singly-linked GList until NULL.
        param = self.meta()._params
        while param:
            tensor_structure = param.contents.data
            yield Tensor(tensor_structure)
            param = param.contents.next

    ## @brief Returns the first detection Tensor found among the tensors attached to this
    # AudioEvent. As any other Tensor, the returned detection Tensor can contain arbitrary
    # information. If you use AudioEvent based on GstGVAAudioEventMeta attached by
    # gvaaudiodetect by default, then this Tensor will contain "label_id", "confidence",
    # "start_timestamp", "end_timestamp" fields.
    # @return detection Tensor, or None if no detection Tensor was added to this AudioEvent
    def detection(self) -> Tensor:
        for tensor in self.tensors():
            if tensor.is_detection():
                return tensor
        return None

    ## @brief Get label_id from the detection Tensor of this AudioEvent
    # @return detection Tensor label_id if a detection Tensor exists, otherwise None
    def label_id(self) -> int:
        detection = self.detection()
        return detection.label_id() if detection else None

    ## @brief Get AudioEventMeta containing start, end time information and tensors (inference results).
    # Tensors are represented as GstStructures added to GstGVAAudioEventMeta.params
    # @return AudioEventMeta containing start, end time information and tensors (inference results)
    def meta(self) -> AudioEventMeta:
        return self.__event_meta

    ## @brief Iterate AudioEventMeta instances attached to buffer
    # @param buffer buffer with GstGVAAudioEventMeta instances attached
    # @return generator for AudioEvent instances wrapping each attached meta
    @classmethod
    def _iterate(cls, buffer: Gst.Buffer):
        try:
            meta_api = hash(GObject.GType.from_name("GstGVAAudioEventMetaAPI"))
        except Exception:
            # Meta type not registered in this process - nothing to iterate.
            return
        gpointer = ctypes.c_void_p()
        while True:
            try:
                value = libgst.gst_buffer_iterate_meta_filtered(hash(buffer), ctypes.byref(gpointer), meta_api)
            except Exception:
                # Treat any iteration failure as end-of-metas (best-effort, matches prior behavior).
                value = None
            if not value:
                return
            event_meta = ctypes.cast(value, ctypes.POINTER(AudioEventMeta)).contents
            yield AudioEvent(event_meta)

    ## @brief Construct AudioEvent instance from AudioEventMeta. After this, AudioEvent will
    # obtain all tensors (detection & inference results) from AudioEventMeta
    # @param event_meta AudioEventMeta containing start, end time information and tensors
    def __init__(self, event_meta: AudioEventMeta):
        self.__event_meta = event_meta