-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarizer.py
More file actions
141 lines (117 loc) · 5.55 KB
/
summarizer.py
File metadata and controls
141 lines (117 loc) · 5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Temporal Summarizer
# Adapted from https://github.com/Hyu-Zhang/HCQA/
import os
import re
import json
import time
import pdb
import ast
import openai
from tqdm import tqdm
from model import GPT
from datetime import datetime
from util import parse_json
# --- Prompt templates -------------------------------------------------------
# These module-level strings are runtime data sent verbatim to the LLM; their
# content (including the "systerm" [sic] name and any grammatical quirks) is
# preserved exactly for reproducibility with the upstream HCQA prompts.

# System prompt for the caption -> summary step.
systerm = '''
You're a visual summary expert. You can accurately make a [SUMMARY] based on [CAPTION], where the [CAPTION] is textual descriptions of the video as seen from your first person perspective.
'''
# In-context instructions for summarization; format() slots:
#   {example} - a worked CAPTION->SUMMARY example, {caption} - the frame captions.
incontext_prompt = '''
[CAPTION]: Textual descriptions of first-person perspective videos, about natural human activities and behaviour. Each caption represents a frame in the video in chronological order, although they may not be consecutive. And each description is separated by a newline character. At the beginning of each caption, the #C indicates the image seen from your point of view, and the #O indicates the other people in the image you seen.
[SUMMARY]: Based on the CAPTIONS of these video clips, you need to summarise them into an overall description of the video, in chronological order.
I will give you an example as follow:
<Example>
{example}
Now, you should make a [SUMMARY] based on the [CAPTION] below. You SHOULD follow the format of example.
[CAPTION]
{caption}
[SUMMARY]
'''
# System prompt for the multiple-choice QA step.
systerm_qa='''
You are a visual question answering expert. You can choose the correct answer from five options of [OPTION] based on the [CAPTION], [SUMMARY], [QUESTION], and [REASON]. Where the [CAPTION] is textual descriptions of the video as seen from your first person perspective.
'''
# In-context instructions for QA; format() slots:
#   {example}, {caption}, {summary}, {question}, {option}.
incontext_prompt_qa='''
[CAPTION]: Textual descriptions of first-person perspective videos, about natural human activities and behaviour. Each caption represents a frame in the video in chronological order, although they may not be consecutive. And each description is separated by a newline character. At the beginning of each caption, the #C indicates the image seen from your point of view, and the #O indicates the other people in the image you seen.
[SUMMARY]: Based on the CAPTIONS of these video clips, an overall description of the video, in chronological order.
[QUESTION]: A question about video that needs to be answered.
[OPTION]: Five candidates for the question.
[REASON]: Based on [QUESTION] and [SUMMARY], reasoning step by step to get the answer. If [SUMMARY] doesn't have enough information, you need to get it from the [CAPTION].
I will give you some examples as follow:
{example}
Now, you should first make a [REASON] based on [QUESTION] and [SUMMARY], then give right number of [OPTION] as [ANSWER] . Additionally, you need to give me [CONFIDENCE] that indicates your confidence in answering the question accurately, on a scale from 1 to 5. You SHOULD answer the question, even given a low confidence.
[CAPTION]
{caption}
[SUMMARY]
{summary}
[QUESTION]
{question}
[OPTION]
{option}
'''
# Output-format instruction appended to the QA prompt; the braces here are
# literal JSON (this string is concatenated, never .format()-ed).
response_prompt='''
YOU MUST output in the JSON format.
{
"REASON":"[REASON]",
"ANSWER":"[ANSWER]",
"CONFIDENCE": "[CONFIDENCE]"
}
'''
# NOTE(review): example loading is done by the caller now; kept for reference.
# example_summary_path = "data/egoschema/example_summary.txt"
# with open(example_summary_path,'r') as ex:
# example_summary = ex.read()
# example_qa_by_summary_path = "data/egoschema/example_qa_by_summary.txt"
# with open(example_qa_by_summary_path,'r') as ex:
# example_qa_by_summary = ex.read()
def summarize_one_video(summarizer, video_id, sample_caps,
                        example_summary, use_cache, logger):
    """Produce a chronological [SUMMARY] for one video from its frame captions.

    Args:
        summarizer: model wrapper exposing forward(head, prompt, use_cache, logger).
        video_id: identifier prepended to the prompt (also used for caching).
        sample_caps: mapping of frame key -> caption string, in chronological
            (insertion) order; only the values are used.
        example_summary: worked CAPTION->SUMMARY example inserted in the prompt.
        use_cache: forwarded to the model wrapper.
        logger: forwarded to the model wrapper.

    Returns:
        The raw text response from the summarizer.
    """
    # One caption per line, each terminated by a newline.
    caption_text = "".join(f"{cap}\n" for cap in sample_caps.values())
    prompt = (
        str(video_id) + "\n" + systerm + "\n"
        + incontext_prompt.format(example=example_summary, caption=caption_text)
    )
    response, _info = summarizer.forward(head=None, prompt=prompt,
                                         use_cache=use_cache, logger=logger)
    return response
def qa_one_video_by_summary(qa_model, ann, summary, video_id, sample_caps,
                            example_qa_by_summary, use_cache, logger):
    """Answer one 5-way multiple-choice question using captions plus a summary.

    Args:
        qa_model: model wrapper exposing forward(head, prompt, use_cache, logger).
        ann: annotation dict with keys 'question' and 'option 0'..'option 4'.
        summary: pre-computed video summary text.
        video_id: identifier prepended to the prompt and passed to parse_json.
        sample_caps: mapping of frame key -> caption string, chronological order.
        example_qa_by_summary: in-context QA example inserted in the prompt.
        use_cache: forwarded to the model wrapper.
        logger: forwarded to the model wrapper.

    Returns:
        The parsed response dict (expected keys REASON/ANSWER/CONFIDENCE), or
        None when neither literal_eval nor parse_json can parse the output.
    """
    # One caption per line, each terminated by a newline.
    caps = "".join(f"{c}\n" for c in sample_caps.values())
    que = 'question: ' + ann['question']
    # Five fixed options, 'option 0: ...' through 'option 4: ...'.
    opt = "".join(f"option {i}: " + ann[f'option {i}'] + "\n" for i in range(5))
    instruction = str(video_id) + "\n" + systerm_qa + "\n" + incontext_prompt_qa.format(
        example=example_qa_by_summary, caption=caps, summary=summary,
        question=que, option=opt
    ) + "\n" + response_prompt
    response, info = qa_model.forward(head=None, prompt=instruction,
                                      use_cache=use_cache, logger=logger)
    # The model is asked for JSON-ish output; literal_eval handles a plain
    # Python/JSON dict, parse_json handles markdown-fenced or noisy output.
    # Catch only the parse errors literal_eval raises for bad input, not
    # everything (the old bare `except:` also swallowed KeyboardInterrupt).
    try:
        response_dict = ast.literal_eval(response)
    except (ValueError, SyntaxError):
        response_dict = parse_json(response, video_id)
    return response_dict
def postprocess_response_dict(response_dict):
    """Extract (answer_id, confidence) as ints from a model response dict.

    Args:
        response_dict: dict expected to contain "ANSWER" and "CONFIDENCE"
            (values may be ints, digit strings, or free text containing a
            number), or None when parsing failed upstream.

    Returns:
        Tuple (answer_id, confidence). answer_id is -1 unless it parses to an
        int in [0, 4]; confidence is -1 when no integer can be extracted.
        (-1, -1) when the dict is None or a key is missing.

    Fixes over the original:
        - no uncaught AttributeError when CONFIDENCE has no digits (the old
          fallback called .group() on a possibly-None match);
        - the 0..4 range clamp now also applies to answers recovered via the
          regex fallback (e.g. "option 9" used to leak through as 9);
        - `is None` instead of `== None`, and no bare `except:`.
    """
    if (response_dict is None
            or "ANSWER" not in response_dict
            or "CONFIDENCE" not in response_dict):
        return -1, -1

    def _to_int(value):
        # Try a direct int() conversion; otherwise take the first run of
        # digits in the text, or -1 when there is none.
        try:
            return int(value)
        except (TypeError, ValueError):
            match = re.search(r'\d+', str(value))
            return int(match.group()) if match else -1

    id_value = _to_int(response_dict["ANSWER"])
    conf = _to_int(response_dict["CONFIDENCE"])
    if not 0 <= id_value <= 4:
        id_value = -1  # only option indices 0..4 are valid answers
    return id_value, conf