Skip to content

Commit 4ce1b80

Browse files
authored
Dynamics of Talk-Time Sharing in Conversations (#276)
1 parent 0bda800 commit 4ce1b80

File tree

8 files changed

+3692
-1
lines changed

8 files changed

+3692
-1
lines changed

convokit/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"utterance_likelihood": ".utterance_likelihood",
3333
"speaker_convo_helpers": ".speaker_convo_helpers",
3434
"politeness_collections": ".politeness_collections",
35+
"talktimesharing": ".talktimesharing",
3536
}
3637

3738
# Cache for loaded modules
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .talktimesharing import *
Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
from convokit.model import Corpus
2+
from convokit.transformer import Transformer
3+
from tqdm import tqdm
4+
from typing import Callable
5+
from convokit.model.conversation import Conversation
6+
import re
7+
8+
from .talktimesharing_util import (
9+
_get_ps,
10+
_convo_balance_score,
11+
_convo_balance_lst,
12+
_plot_individual_conversation_floors,
13+
_plot_multi_conversation_floors,
14+
)
15+
16+
17+
def plot_single_conversation_balance(
    corpus,
    convo_id,
    window_ps_threshold,
    window_size,
    sliding_size,
    remove_first_last_utt,
    min_utt_words,
    plot_name=None,
    window_ss_threshold=None,
):
    """
    Plot the talk-time sharing floors of a single conversation.

    Thin wrapper around `_plot_individual_conversation_floors`: every argument is
    forwarded unchanged, except that a missing secondary-speaker threshold is
    replaced by the primary-speaker threshold.

    :param corpus: Corpus holding the conversation to plot.
    :param convo_id: ID of the conversation to plot.
    :param window_ps_threshold: Window-level threshold for the primary speaker group.
    :param window_size: Window size forwarded to the plotting helper.
    :param sliding_size: Sliding step forwarded to the plotting helper.
    :param remove_first_last_utt: Forwarded to the plotting helper.
    :param min_utt_words: Forwarded to the plotting helper.
    :param plot_name: Forwarded to the plotting helper as `plot_name`
        (presumably the output name of the figure -- confirm against the helper).
    :param window_ss_threshold: Window-level threshold for the secondary speaker
        group; when None, `window_ps_threshold` is used instead.
    """
    # Fall back to the primary threshold when no secondary threshold was given.
    secondary_threshold = (
        window_ps_threshold if window_ss_threshold is None else window_ss_threshold
    )
    _plot_individual_conversation_floors(
        corpus,
        convo_id,
        window_ps_threshold,
        secondary_threshold,
        window_size,
        sliding_size,
        remove_first_last_utt,
        min_utt_words,
        plot_name=plot_name,
    )
41+
42+
43+
def plot_multi_conversation_balance(
    corpus,
    convo_id_lst,
    window_ps_threshold,
    window_ss_threshold,
    window_size,
    sliding_size,
    remove_first_last_utt,
    min_utt_words,
    plot_name=None,
):
    """
    Plot the talk-time sharing floors of several conversations.

    Thin wrapper around `_plot_multi_conversation_floors`: every argument is
    forwarded unchanged, except that a `None` secondary-speaker threshold is
    replaced by the primary-speaker threshold.

    :param corpus: Corpus holding the conversations to plot.
    :param convo_id_lst: IDs of the conversations to plot.
    :param window_ps_threshold: Window-level threshold for the primary speaker group.
    :param window_ss_threshold: Window-level threshold for the secondary speaker
        group; pass None to reuse `window_ps_threshold`.
    :param window_size: Window size forwarded to the plotting helper.
    :param sliding_size: Sliding step forwarded to the plotting helper.
    :param remove_first_last_utt: Forwarded to the plotting helper.
    :param min_utt_words: Forwarded to the plotting helper.
    :param plot_name: Forwarded to the plotting helper as `plot_name`
        (presumably the output name of the figure -- confirm against the helper).
    """
    # Fall back to the primary threshold when no secondary threshold was given.
    secondary_threshold = (
        window_ps_threshold if window_ss_threshold is None else window_ss_threshold
    )
    _plot_multi_conversation_floors(
        corpus,
        convo_id_lst,
        window_ps_threshold,
        secondary_threshold,
        window_size,
        sliding_size,
        remove_first_last_utt,
        min_utt_words,
        plot_name=plot_name,
    )
67+
68+
69+
class TalkTimeSharing(Transformer):
    """
    The TalkTimeSharing transformer quantifies and annotates conversations' talk-time sharing dynamics
    between predefined speaker groups within a corpus.

    It assigns each conversation a primary speaker group (more talkative), a secondary
    speaker group (less talkative), and a scalar imbalance score. It also computes a
    list of windowed imbalance scores over sliding windows of the conversation to capture how the
    talk-time sharing distribution unfolds over time.

    Each utterance is expected to have a speaker group label under `utt.meta['utt_group']`,
    which can be precomputed or inferred from `convo.meta['speaker_groups']`.
    Annotation of speaker groups for each utterance is required before using the TalkTimeSharing transformer.
    The transform() function assumes either `convo.meta['speaker_groups']` or `utt.meta['utt_group']`
    is already present in the corpus for correct computation.

    :param primary_threshold: Minimum talk-time share to label a group as the primary speaker.
    :param window_ps_threshold: Talk-time share threshold for identifying dominance in a time window for primary speaker group.
    :param window_ss_threshold: Talk-time share threshold for identifying dominance in a time window for secondary speaker group. If not provided (None), defaults to `window_ps_threshold`.
    :param window_size: Length (in minutes) of each analysis window.
    :param sliding_size: Step size (in seconds) to slide the window forward.
    :param min_utt_words: Exclude utterances shorter than this number of words from the analysis.
    :param remove_first_last_utt: Whether to exclude the first and last utterance.
    """

    def __init__(
        self,
        primary_threshold=0.50001,
        window_ps_threshold=0.6,
        window_ss_threshold=None,
        window_size=2.5,
        sliding_size=30,
        min_utt_words=0,
        remove_first_last_utt=True,
    ):
        self.primary_threshold = primary_threshold
        self.window_ps_threshold = window_ps_threshold
        # Compare against None (not truthiness) so an explicit threshold of 0 is
        # honoured instead of being silently replaced by the primary threshold.
        self.window_ss_threshold = (
            window_ps_threshold if window_ss_threshold is None else window_ss_threshold
        )
        self.window_size = window_size
        self.sliding_size = sliding_size
        self.min_utt_words = min_utt_words
        self.remove_first_last_utt = remove_first_last_utt

    def transform(
        self, corpus: Corpus, selector: Callable[[Conversation], bool] = lambda convo: True
    ) -> Corpus:
        """
        Computes and annotates talk-time sharing information for each conversation in the corpus.

        If a randomly sampled utterance has no `utt_group` metadata, every utterance of the
        selected conversations is first labeled from `convo.meta['speaker_groups']`
        (a mapping indexed by speaker id).

        Each selected conversation is then annotated with `primary_speaker`, `secondary_speaker`,
        an overall conversation-level `balance_score`, and `balance_lst`, a list of windowed
        imbalance scores computed via sliding window analysis.

        :param corpus: Corpus to transform
        :param selector: (lambda) function selecting conversations to annotate

        :return: The input corpus where selected data is annotated with talk-time sharing dynamics information
        :raises ValueError: if speaker groups must be inferred and a selected conversation
            has no `speaker_groups` metadata
        """
        ### Annotate utterances with speaker group information
        # NOTE: only one random utterance is inspected, so a partially labeled corpus
        # may or may not trigger this pass.
        if "utt_group" not in corpus.random_utterance().meta:
            for convo in tqdm(
                corpus.iter_conversations(),
                desc="Annotating speaker groups based on `speaker_groups` from conversation metadata",
            ):
                if not selector(convo):
                    continue
                if "speaker_groups" not in convo.meta:
                    raise ValueError(
                        f"Missing 'speaker_groups' metadata in conversation {convo.id}, which is required for annotating utterances."
                    )
                speaker_groups_dict = convo.meta["speaker_groups"]
                for utt in convo.iter_utterances():
                    utt.meta["utt_group"] = speaker_groups_dict[utt.speaker.id]

        ### Annotate conversations with talk-time sharing information
        for convo in tqdm(
            corpus.iter_conversations(), desc="Annotating conversation talk-time sharing"
        ):
            if not selector(convo):
                continue

            primary_speaker = _get_ps(
                corpus,
                convo,
                self.remove_first_last_utt,
                self.min_utt_words,
                self.primary_threshold,
            )
            convo.meta["primary_speaker"] = primary_speaker

            # The secondary group is the complement of the primary one; any primary
            # label other than "groupB" maps to "groupB".
            if primary_speaker is None:
                convo.meta["secondary_speaker"] = None
            else:
                convo.meta["secondary_speaker"] = (
                    "groupA" if primary_speaker == "groupB" else "groupB"
                )

            convo.meta["balance_score"] = _convo_balance_score(
                corpus, convo.id, self.remove_first_last_utt, self.min_utt_words
            )
            convo.meta["balance_lst"] = _convo_balance_lst(
                corpus,
                convo.id,
                self.window_ps_threshold,
                self.window_ss_threshold,
                self.window_size,
                self.sliding_size,
                self.remove_first_last_utt,
                self.min_utt_words,
            )

        # Return the annotated corpus, as documented, so that
        # `corpus = transformer.transform(corpus)` and fit_transform() work.
        return corpus

    def fit_transform(
        self, corpus: Corpus, selector: Callable[[Conversation], bool] = lambda convo: True
    ) -> Corpus:
        """
        Same as transform.

        :param corpus: Corpus to transform
        :param selector: (lambda) function selecting conversations to annotate

        :return: The input corpus, as returned by transform()
        """
        return self.transform(corpus, selector=selector)

    def summarize(
        self,
        corpus: Corpus,
        selector: Callable[[Conversation], bool] = lambda convo: True,
        high_balance_threshold: float = 0.5,
        mid_balance_threshold: float = 0.55,
        low_balance_threshold: float = 0.65,
        dominating_throughout_threshold: float = 75.0,
        back_and_forth_threshold: float = 60.0,
        alter_dominance_threshold: float = 25.0,
    ) -> dict:
        """
        Summarizes the talk-time sharing dynamics across conversations in the corpus.

        Categorizes conversations into balance types (high_balance, mid_balance, low_balance) and
        more fine-grained talk-time sharing dynamics types introduced in the paper (dominating_throughout, back_and_forth, alter_dominance) based on configurable thresholds.

        If any selected conversation is missing required metadata (balance_score, balance_lst), the
        transformer is run on all selected conversations to annotate them before categorization.

        Balance thresholds are checked from low_balance down to high_balance, so a score is
        assigned to the first category whose threshold it meets; a score below
        `high_balance_threshold` is counted as `invalid`.

        :param corpus: Corpus to summarize
        :param selector: (lambda) function selecting conversations to include in the summary
        :param high_balance_threshold: Minimum balance score for high_balance category (default: 0.5)
        :param mid_balance_threshold: Minimum balance score for mid_balance category (default: 0.55)
        :param low_balance_threshold: Minimum balance score for low_balance category (default: 0.65)
        :param dominating_throughout_threshold: Minimum percentage of windows equal to 1, or equal to -1, for dominating_throughout type (default: 75.0)
        :param back_and_forth_threshold: Minimum percentage of windows equal to 0 for back_and_forth type (default: 60.0)
        :param alter_dominance_threshold: Percentage of windows equal to -1 that must be exceeded for alter_dominance type (default: 25.0)

        :return: Dictionary with keys `total_conversations`, `balance_types`, `triangle_types`
            and `parameters` (the thresholds used)
        """
        required_keys = ("balance_score", "balance_lst")

        # Initialize counters
        balance_counts = {"high_balance": 0, "mid_balance": 0, "low_balance": 0, "invalid": 0}
        triangle_counts = {
            "dominating_throughout": 0,
            "back_and_forth": 0,
            "alter_dominance": 0,
            "no_label": 0,
        }

        # If any selected conversation lacks annotations, (re)annotate all selected conversations
        needs_annotation = any(
            selector(convo) and any(key not in convo.meta for key in required_keys)
            for convo in corpus.iter_conversations()
        )
        if needs_annotation:
            self.transform(corpus, selector=selector)

        total_conversations = 0

        # Process each conversation
        for convo in corpus.iter_conversations():
            if not selector(convo):
                continue

            total_conversations += 1

            if any(key not in convo.meta for key in required_keys):
                balance_counts["invalid"] += 1
                triangle_counts["no_label"] += 1
                continue

            # Categorize by balance type
            balance_score = convo.meta["balance_score"]
            if balance_score >= low_balance_threshold:
                balance_counts["low_balance"] += 1
            elif balance_score >= mid_balance_threshold:
                balance_counts["mid_balance"] += 1
            elif balance_score >= high_balance_threshold:
                balance_counts["high_balance"] += 1
            else:
                balance_counts["invalid"] += 1

            # Categorize by triangle type
            balance_lst = convo.meta["balance_lst"]
            if not balance_lst:  # Empty balance list
                triangle_counts["no_label"] += 1
                continue

            # Share (in percent) of windows carrying each label
            n_windows = len(balance_lst)
            percent_ones = (balance_lst.count(1) / n_windows) * 100
            percent_neg_ones = (balance_lst.count(-1) / n_windows) * 100
            percent_zeros = (balance_lst.count(0) / n_windows) * 100

            # Check for dominating throughout (blue windows)
            if (
                percent_ones >= dominating_throughout_threshold
                or percent_neg_ones >= dominating_throughout_threshold
            ):
                triangle_counts["dominating_throughout"] += 1
            # Check for back and forth (gray windows)
            elif percent_zeros >= back_and_forth_threshold:
                triangle_counts["back_and_forth"] += 1
            # Check for alternating dominance (red windows); strict comparison
            elif percent_neg_ones > alter_dominance_threshold:
                triangle_counts["alter_dominance"] += 1
            else:
                triangle_counts["no_label"] += 1

        return {
            "total_conversations": total_conversations,
            "balance_types": balance_counts,
            "triangle_types": triangle_counts,
            "parameters": {
                "high_balance_threshold": high_balance_threshold,
                "mid_balance_threshold": mid_balance_threshold,
                "low_balance_threshold": low_balance_threshold,
                "dominating_throughout_threshold": dominating_throughout_threshold,
                "back_and_forth_threshold": back_and_forth_threshold,
                "alter_dominance_threshold": alter_dominance_threshold,
            },
        }

0 commit comments

Comments
 (0)