|
| 1 | +from convokit.model import Corpus |
| 2 | +from convokit.transformer import Transformer |
| 3 | +from tqdm import tqdm |
| 4 | +from typing import Callable |
| 5 | +from convokit.model.conversation import Conversation |
| 6 | +import re |
| 7 | + |
| 8 | +from .talktimesharing_util import ( |
| 9 | + _get_ps, |
| 10 | + _convo_balance_score, |
| 11 | + _convo_balance_lst, |
| 12 | + _plot_individual_conversation_floors, |
| 13 | + _plot_multi_conversation_floors, |
| 14 | +) |
| 15 | + |
| 16 | + |
def plot_single_conversation_balance(
    corpus,
    convo_id,
    window_ps_threshold,
    window_size,
    sliding_size,
    remove_first_last_utt,
    min_utt_words,
    plot_name=None,
    window_ss_threshold=None,
):
    """
    Plot the talk-time sharing floors of a single conversation.

    Thin wrapper that resolves the secondary-speaker threshold and then hands
    every argument, unchanged, to ``_plot_individual_conversation_floors``.

    :param corpus: Corpus containing the conversation to plot.
    :param convo_id: ID of the conversation to plot.
    :param window_ps_threshold: Windowed threshold for the primary speaker group.
    :param window_size: Window size forwarded to the plotting helper.
    :param sliding_size: Sliding step forwarded to the plotting helper.
    :param remove_first_last_utt: Forwarded to the plotting helper.
    :param min_utt_words: Forwarded to the plotting helper.
    :param plot_name: Optional plot name forwarded to the plotting helper.
    :param window_ss_threshold: Windowed threshold for the secondary speaker group.
        When ``None``, ``window_ps_threshold`` is used instead.
    """
    # An unspecified secondary threshold mirrors the primary one.
    ss_threshold = window_ps_threshold if window_ss_threshold is None else window_ss_threshold

    # Positional order expected by the helper: the two thresholds come first,
    # then the windowing settings, then the utterance filters.
    floor_args = (
        corpus,
        convo_id,
        window_ps_threshold,
        ss_threshold,
        window_size,
        sliding_size,
        remove_first_last_utt,
        min_utt_words,
    )
    _plot_individual_conversation_floors(*floor_args, plot_name=plot_name)
| 41 | + |
| 42 | + |
def plot_multi_conversation_balance(
    corpus,
    convo_id_lst,
    window_ps_threshold,
    window_ss_threshold,
    window_size,
    sliding_size,
    remove_first_last_utt,
    min_utt_words,
    plot_name=None,
):
    """
    Plot the talk-time sharing floors of several conversations.

    Thin wrapper that resolves the secondary-speaker threshold and then hands
    every argument, unchanged, to ``_plot_multi_conversation_floors``.

    :param corpus: Corpus containing the conversations to plot.
    :param convo_id_lst: IDs of the conversations to plot.
    :param window_ps_threshold: Windowed threshold for the primary speaker group.
    :param window_ss_threshold: Windowed threshold for the secondary speaker group.
        Pass ``None`` to reuse ``window_ps_threshold``.
    :param window_size: Window size forwarded to the plotting helper.
    :param sliding_size: Sliding step forwarded to the plotting helper.
    :param remove_first_last_utt: Forwarded to the plotting helper.
    :param min_utt_words: Forwarded to the plotting helper.
    :param plot_name: Optional plot name forwarded to the plotting helper.
    """
    # An explicit None for the secondary threshold mirrors the primary one.
    ss_threshold = window_ps_threshold if window_ss_threshold is None else window_ss_threshold

    floor_args = (
        corpus,
        convo_id_lst,
        window_ps_threshold,
        ss_threshold,
        window_size,
        sliding_size,
        remove_first_last_utt,
        min_utt_words,
    )
    _plot_multi_conversation_floors(*floor_args, plot_name=plot_name)
| 67 | + |
| 68 | + |
class TalkTimeSharing(Transformer):
    """
    The TalkTimeSharing transformer quantifies and annotates conversations' talk-time sharing dynamics
    between predefined speaker groups within a corpus.

    It assigns each conversation a primary speaker group (more talkative), a secondary
    speaker group (less talkative), and a scalar imbalance score. It also computes a
    list of windowed imbalance scores over sliding windows of the conversation to capture how the
    talk-time sharing distribution unfolds over time.

    Each utterance is expected to have a speaker group label under `utt.meta['utt_group']`,
    which can be precomputed or inferred from `convo.meta['speaker_groups']`.
    Annotation of speaker groups for each utterance is required before using the TalkTimeSharing transformer.
    The transform() function assumes either `convo.meta['speaker_groups']` or `utt.meta['utt_group']`
    is already present in the corpus for correct computation.

    :param primary_threshold: Minimum talk-time share to label a group as the primary speaker.
    :param window_ps_threshold: Talk-time share threshold for identifying dominance in a time window for primary speaker group.
    :param window_ss_threshold: Talk-time share threshold for identifying dominance in a time window for secondary speaker group. If not provided (``None``), defaults to `window_ps_threshold`.
    :param window_size: Length (in minutes) of each analysis window.
    :param sliding_size: Step size (in seconds) to slide the window forward.
    :param min_utt_words: Exclude utterances shorter than this number of words from the analysis.
    :param remove_first_last_utt: Whether to exclude the first and last utterance.
    """

    def __init__(
        self,
        primary_threshold=0.50001,
        window_ps_threshold=0.6,
        window_ss_threshold=None,
        window_size=2.5,
        sliding_size=30,
        min_utt_words=0,
        remove_first_last_utt=True,
    ):
        self.primary_threshold = primary_threshold
        self.window_ps_threshold = window_ps_threshold
        # Compare against None rather than truthiness so that an explicit
        # threshold of 0 is honoured instead of being silently replaced.
        self.window_ss_threshold = (
            window_ps_threshold if window_ss_threshold is None else window_ss_threshold
        )
        self.window_size = window_size
        self.sliding_size = sliding_size
        self.min_utt_words = min_utt_words
        self.remove_first_last_utt = remove_first_last_utt

    def transform(
        self, corpus: Corpus, selector: Callable[[Conversation], bool] = lambda convo: True
    ) -> Corpus:
        """
        Computes and annotates talk-time sharing information for each conversation in the corpus.

        If the utterances carry no `utt_group` metadata, each utterance of the selected conversations
        is first labeled from `convo.meta['speaker_groups']` (a mapping from speaker id to group).

        Each selected conversation is then annotated with `primary_speaker`, `secondary_speaker`,
        an overall conversation-level `balance_score`, and `balance_lst`, a list of windowed
        imbalance scores computed via sliding window analysis.

        :param corpus: Corpus to transform
        :param selector: (lambda) function selecting conversations to annotate

        :return: The input corpus where selected data is annotated with talk-time sharing dynamics information
        :raises ValueError: if speaker groups must be inferred and a selected conversation
            has no `speaker_groups` metadata
        """
        ### Annotate utterances with speaker group information
        # NOTE(review): presence of `utt_group` is probed on one randomly sampled
        # utterance only, so a partially annotated corpus is not detected reliably.
        if "utt_group" not in corpus.random_utterance().meta:
            for convo in tqdm(
                corpus.iter_conversations(),
                desc="Annotating speaker groups based on `speaker_groups` from conversation metadata",
            ):
                if not selector(convo):
                    continue
                if "speaker_groups" not in convo.meta:
                    raise ValueError(
                        f"Missing 'speaker_groups' metadata in conversation {convo.id}, which is required for annotating utterances."
                    )
                speaker_groups_dict = convo.meta["speaker_groups"]
                for utt in convo.iter_utterances():
                    utt.meta["utt_group"] = speaker_groups_dict[utt.speaker.id]

        ### Annotate conversations with talk-time sharing information
        for convo in tqdm(
            corpus.iter_conversations(), desc="Annotating conversation talk-time sharing"
        ):
            if not selector(convo):
                continue

            primary_speaker = _get_ps(
                corpus,
                convo,
                self.remove_first_last_utt,
                self.min_utt_words,
                self.primary_threshold,
            )
            convo.meta["primary_speaker"] = primary_speaker

            # The secondary group is simply "the other one"; without a primary
            # group there is no secondary group either.
            if primary_speaker is None:
                convo.meta["secondary_speaker"] = None
            else:
                convo.meta["secondary_speaker"] = (
                    "groupA" if primary_speaker == "groupB" else "groupB"
                )

            convo.meta["balance_score"] = _convo_balance_score(
                corpus, convo.id, self.remove_first_last_utt, self.min_utt_words
            )
            convo.meta["balance_lst"] = _convo_balance_lst(
                corpus,
                convo.id,
                self.window_ps_threshold,
                self.window_ss_threshold,
                self.window_size,
                self.sliding_size,
                self.remove_first_last_utt,
                self.min_utt_words,
            )

        # Transformers hand the annotated corpus back so calls can be chained;
        # fit_transform() relies on this return value as well.
        return corpus

    def fit_transform(
        self, corpus: Corpus, selector: Callable[[Conversation], bool] = lambda convo: True
    ) -> Corpus:
        """
        Same as transform.

        :param corpus: Corpus to transform
        :param selector: (lambda) function selecting conversations to annotate

        :return: The input corpus where selected data is annotated with talk-time sharing dynamics information
        """
        return self.transform(corpus, selector=selector)

    @staticmethod
    def _balance_category(balance_score, high_threshold, mid_threshold, low_threshold):
        """Map a conversation-level balance score to its balance-type key."""
        # A missing score cannot be ordered against the thresholds; count it as
        # invalid instead of raising TypeError on the comparison below.
        if balance_score is None:
            return "invalid"
        if balance_score >= low_threshold:
            return "low_balance"
        if balance_score >= mid_threshold:
            return "mid_balance"
        if balance_score >= high_threshold:
            return "high_balance"
        return "invalid"

    @staticmethod
    def _dynamics_category(
        balance_lst, dominating_threshold, back_and_forth_threshold, alter_threshold
    ):
        """Map a list of windowed scores (values 1, 0, -1) to its dynamics-type key."""
        if not balance_lst:  # Empty (or missing) balance list
            return "no_label"

        n_windows = len(balance_lst)
        percent_ones = (balance_lst.count(1) / n_windows) * 100
        percent_neg_ones = (balance_lst.count(-1) / n_windows) * 100
        percent_zeros = balance_lst.count(0) / n_windows * 100

        # Check for dominating throughout (blue windows)
        if percent_ones >= dominating_threshold or percent_neg_ones >= dominating_threshold:
            return "dominating_throughout"
        # Check for back and forth (gray windows)
        if percent_zeros >= back_and_forth_threshold:
            return "back_and_forth"
        # Check for alternating dominance (red windows); note the strict comparison
        if percent_neg_ones > alter_threshold:
            return "alter_dominance"
        return "no_label"

    def summarize(
        self,
        corpus: Corpus,
        selector: Callable[[Conversation], bool] = lambda convo: True,
        high_balance_threshold: float = 0.5,
        mid_balance_threshold: float = 0.55,
        low_balance_threshold: float = 0.65,
        dominating_throughout_threshold: float = 75.0,
        back_and_forth_threshold: float = 60.0,
        alter_dominance_threshold: float = 25.0,
    ):
        """
        Summarizes the talk-time sharing dynamics across conversations in the corpus.

        Categorizes conversations into balance types (high_balance, mid_balance, low_balance) and
        more fine-grained talk-time sharing dynamics types introduced in the paper
        (dominating_throughout, back_and_forth, alter_dominance) based on configurable thresholds.

        If any selected conversation is missing required metadata (balance_score, balance_lst), the
        transformer is run on all selected conversations to annotate them before categorization.

        :param corpus: Corpus to summarize
        :param selector: (lambda) function selecting conversations to include in the summary
        :param high_balance_threshold: Minimum balance score for high_balance category (default: 0.5)
        :param mid_balance_threshold: Minimum balance score for mid_balance category (default: 0.55)
        :param low_balance_threshold: Minimum balance score for low_balance category (default: 0.65)
        :param dominating_throughout_threshold: Percentage threshold for dominating_throughout type (default: 75.0)
        :param back_and_forth_threshold: Percentage threshold for back_and_forth type (default: 60.0)
        :param alter_dominance_threshold: Percentage threshold for alter_dominance type (default: 25.0)

        :return: Dictionary with the number of selected conversations (`total_conversations`), the counts
            per balance type (`balance_types`), the counts per dynamics type (`triangle_types`), and the
            thresholds that were used (`parameters`)
        """
        required_keys = ("balance_score", "balance_lst")

        # Initialize counters
        balance_counts = {"high_balance": 0, "mid_balance": 0, "low_balance": 0, "invalid": 0}
        triangle_counts = {
            "dominating_throughout": 0,
            "back_and_forth": 0,
            "alter_dominance": 0,
            "no_label": 0,
        }

        # If any selected conversation lacks the annotations, (re)run the
        # transformer on the selected conversations.
        needs_annotation = any(
            selector(convo) and any(key not in convo.meta for key in required_keys)
            for convo in corpus.iter_conversations()
        )
        if needs_annotation:
            self.transform(corpus, selector=selector)

        total_conversations = 0

        # Process each conversation
        for convo in corpus.iter_conversations():
            if not selector(convo):
                continue

            total_conversations += 1

            if any(key not in convo.meta for key in required_keys):
                balance_counts["invalid"] += 1
                triangle_counts["no_label"] += 1
                continue

            # Categorize by balance type
            balance_key = self._balance_category(
                convo.meta["balance_score"],
                high_balance_threshold,
                mid_balance_threshold,
                low_balance_threshold,
            )
            balance_counts[balance_key] += 1

            # Categorize by triangle type
            triangle_key = self._dynamics_category(
                convo.meta["balance_lst"],
                dominating_throughout_threshold,
                back_and_forth_threshold,
                alter_dominance_threshold,
            )
            triangle_counts[triangle_key] += 1

        return {
            "total_conversations": total_conversations,
            "balance_types": balance_counts,
            "triangle_types": triangle_counts,
            "parameters": {
                "high_balance_threshold": high_balance_threshold,
                "mid_balance_threshold": mid_balance_threshold,
                "low_balance_threshold": low_balance_threshold,
                "dominating_throughout_threshold": dominating_throughout_threshold,
                "back_and_forth_threshold": back_and_forth_threshold,
                "alter_dominance_threshold": alter_dominance_threshold,
            },
        }
0 commit comments