Skip to content

Commit 95a9847

Browse files
authored
Merge pull request #110 from polis-community/abstract-exporter
Abstract exporter from data loader into own module
2 parents 2a851d6 + d0bd0ee commit 95a9847

2 files changed

Lines changed: 413 additions & 388 deletions

File tree

reddwarf/data_exporter.py

Lines changed: 386 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,386 @@
1+
import os
2+
import json
3+
import csv
4+
from datetime import datetime, timezone
5+
from dateutil import parser
6+
7+
8+
class Exporter:
    """
    Handles all JSON/CSV export formats for Polis-compatible data.

    The exporter only reads the data handed to its constructor and writes
    files; it performs no fetching or computation of the underlying math.

    Attributes:
        votes: Vote records. Each record is read through the keys
            "statement_id", "participant_id", "vote" and "modified".
        comments: Comment records. Each record is read through the keys
            "statement_id", "participant_id", "txt", "created", "moderated",
            "count", "agree_count", "disagree_count" and "pass_count".
        math: Math data; only "group-clusters" and "group-votes" are read.
        conversation: Conversation metadata; only "topic", "description"
            and "conversation_id" are read.
        polis_instance_url: Base URL used to build the conversation URL in
            summary.csv.
    """

    def __init__(
        self,
        votes: list,
        comments: list,
        math_data: dict,
        conversation_data: dict,
        polis_instance_url: str,
    ):
        self.votes = votes
        self.comments = comments
        self.math = math_data
        self.conversation = conversation_data
        self.polis_instance_url = polis_instance_url

    # ---------------------------------------------------------
    # Public API
    # ---------------------------------------------------------
    def export(self, output_dir, format="csv"):
        """
        Export loaded data to files in the specified format.

        Args:
            output_dir (str): Directory path where files will be written.
                It is created if it does not exist yet.
            format (str): Export format, either "json" or "csv". Defaults to "csv".

        Raises:
            ValueError: If `format` is neither "json" nor "csv".

        The CSV format exports multiple files compatible with Polis platform:
        - votes.csv: Individual vote records
        - comments.csv: Statement/comment data with metadata
        - comment-groups.csv: Group-specific voting statistics per statement
        - participant-votes.csv: Participant voting patterns and group assignments
        - summary.csv: Conversation summary statistics

        Each file is skipped when the data it depends on is empty.
        """
        os.makedirs(output_dir, exist_ok=True)

        if format == "json":
            self._export_json(output_dir)
        elif format == "csv":
            self._export_csv(output_dir)
        else:
            raise ValueError(f"Unknown format: {format}")

    # ---------------------------------------------------------
    # JSON
    # ---------------------------------------------------------
    def _export_json(self, output_dir):
        """Write votes, comments, math and conversation data as JSON files."""
        self._write_json(output_dir, "votes.json", self.votes)
        self._write_json(output_dir, "comments.json", self.comments)
        self._write_json(output_dir, "math-pca2.json", self.math)
        self._write_json(output_dir, "conversation.json", self.conversation)

    def _write_json(self, output_dir, filename, data):
        """Dump `data` to `output_dir/filename`; falsy data writes no file."""
        if not data:
            return
        path = os.path.join(output_dir, filename)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)

    # ---------------------------------------------------------
    # CSV
    # ---------------------------------------------------------
    def _export_csv(self, output_dir):
        """Write every CSV file of the export into `output_dir`."""
        self._write_votes_csv(output_dir)
        self._write_comments_csv(output_dir)
        self._write_comment_groups_csv(output_dir)
        self._write_participant_votes_csv(output_dir)
        self._write_summary_csv(output_dir)

    # ---------------------------------------------------------
    # Shared time formatter
    # ---------------------------------------------------------
    def _format_polis_times(self, value):
        """
        Convert a timestamp into the pair used by the CSV exports.

        Args:
            value: Either a number (epoch timestamp) or a date string
                understood by `dateutil.parser.parse`.

        Returns:
            tuple: `(epoch_seconds, formatted)` where `formatted` is the UTC
            time rendered as
            "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)".

        Raises:
            ValueError: If the value cannot be interpreted as a timestamp.
                The original error is chained as the cause.
        """
        try:
            if isinstance(value, (int, float)):
                # Only the first 10 characters of the number are kept, which
                # turns a 13-digit millisecond epoch into seconds.
                ts = int(str(value)[:10])
                dt = datetime.fromtimestamp(ts, tz=timezone.utc)
            else:
                dt = parser.parse(value)
                # Naive datetimes are taken to be UTC.
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)

            dt = dt.astimezone(timezone.utc)
            formatted = dt.strftime(
                "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"
            )
            return int(dt.timestamp()), formatted
        except Exception as e:
            raise ValueError(f"Invalid timestamp: {value}: {e}") from e

    # ---------------------------------------------------------
    # Votes CSV
    # ---------------------------------------------------------
    def _write_votes_csv(self, output_dir):
        """
        POLIS format:
            timestamp,datetime,comment-id,voter-id,vote

        Rows are ordered by (statement_id, participant_id). Nothing is
        written when there are no votes.
        """
        if not self.votes:
            return

        path = os.path.join(output_dir, "votes.csv")
        # newline="" lets the csv module control line endings itself, as its
        # documentation requires (avoids "\r\r\n" on Windows).
        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "datetime", "comment-id", "voter-id", "vote"])

            for v in sorted(self.votes, key=lambda x: (x["statement_id"], x["participant_id"])):
                ts, dt = self._format_polis_times(v["modified"])
                writer.writerow([ts, dt, v["statement_id"], v["participant_id"], v["vote"]])

    # ---------------------------------------------------------
    # Comments CSV
    # ---------------------------------------------------------
    def _write_comments_csv(self, output_dir):
        """
        POLIS format:
            timestamp,datetime,comment-id,author-id,agrees,disagrees,moderated,comment-body

        Rows are ordered by (statement_id, participant_id). Nothing is
        written when there are no comments.
        """
        if not self.comments:
            return

        path = os.path.join(output_dir, "comments.csv")
        headers = [
            "timestamp",
            "datetime",
            "comment-id",
            "author-id",
            "agrees",
            "disagrees",
            "moderated",
            "comment-body",
        ]

        with open(path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(headers)

            for c in sorted(self.comments, key=lambda x: (x["statement_id"], x["participant_id"])):
                ts, dt = self._format_polis_times(c["created"])
                writer.writerow([
                    ts,
                    dt,
                    c["statement_id"],
                    c["participant_id"],
                    c["agree_count"],
                    c["disagree_count"],
                    c["moderated"],
                    # The raw text is handed to csv.writer, which quotes and
                    # escapes it when needed. Quoting it by hand first would
                    # escape it twice and the body would read back wrapped
                    # in literal quote characters.
                    c["txt"],
                ])

    # ---------------------------------------------------------
    # Comment Groups CSV
    # ---------------------------------------------------------
    def _write_comment_groups_csv(self, output_dir):
        """
        POLIS format:
            comment-id,comment,total-votes,total-agrees,total-disagrees,total-passes,group-a-votes,group-a-agrees,group-a-disagrees,group-a-passes,group-[next alphabetic identifier (b)]-votes,[repeat 'votes/agrees/disagrees/passes' with alphabetic identifier...]

        Each row represents a comment with total votes & votes by group.
        Nothing is written when comments or math data are missing.
        """
        if not self.comments or not self.math:
            return

        group_votes = self.math.get("group-votes", {})
        group_clusters = self.math.get("group-clusters", [])
        # Map group indices to letters: 0 -> 'a', 1 -> 'b', etc.
        group_letters = [chr(ord("a") + i) for i in range(len(group_clusters))]

        # Build header dynamically based on available groups
        header = [
            "comment-id",
            "comment",
            "total-votes",
            "total-agrees",
            "total-disagrees",
            "total-passes",
        ]
        for group_letter in group_letters:
            header.extend(
                [
                    f"group-{group_letter}-votes",
                    f"group-{group_letter}-agrees",
                    f"group-{group_letter}-disagrees",
                    f"group-{group_letter}-passes",
                ]
            )

        path = os.path.join(output_dir, "comment-groups.csv")
        with open(path, "w", encoding="utf-8") as f:
            f.write(",".join(header))
            f.write("\n")

            for comment in sorted(self.comments, key=lambda x: x["statement_id"]):
                comment_id = str(comment["statement_id"])

                text = comment["txt"]
                # Text that already starts with a quote is written through
                # untouched. Anything else (including an empty string) is
                # wrapped in quotes with embedded quotes doubled so the
                # field stays valid CSV.
                if not text.startswith('"'):
                    text = '"' + text.replace('"', '""') + '"'

                row = [
                    comment_id,
                    text,
                    comment["count"],
                    comment["agree_count"],
                    comment["disagree_count"],
                    comment["pass_count"],
                ]

                # Add group-specific data
                for group in group_clusters:
                    # group-votes is looked up by the stringified group id
                    # and the stringified statement id.
                    group_id = str(group["id"])
                    if (
                        group_id in group_votes
                        and comment_id in group_votes[group_id]["votes"]
                    ):
                        vote_data = group_votes[group_id]["votes"][comment_id]
                        total_votes = (
                            vote_data["A"] + vote_data["D"] + vote_data["S"]
                        )
                        row.extend(
                            [
                                total_votes,
                                vote_data["A"],  # agrees
                                vote_data["D"],  # disagrees
                                vote_data["S"],  # passes (skips)
                            ]
                        )
                    else:
                        # No votes from this group for this comment
                        row.extend([0, 0, 0, 0])

                f.write(",".join(str(item) for item in row) + "\n")

    # ---------------------------------------------------------
    # Participant Votes CSV
    # ---------------------------------------------------------
    def _write_participant_votes_csv(self, output_dir):
        """
        POLIS format:
            participant,group-id,n-comments,n-votes,n-agree,n-disagree,0,1,2,3,...

        Each row represents a participant with:
        - participant: participant ID
        - group-id: which group they belong to (if any)
        - n-comments: number of comments they made
        - n-votes: total number of votes they cast
        - n-agree: number of agree votes
        - n-disagree: number of disagree votes
        - 0,1,2,3...: their vote on each comment (1=agree, -1=disagree, 0=pass, empty=no vote)

        Only participants and statements that appear in the votes are
        listed. Nothing is written when there are no votes.
        """
        if not self.votes:
            return

        # Build participant vote matrix: participant -> {statement: vote}.
        # A later record for the same pair overwrites an earlier one.
        participant_votes = {}
        for vote in self.votes:
            by_statement = participant_votes.setdefault(vote["participant_id"], {})
            by_statement[vote["statement_id"]] = vote["vote"]

        # Sort to ensure consistent order
        sorted_participant_ids = sorted(participant_votes)
        sorted_statement_ids = sorted({vote["statement_id"] for vote in self.votes})

        # Get participant group assignments from math data
        participant_groups = {}
        if self.math and "group-clusters" in self.math:
            for group in self.math["group-clusters"]:
                for member in group["members"]:
                    participant_groups[member] = group["id"]

        # Count comments per participant
        participant_comment_counts = {}
        if self.comments:
            for comment in self.comments:
                pid = comment["participant_id"]
                participant_comment_counts[pid] = (
                    participant_comment_counts.get(pid, 0) + 1
                )

        path = os.path.join(output_dir, "participant-votes.csv")
        with open(path, "w", encoding="utf-8") as f:
            # Build header
            header = [
                "participant",
                "group-id",
                "n-comments",
                "n-votes",
                "n-agree",
                "n-disagree",
            ]
            header.extend(str(sid) for sid in sorted_statement_ids)
            f.write(",".join(header) + "\n")

            # Write participant data
            for pid in sorted_participant_ids:
                participant_vote_data = participant_votes[pid]

                # Count votes
                n_votes = len(participant_vote_data)
                n_agree = sum(1 for v in participant_vote_data.values() if v == 1)
                n_disagree = sum(1 for v in participant_vote_data.values() if v == -1)

                row = [
                    pid,
                    participant_groups.get(pid, ""),
                    participant_comment_counts.get(pid, 0),
                    n_votes,
                    n_agree,
                    n_disagree,
                ]

                # Add vote for each statement (empty when not voted on)
                row.extend(participant_vote_data.get(sid, "") for sid in sorted_statement_ids)

                f.write(",".join(str(item) for item in row) + "\n")

    # ---------------------------------------------------------
    # Summary CSV
    # ---------------------------------------------------------
    def _write_summary_csv(self, output_dir):
        """
        POLIS format:
            topic,[string]
            url,http://pol.is/[report_id]
            voters,[num]
            voters-in-conv,[num]
            commenters,[num]
            comments,[num]
            groups,[num]
            conversation-description,[string]

        Nothing is written when conversation data is missing. Both
        "voters" and "voters-in-conv" carry the same number: the count of
        distinct participants found in the votes.
        """
        if not self.conversation:
            return

        # Calculate summary statistics
        total_voters = (
            len({vote["participant_id"] for vote in self.votes})
            if self.votes
            else 0
        )
        total_commenters = (
            len({comment["participant_id"] for comment in self.comments})
            if self.comments
            else 0
        )
        total_comments = len(self.comments) if self.comments else 0
        total_groups = (
            len(self.math.get("group-clusters", [])) if self.math else 0
        )

        # Get conversation details
        topic = self.conversation.get("topic", "")
        description = self.conversation.get("description", "")
        if description:
            # Keep the description on one physical line by writing control
            # characters as literal backslash escapes.
            description = (
                description.replace("\n", "\\n")
                .replace("\r", "\\r")
                .replace("\t", "\\t")
            )

        # Build URL
        conversation_id = self.conversation.get("conversation_id", "")
        url = f"{self.polis_instance_url}/{conversation_id}"

        # NOTE(review): quote characters inside topic/description are written
        # as-is and are not escaped here.
        path = os.path.join(output_dir, "summary.csv")
        with open(path, "w", encoding="utf-8") as f:
            f.write(f'topic,"{topic}"\n')
            f.write(f"url,{url}\n")
            f.write(f"voters,{total_voters}\n")
            f.write(f"voters-in-conv,{total_voters}\n")
            f.write(f"commenters,{total_commenters}\n")
            f.write(f"comments,{total_comments}\n")
            f.write(f"groups,{total_groups}\n")
            f.write(f'conversation-description,"{description}"\n')

0 commit comments

Comments
 (0)