1+ import os
2+ import json
3+ import csv
4+ from datetime import datetime , timezone
5+ from dateutil import parser
6+
7+
class Exporter:
    """
    Handles all JSON/CSV export formats for Polis-compatible data.

    The exporter holds already-loaded votes, comments, math (clustering)
    data and conversation metadata, and writes them to a directory either
    as JSON dumps or as a set of CSV files.
    """

    # Numeric timestamps whose magnitude reaches this value are treated as
    # milliseconds. 10**11 seconds is roughly the year 5138, while 10**11
    # milliseconds is March 1973, so the two ranges do not overlap for
    # realistic data.
    _MS_THRESHOLD = 10**11

    def __init__(self, votes: list, comments: list, math_data: dict, conversation_data: dict, polis_instance_url: str):
        self.votes = votes
        self.comments = comments
        self.math = math_data
        self.conversation = conversation_data
        self.polis_instance_url = polis_instance_url

    # ---------------------------------------------------------
    # Public API
    # ---------------------------------------------------------
    def export(self, output_dir, format="csv"):
        """
        Export loaded data to files in the specified format.

        Args:
            output_dir (str): Directory path where files will be written.
                It is created if it does not exist.
            format (str): Export format, either "json" or "csv". Defaults to "csv".

        Raises:
            ValueError: If ``format`` is neither "json" nor "csv".

        The CSV format exports multiple files compatible with Polis platform:
        - votes.csv: Individual vote records
        - comments.csv: Statement/comment data with metadata
        - comment-groups.csv: Group-specific voting statistics per statement
        - participant-votes.csv: Participant voting patterns and group assignments
        - summary.csv: Conversation summary statistics
        """
        os.makedirs(output_dir, exist_ok=True)

        if format == "json":
            self._export_json(output_dir)
        elif format == "csv":
            self._export_csv(output_dir)
        else:
            raise ValueError(f"Unknown format: {format}")

    # ---------------------------------------------------------
    # JSON
    # ---------------------------------------------------------
    def _export_json(self, output_dir):
        """Dump each loaded dataset to its own JSON file in ``output_dir``."""
        self._write_json(output_dir, "votes.json", self.votes)
        self._write_json(output_dir, "comments.json", self.comments)
        self._write_json(output_dir, "math-pca2.json", self.math)
        self._write_json(output_dir, "conversation.json", self.conversation)

    def _write_json(self, output_dir, filename, data):
        """Write ``data`` as indented JSON; empty/missing data writes no file."""
        if not data:
            return
        path = os.path.join(output_dir, filename)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)

    # ---------------------------------------------------------
    # CSV
    # ---------------------------------------------------------
    def _export_csv(self, output_dir):
        """Write every CSV file of the export into ``output_dir``."""
        self._write_votes_csv(output_dir)
        self._write_comments_csv(output_dir)
        self._write_comment_groups_csv(output_dir)
        self._write_participant_votes_csv(output_dir)
        self._write_summary_csv(output_dir)

    @staticmethod
    def _quote_csv_text(text):
        """
        Return ``text`` as an always-quoted CSV field.

        Embedded double quotes are doubled. The result is meant to be
        written verbatim; it must NOT be passed through ``csv.writer``,
        which would escape the quotes a second time. ``None`` becomes an
        empty quoted field.
        """
        if text is None:
            text = ""
        return '"' + str(text).replace('"', '""') + '"'

    # ---------------------------------------------------------
    # Shared time formatter
    # ---------------------------------------------------------
    def _format_polis_times(self, value):
        """
        Convert a timestamp into the pair used by the CSV exports.

        Args:
            value: Either a number (epoch seconds, or epoch milliseconds when
                its magnitude is at least ``_MS_THRESHOLD``) or a date string
                understood by ``dateutil.parser.parse``. Parsed strings
                without a timezone are taken to be UTC.

        Returns:
            A ``(epoch_seconds, formatted)`` tuple where ``epoch_seconds`` is
            an int and ``formatted`` is the UTC time rendered as
            ``"%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"``.

        Raises:
            ValueError: If the value cannot be interpreted as a timestamp.
        """
        try:
            if isinstance(value, (int, float)):
                # Truncate fractional seconds, then scale milliseconds down.
                # Working on the number (not its string form) keeps short
                # floats and 12-digit millisecond values correct.
                seconds = int(value)
                if abs(seconds) >= self._MS_THRESHOLD:
                    seconds //= 1000
                dt = datetime.fromtimestamp(seconds, tz=timezone.utc)
            else:
                dt = parser.parse(value)
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)

            dt = dt.astimezone(timezone.utc)
            formatted = dt.strftime(
                "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"
            )
            return int(dt.timestamp()), formatted
        except (ValueError, TypeError, OverflowError, OSError) as e:
            raise ValueError(f"Invalid timestamp: {value}: {e}") from e

    # ---------------------------------------------------------
    # Votes CSV
    # ---------------------------------------------------------
    def _write_votes_csv(self, output_dir):
        """
        POLIS format:
        timestamp,datetime,comment-id,voter-id,vote

        Rows are ordered by (statement_id, participant_id). Nothing is
        written when there are no votes.
        """
        if not self.votes:
            return

        path = os.path.join(output_dir, "votes.csv")
        # newline="" lets the csv module control line endings itself.
        with open(path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(["timestamp", "datetime", "comment-id", "voter-id", "vote"])

            for v in sorted(self.votes, key=lambda x: (x["statement_id"], x["participant_id"])):
                ts, dt = self._format_polis_times(v["modified"])
                writer.writerow([ts, dt, v["statement_id"], v["participant_id"], v["vote"]])

    # ---------------------------------------------------------
    # Comments CSV
    # ---------------------------------------------------------
    def _write_comments_csv(self, output_dir):
        """
        POLIS format:
        timestamp,datetime,comment-id,author-id,agrees,disagrees,moderated,comment-body

        The comment body is always written as a quoted field with inner
        quotes doubled. Rows are ordered by (statement_id, participant_id).
        Nothing is written when there are no comments.
        """
        if not self.comments:
            return

        path = os.path.join(output_dir, "comments.csv")
        headers = [
            "timestamp",
            "datetime",
            "comment-id",
            "author-id",
            "agrees",
            "disagrees",
            "moderated",
            "comment-body",
        ]

        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(",".join(headers) + "\n")

            for c in sorted(self.comments, key=lambda x: (x["statement_id"], x["participant_id"])):
                ts, dt = self._format_polis_times(c["created"])
                fields = [
                    ts,
                    dt,
                    c["statement_id"],
                    c["participant_id"],
                    c["agree_count"],
                    c["disagree_count"],
                    c["moderated"],
                ]
                # The body is quoted exactly once here; routing a pre-quoted
                # string through csv.writer would escape it a second time.
                line = [str(item) for item in fields]
                line.append(self._quote_csv_text(c["txt"]))
                f.write(",".join(line) + "\n")

    # ---------------------------------------------------------
    # Comment Groups CSV
    # ---------------------------------------------------------
    def _write_comment_groups_csv(self, output_dir):
        """
        POLIS format:
        comment-id,comment,total-votes,total-agrees,total-disagrees,total-passes,group-a-votes,group-a-agrees,group-a-disagrees,group-a-passes,group-[next alphabetic identifier (b)]-votes,[repeat 'votes/agrees/disagrees/passes' with alphabetic identifier...]

        Each row represents a comment with total votes & votes by group.
        Nothing is written when comments or math data are missing.
        """
        if not self.comments or not self.math:
            return

        group_votes = self.math.get("group-votes", {})
        group_clusters = self.math.get("group-clusters", [])
        # Map group positions to letters: 0 -> 'a', 1 -> 'b', etc.
        group_letters = [chr(ord("a") + i) for i in range(len(group_clusters))]

        header = [
            "comment-id",
            "comment",
            "total-votes",
            "total-agrees",
            "total-disagrees",
            "total-passes",
        ]
        for letter in group_letters:
            header.extend(
                [
                    f"group-{letter}-votes",
                    f"group-{letter}-agrees",
                    f"group-{letter}-disagrees",
                    f"group-{letter}-passes",
                ]
            )

        path = os.path.join(output_dir, "comment-groups.csv")
        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(",".join(header) + "\n")

            for comment in sorted(self.comments, key=lambda x: x["statement_id"]):
                comment_id = str(comment["statement_id"])
                row = [
                    comment_id,
                    # Always escape and quote: handles empty text and text
                    # containing commas or double quotes.
                    self._quote_csv_text(comment["txt"]),
                    comment["count"],
                    comment["agree_count"],
                    comment["disagree_count"],
                    comment["pass_count"],
                ]

                # Add group-specific data; lookups use string keys.
                for group in group_clusters:
                    per_group = group_votes.get(str(group["id"]), {}).get("votes", {})
                    tally = per_group.get(comment_id)
                    if tally is None:
                        # No votes from this group for this comment
                        row.extend([0, 0, 0, 0])
                    else:
                        row.extend(
                            [
                                tally["A"] + tally["D"] + tally["S"],
                                tally["A"],  # agrees
                                tally["D"],  # disagrees
                                tally["S"],  # passes (skips)
                            ]
                        )
                f.write(",".join(str(item) for item in row) + "\n")

    # ---------------------------------------------------------
    # Participant Votes CSV
    # ---------------------------------------------------------
    def _write_participant_votes_csv(self, output_dir):
        """
        POLIS format:
        participant,group-id,n-comments,n-votes,n-agree,n-disagree,0,1,2,3,...

        Each row represents a participant with:
        - participant: participant ID
        - group-id: which group they belong to (if any)
        - n-comments: number of comments they made
        - n-votes: total number of votes they cast
        - n-agree: number of agree votes
        - n-disagree: number of disagree votes
        - 0,1,2,3...: their vote on each comment (1=agree, -1=disagree, 0=pass, empty=no vote)

        Only participants and statements that appear in the votes are
        listed. Nothing is written when there are no votes.
        """
        if not self.votes:
            return

        # Build participant -> {statement -> vote}; a later vote on the same
        # statement overwrites an earlier one.
        participant_votes = {}
        statement_ids = set()
        for vote in self.votes:
            sid = vote["statement_id"]
            statement_ids.add(sid)
            participant_votes.setdefault(vote["participant_id"], {})[sid] = vote["vote"]

        # Sort to ensure consistent order
        sorted_participant_ids = sorted(participant_votes)
        sorted_statement_ids = sorted(statement_ids)

        # Get participant group assignments from math data.
        # NOTE(review): this treats each cluster's "members" as participant
        # IDs - confirm against the shape of the math data being loaded.
        participant_groups = {}
        if self.math and "group-clusters" in self.math:
            for group in self.math["group-clusters"]:
                for member in group["members"]:
                    participant_groups[member] = group["id"]

        # Count comments per participant
        participant_comment_counts = {}
        for comment in self.comments or []:
            author = comment["participant_id"]
            participant_comment_counts[author] = participant_comment_counts.get(author, 0) + 1

        header = [
            "participant",
            "group-id",
            "n-comments",
            "n-votes",
            "n-agree",
            "n-disagree",
        ]
        header.extend(str(sid) for sid in sorted_statement_ids)

        path = os.path.join(output_dir, "participant-votes.csv")
        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(",".join(header) + "\n")

            for pid in sorted_participant_ids:
                cast = participant_votes[pid]
                ballots = list(cast.values())
                row = [
                    pid,
                    participant_groups.get(pid, ""),
                    participant_comment_counts.get(pid, 0),
                    len(ballots),
                    sum(1 for v in ballots if v == 1),
                    sum(1 for v in ballots if v == -1),
                ]
                # One column per statement; empty when the participant
                # did not vote on it.
                row.extend(cast.get(sid, "") for sid in sorted_statement_ids)
                f.write(",".join(str(item) for item in row) + "\n")

    # ---------------------------------------------------------
    # Summary CSV
    # ---------------------------------------------------------
    def _write_summary_csv(self, output_dir):
        """
        POLIS format:
        topic,[string]
        url,http://pol.is/[report_id]
        voters,[num]
        voters-in-conv,[num]
        commenters,[num]
        comments,[num]
        groups,[num]
        conversation-description,[string]

        ``voters`` and ``voters-in-conv`` are both the number of distinct
        participants found in the votes. Nothing is written when there is
        no conversation data.
        """
        if not self.conversation:
            return

        # Calculate summary statistics
        total_voters = len({vote["participant_id"] for vote in self.votes}) if self.votes else 0
        total_commenters = (
            len({comment["participant_id"] for comment in self.comments})
            if self.comments
            else 0
        )
        total_comments = len(self.comments) if self.comments else 0
        total_groups = len(self.math.get("group-clusters", [])) if self.math else 0

        # Get conversation details; a key present with a None value is
        # treated like a missing one.
        topic = self.conversation.get("topic") or ""
        description = self.conversation.get("description") or ""
        # Keep the description on one physical line by writing control
        # characters as their backslash escapes.
        description = (
            description.replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\t", "\\t")
        )

        # Build URL; strip a trailing slash so the join never yields "//".
        conversation_id = self.conversation.get("conversation_id", "")
        base_url = str(self.polis_instance_url).rstrip("/")
        url = f"{base_url}/{conversation_id}"

        path = os.path.join(output_dir, "summary.csv")
        with open(path, "w", encoding="utf-8", newline="") as f:
            f.write(f"topic,{self._quote_csv_text(topic)}\n")
            f.write(f"url,{url}\n")
            f.write(f"voters,{total_voters}\n")
            f.write(f"voters-in-conv,{total_voters}\n")
            f.write(f"commenters,{total_commenters}\n")
            f.write(f"comments,{total_comments}\n")
            f.write(f"groups,{total_groups}\n")
            f.write(f"conversation-description,{self._quote_csv_text(description)}\n")
0 commit comments