-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy patharchiver.py
executable file
·165 lines (157 loc) · 6.23 KB
/
archiver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import praw
import snudown
import datetime
import time
import re
import sys
from requests.exceptions import HTTPError
"""
Customization Configuration
"""
# Post that is archived when no ID is supplied on the command line. #
postID = '15zmjl'
# Prefix of the output file; the post ID and '.html' are appended below. #
outputFilePath = './'
# Location of the stylesheet, relative to where the html file will be stored. #
pathToCSS = 'css/style.css'
"""
Reddit Post Archiver
By Samuel Johnson Stoever
"""
# Only a single command-line argument overrides the default post ID;
# any other argument count keeps the default and says why.
if len(sys.argv) > 2:
    print('Too Many Arguments. Using default postID.')
elif len(sys.argv) == 1:
    print('No post ID was provided. Using default postID.')
else:
    postID = sys.argv[1]
outputFilePath += postID + '.html'
# English month names, indexed by (month number - 1).
monthsList = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
def writeHeader(posttitle):
    """Write the opening of the HTML document to the module-level ``htmlFile``.

    Emits the doctype, the <head> section (charset, the stylesheet link
    built from ``pathToCSS`` and a <title> holding *posttitle*) and the
    opening <body> tag.

    posttitle -- text placed inside the <title> element, written as given.
    """
    emit = htmlFile.write
    emit('<!DOCTYPE html>\n<html>\n<head>\n')
    emit('\t<meta charset="utf-8"/>\n')
    stylesheetTag = '\t<link type="text/css" rel="stylesheet" href="' + pathToCSS + '"/>\n'
    emit(stylesheetTag)
    titleTag = '\t<title>' + posttitle + '</title>\n'
    emit(titleTag)
    emit('</head>\n<body>\n')
def parsePost(postObject):
    """Write the complete HTML archive page for a submission to ``htmlFile``.

    Output order: page header (``writeHeader``), the title block (title or
    link, author, posting time, score, subreddit, permalink), the post
    body, every top-level comment via ``parseComment`` and the footer with
    the time of archiving.

    postObject -- submission object; its title, author, is_self, url,
                  created_utc, ups, downs, subreddit, permalink, selftext
                  and _comments attributes are read.
    """
    write = htmlFile.write
    postTitle = fixUnicode(postObject.title)
    writeHeader(postTitle)
    # PRAW call; presumably loads the comments hidden behind "load more"
    # stubs so that the loop over _comments below sees the whole thread.
    postObject.replace_more_comments()

    try:
        postAuthorName = fixUnicode(postObject.author.name)
        postAuthorExists = 1
    except AttributeError:
        # No usable author object: the post is rendered as "[Deleted]".
        postAuthorName = ''
        postAuthorExists = 0

    write('<div class="title">\n')
    if postObject.is_self:
        # The post is a self post
        write(postTitle)
        write('\n<br/><strong>')
    else:
        # The post is a link post
        write('<a id="postlink" href="' + fixUnicode(postObject.url))
        write('">')
        write(postTitle)
        write('</a>\n<br/><strong>')

    if postAuthorExists:
        write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
        write('">')
        write(postAuthorName)
        write('</a>. </strong><em>')
    else:
        write('Posted by [Deleted]. </strong><em>')

    write('Posted at ')
    postDate = time.gmtime(postObject.created_utc)
    # Zero-pad the minutes so that e.g. 12:05 is not rendered as "12:5".
    write('%d:%02d UTC on ' % (postDate.tm_hour, postDate.tm_min))
    write(monthsList[postDate.tm_mon - 1] + ' ')
    write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    write('. ' + str(postObject.ups - postObject.downs))

    if postObject.is_self:
        write(' Points. </em><em>(self.<a id="selfLink" href="')
    else:
        write(' Points. </em><em>(<a id="selfLink" href="')
    write(postObject.subreddit._url)
    write('">' + postObject.subreddit.display_name)
    if postObject.is_self:
        write('</a>)</em><em>')
    else:
        write('</a> Subreddit)</em><em>')
    write(' (<a id="postpermalink" href="')
    write(fixUnicode(postObject.permalink))
    write('">Permalink</a>)</em>\n')

    if postObject.is_self:
        write('<div class="post">\n')
        write(snudown.markdown(fixMarkdown(postObject.selftext)))
        write('</div>\n')
    else:
        write('<div class="post">\n<p>\n')
        # Same conversion as for the title link above, so a non-ASCII URL
        # is handled identically in both places.
        write(fixUnicode(postObject.url))
        write('</p>\n</div>\n')
    write('</div>\n')

    for comment in postObject._comments:
        parseComment(comment, postAuthorName, postAuthorExists)

    write('<hr id="footerhr">\n')
    write('<div id="footer"><em>Archived on ')
    write(str(datetime.datetime.utcnow()))
    write(' UTC</em></div>')
    write('\n\n</body>\n</html>\n')
#Done
def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
    """Write one comment and, recursively, all of its replies to ``htmlFile``.

    Each comment becomes a <div class="comment"> holding an info line
    (author, score, posting time), the rendered body and the nested reply
    <div>s.

    redditComment    -- comment object; its author, id, ups, downs,
                        created_utc, body and _replies attributes are read.
    postAuthorName   -- name of the submission's author, used to mark the
                        original poster's comments with class
                        "postOP-comment".
    postAuthorExists -- truthy when the submission has an author.
    isRoot           -- True for top-level comments; replies get an inline
                        margin style instead.
    """
    write = htmlFile.write

    try:
        commentAuthorName = fixUnicode(redditComment.author.name)
        commentAuthorExists = 1
    except AttributeError:
        # No usable author object: the comment is rendered as "[Deleted]".
        commentAuthorName = ''
        commentAuthorExists = 0

    write('<div id="' + str(redditComment.id))
    if isRoot:
        write('" class="comment">\n')
    else:
        write('" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')
    write('<div class="commentinfo">\n')

    if commentAuthorExists:
        write('<a href="' + redditComment.author._url)
        if postAuthorExists and postAuthorName == commentAuthorName:
            # Comment written by the submission's author.
            write('" class="postOP-comment">' + commentAuthorName + '</a> <em>')
        else:
            write('">' + commentAuthorName + '</a> <em>')
    else:
        write('<strong>[Deleted]</strong> <em>')

    write(str(redditComment.ups - redditComment.downs))
    write(' Points </em><em>')
    write('Posted at ')
    postDate = time.gmtime(redditComment.created_utc)
    # Zero-pad the minutes so that e.g. 12:05 is not rendered as "12:5".
    write('%d:%02d UTC on ' % (postDate.tm_hour, postDate.tm_min))
    write(monthsList[postDate.tm_mon - 1] + ' ')
    write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
    write('</em></div>\n')

    write(snudown.markdown(fixMarkdown(redditComment.body)))
    for reply in redditComment._replies:
        parseComment(reply, postAuthorName, postAuthorExists, False)
    write('</div>\n')
#Done
def fixMarkdown(markdown):
    """Prepare markdown text for rendering and return it as a native ``str``.

    Every ``&gt;`` entity is turned back into ``>`` so that quote markers
    are seen as markdown syntax by the renderer. The previous pattern
    replaced ``>`` with itself and therefore changed nothing.

    markdown -- text to prepare (unicode text).
    Returns the text as ``str``: UTF-8 encoded bytes on Python 2, text on
    Python 3.
    """
    encoded = markdown.encode('utf8')
    if not isinstance(encoded, str):
        # Python 3: str(bytes) would produce the "b'...'" repr, so decode.
        encoded = encoded.decode('utf8')
    return encoded.replace('&gt;', '>')
def fixUnicode(text):
    """Return *text* as a native ``str`` that can be concatenated and written.

    On Python 2 this is the UTF-8 encoded byte string, as before. On
    Python 3 the text is returned as ``str`` instead of the "b'...'" repr
    that ``str(bytes)`` would produce.

    text -- unicode text to convert.
    """
    encoded = text.encode('utf8')
    if isinstance(encoded, str):
        # Python 2: the encoded bytes already are the native str type.
        return encoded
    return encoded.decode('utf8')
# End Function Definitions
r = praw.Reddit(user_agent='RedditPostArchiver Bot, version 0.93')
# Disclaimer: storing plain-text passwords is bad.
# Uncomment the following line to log in (e.g., in case of
# "Unable to Archive Post"):
# r.login('username', 'password')
try:
    thePost = r.get_submission(submission_id=postID)
    # The writer functions use the module-level name htmlFile. The with
    # block closes the file even when archiving fails part-way through.
    with open(outputFilePath, 'w') as htmlFile:
        parsePost(thePost)
except HTTPError:
    # Refer to the r.login line by name; a hard-coded line number goes
    # stale as soon as the script is edited.
    print('Unable to Archive Post: Invalid PostID or Log In Required (see the r.login line in the script)')
##Done