"""
Simple JSON scraper for Kemono.party discord content.
@author: Jeff Chen
@last modified: 8/25/2022
"""
import logging
import time
from threading import Lock, Semaphore

import cfscrape
import requests.adapters
from cfscrape import CloudflareScraper

from Threadpool import ThreadPool

DISCORD_LOOKUP_API = "https://www.kemono.su/api/v1/discord/channel/lookup/"
DISCORD_CHANNEL_CONTENT_PRE_API = "https://www.kemono.su/api/v1/discord/channel/"
DISCORD_CHANNEL_CONTENT_SUF_API = "?o="
DISCORD_CHANNEL_CONTENT_SKIP_INCRE = 150
HEADERS = {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'}
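
# A composed lookup URL looks like (the ID below is a made-up placeholder):
#   https://www.kemono.su/api/v1/discord/channel/lookup/000000000000000000
# and channel content is paged in increments of 150 via the "?o=" offset:
#   https://www.kemono.su/api/v1/discord/channel/000000000000000000?o=150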

class DiscordToJson():
    """
    Utility functions used for scraping Kemono Party's Discord to JSON data.
    Offers functions for scraping Discord subchannel IDs and for scraping the channels themselves.
    """
    __recent: dict | None = None
def discord_lookup(self, discordID:str, scraper:CloudflareScraper) -> dict:
"""
Looks up a discord id using Kemono.party's API and returns
the result in JSON format
Param:
discordID: ID of discord channel to grab channel IDs from
scraper: Scraper to use while scraping kemono
Return: channelIDs in JSON format
"""
# Link URL
url = DISCORD_LOOKUP_API + discordID
# Grab data
data = None
while not data:
try:
data = scraper.get(url, timeout=5, headers=HEADERS)
except(requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
logging.debug("Connection error, retrying")
time.sleep(1)
# Convert data
js = data.json()
logging.debug("Received " + str(js) + " from " + url)
# Return json
return js
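
    # Usage sketch (the server ID and the result's field names below are
    # illustrative assumptions, not taken from this repository):
    #   scraper = cfscrape.create_scraper()
    #   channels = DiscordToJson().discord_lookup("000000000000000000", scraper)
    #   for channel in channels:
    #       print(channel.get("id"))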
def discord_lookup_all(self, channelID:str|None, threads:int=6, sessions:list=None)->dict|list:
"""
Similar to discord_channel_lookup() but processes everything, not just in segments.
NOTE: will take a significant amount of time if discord channel is of considerable size
Param:
threads: Number of threads to use while looking up js
sessions: list of sessions used when scraping, size must be >= threads
"""
# Grab data
js_buff = []
# Generate threads and threading vars
pool = ThreadPool(threads)
pool.start_threads()
js_buff_lock = Lock()
main_sem = Semaphore(0)
        # Generate sessions for each thread
        if sessions:
            assert(len(sessions) >= threads)
        else:
            sessions = [cfscrape.create_scraper(requests.Session()) for _ in range(threads)]
        # Build a distinct adapter object per session so connection pools are
        # not shared between threads
        adapters = [requests.adapters.HTTPAdapter(pool_connections=1, pool_maxsize=1, max_retries=0, pool_block=True) for _ in range(threads)]
        for session, adapter in zip(sessions, adapters):
            # Mount on both schemes; the API endpoints are https
            session.mount('http://', adapter)
            session.mount('https://', adapter)
        # Seed the pool with one job per thread; each job re-enqueues the job
        # for its next segment until an empty page is found
        for i in range(threads):
            pool.enqueue((self.__discord_lookup_thread_job, (threads, DISCORD_CHANNEL_CONTENT_SKIP_INCRE, i * DISCORD_CHANNEL_CONTENT_SKIP_INCRE, channelID, sessions[i], main_sem, js_buff, js_buff_lock, pool)))
# Sleep until done
main_sem.acquire()
# Kill threads
pool.join_queue()
pool.kill_threads()
        # Close all sessions
        for session in sessions:
            session.close()
# Return json
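        # js_buff[i] holds the page fetched at offset i * DISCORD_CHANNEL_CONTENT_SKIP_INCRE,
        # so the returned list preserves the API's paging order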
return js_buff
def __discord_lookup_thread_job(self, tcount:int, skip:int, curr:int, channelID:str, scraper:CloudflareScraper, main_sem:Semaphore, js_buff:list, js_buff_lock:Lock, pool:ThreadPool) -> None:
"""
Thread job for worker threads in discord_lookup_all. Processes a segment of
data then sends its next segment into thread queue
Param:
tcount: number of threads used within threadpool.
main_sem: Semaphore used to wake up main thread
skip: skip amount to access next page of content, will be the same for all threads
curr: current skip number
channelID: Discord channel id
scraper: scraper to be used to scrape js
js_buff: list used to store stuff
js_buff_lock: lock for js_buff
pool: Threadpool used for this function
Pre: main_sem begins on zero
Pre: tcount number of tasks were/is going to be submitted into threadpool
NOTE: that cond isn't used because there is a situation where broadcast may be
called before calling thread goes to sleep
"""
data = None
# Process current task
url = DISCORD_CHANNEL_CONTENT_PRE_API + channelID + DISCORD_CHANNEL_CONTENT_SUF_API + str(curr)
logging.info(f"scanning {url}")
        while not data:
            try:
                data = scraper.get(url, timeout=5, headers=HEADERS)
            except(requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
                logging.info("Connection error, retrying -> url: {s}".format(s=url))
                time.sleep(1)
        if not data:
            logging.critical("Invalid data scraped -> url: {s}".format(s=url))
# Convert data
js = data.json()
# Add data to js_buff
if len(js) > 0:
js_buff_lock.acquire()
# If js_buff is too small, extend it
            insert_pos = curr // skip
            space_diff = self.__calculate_additional_list_slots(js_buff, insert_pos)
            if space_diff > 0:
                js_buff += [None] * space_diff
            # Add into js buff
            js_buff[insert_pos] = js
logging.debug("Received " + str(js) + " from " + url)
js_buff_lock.release()
# Create and add task back into threadpool
pool.enqueue((self.__discord_lookup_thread_job, (tcount, DISCORD_CHANNEL_CONTENT_SKIP_INCRE, curr + tcount * DISCORD_CHANNEL_CONTENT_SKIP_INCRE, channelID, scraper, main_sem, js_buff, js_buff_lock, pool)))
# If is done, broadcast to main thread
else:
main_sem.release()
def __calculate_additional_list_slots(self, l:list, p:int)->int:
"""
Given the list l and position to insert element p, returns how many more list slots are
needed in l to meet p
Args:
l (list): list
p (int): position to insert element
Returns:
            int: how many more list slots are needed in l to reach p; if <= 0, no additional slots are needed
"""
return p - (len(l) - 1)
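
    # For example, with len(l) == 2 (valid indices 0..1) and p == 4, this
    # returns 4 - (2 - 1) = 3: three more slots must be appended before
    # index 4 is writable.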
def discord_channel_lookup(self, channelID:str|None, scraper:CloudflareScraper)->dict|list:
"""
Looks up a channel's content and returns it. Content is returned in
chunks and not all content is returned; however, subsequent calls will
return results that will always be different.
Param:
channelID:
channelID of channel to scrape.
If is None, scrape starting at the endpoint of the previous scrape
If is not None, scrape starting the end of the channel
scarper:
Scraper: scaraper to use while scraping kemono
Return: JSON object containing data from the file
"""
# If None sent but no history, quit
if not channelID:
assert(self.__recent)
# If no history, create initial history
if not self.__recent:
self.__recent = {"channelID" : channelID, "skip" : 0} # it doesn't exist yet, so initialize it
# If history exists and matches, use old data
if(not channelID or channelID == self.__recent.get("channelID")):
skip = self.__recent.get("skip")
self.__recent = {"channelID" : self.__recent.get("channelID"), "skip" : skip + DISCORD_CHANNEL_CONTENT_SKIP_INCRE}
channelID = self.__recent.get("channelID")
# If history exists but does not match, start from beginning
else:
skip = 0
self.__recent = {"channelID" : channelID, "skip" : skip + DISCORD_CHANNEL_CONTENT_SKIP_INCRE}
# Grab data
data = None
url = DISCORD_CHANNEL_CONTENT_PRE_API + channelID + DISCORD_CHANNEL_CONTENT_SUF_API + str(skip)
        while not data:
            try:
                data = scraper.get(url, timeout=5, headers=HEADERS)
            except(requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout, requests.exceptions.ReadTimeout):
                logging.debug("Connection error, retrying")
                time.sleep(1)
# Convert data
js = data.json()
logging.debug("Received " + str(js) + " from " + url)
# Return json
return js
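

# Minimal demonstration of the scraping flow. The server ID below is a
# placeholder, and the "id" field on each returned channel object is an
# assumption about the API's response shape:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scraper = cfscrape.create_scraper()
    api = DiscordToJson()
    channels = api.discord_lookup("000000000000000000", scraper)
    for channel in channels:
        # First chunk of this subchannel's messages
        chunk = api.discord_channel_lookup(channel.get("id"), scraper)
        print(f"channel {channel.get('id')}: {len(chunk)} messages in first chunk")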