@@ -201,14 +201,20 @@ async def scrape_profiles(urls: List[str]) -> List[Dict]:
 
 
 def parse_search(response: ScrapeApiResponse) -> List[Dict]:
-    """parse search data from the API response"""
-    try:
-        data = json.loads(response.scrape_result["content"])
-        search_data = data["data"]
-    except Exception as e:
-        log.error(f"Failed to parse JSON from search API response: {e}")
-        return None
-
+    """parse search data from XHR calls"""
+    # extract the XHR calls and keep the ones carrying search results
+    _xhr_calls = response.scrape_result["browser_data"]["xhr_call"]
+    search_calls = [c for c in _xhr_calls if "/api/search/general/full/" in c["url"]]
+    search_data = []
+    for search_call in search_calls:
+        try:
+            data = json.loads(search_call["response"]["body"])["data"]
+        except Exception as e:
+            log.error(f"Failed to parse search data from XHR call: {e}")
+            continue
+        search_data.extend(data)
+
+    # parse all the data using jmespath
     parsed_search = []
     for item in search_data:
         if item["type"] == 1:  # keep only type-1 entries (single video items)
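
For reference, each entry in browser_data["xhr_call"] is a dict describing one background request captured during rendering. Below is a minimal sketch of the shape the new parse_search relies on, inferred only from the fields the code above touches; the payload is illustrative, not real data:

# a hypothetical captured XHR entry, shaped the way parse_search assumes
import json

xhr_call = {
    "url": "https://www.tiktok.com/api/search/general/full/?keyword=python&offset=0",
    "response": {"body": '{"data": [{"type": 1, "item": {"id": "123"}}], "has_more": 1}'},
}

# the same filter-and-extract flow, isolated
if "/api/search/general/full/" in xhr_call["url"]:
    items = json.loads(xhr_call["response"]["body"])["data"]  # [{"type": 1, "item": {...}}]
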
@@ -226,72 +232,31 @@ def parse_search(response: ScrapeApiResponse) -> List[Dict]:
             )
             result["type"] = item["type"]
             parsed_search.append(result)
-
-    # whether there are more search results (0 or 1); the API exposes no total count
-    has_more = data["has_more"]
     return parsed_search
 
 
-async def obtain_session(url: str) -> str:
-    """create a session to save the cookies and authorize the search API"""
-    session_id = str(uuid.uuid4().hex)
-    await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG, render_js=True, session=session_id))
-    return session_id
-
-
-async def scrape_search(keyword: str, max_search: int, search_count: int = 12) -> List[Dict]:
-    """scrape tiktok search data from the search API"""
-
-    def generate_search_id():
-        # get the current datetime and format it as YYYYMMDDHHMMSS
-        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-        # calculate the length of the random hex required for the total length (32)
-        random_hex_length = (32 - len(timestamp)) // 2  # calculate bytes needed
-        random_hex = secrets.token_hex(random_hex_length).upper()
-        random_id = timestamp + random_hex
-        return random_id
-
-    def form_api_url(cursor: int):
-        """form the search API URL and its pagination values"""
-        base_url = "https://www.tiktok.com/api/search/general/full/?"
-        params = {
-            "keyword": quote(keyword),
-            "offset": cursor,  # the index to start from
-            "search_id": generate_search_id(),
-        }
-        return base_url + urlencode(params)
-
-    log.info("obtaining a session for the search API")
-    session_id = await obtain_session(url="https://www.tiktok.com/search?q=" + quote(keyword))
-
-    log.info("scraping the first search batch")
-    first_page = await SCRAPFLY.async_scrape(
+async def scrape_search(keyword: str) -> List[Dict]:
+    """scrape tiktok search data by scrolling the search page"""
+    # JS for scrolling down, at most 15 scrolls; it stops early once the page bottom is reached
+    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 10000)); setTimeout(() => scrollToEnd(), 5000);"""
+    url = f"https://www.tiktok.com/search?q={quote(keyword)}"
+    log.info(f"scraping search page with the URL {url} for search data")
+    response = await SCRAPFLY.async_scrape(
         ScrapeConfig(
-            form_api_url(cursor=0),
-            **BASE_CONFIG,
-            headers={
-                "content-type": "application/json",
-            },
-            session=session_id,
+            url,
+            asp=True,
+            country="AU",
+            wait_for_selector="//div[@data-e2e='search_top-item']",
+            render_js=True,
+            auto_scroll=True,
+            rendering_wait=10000,
+            js=js,
+            debug=True,
         )
     )
-    search_data = parse_search(first_page)
-
-    # scrape the remaining search pages concurrently
-    log.info(f"scraping search pagination, remaining {max_search // search_count} more pages")
-    _other_pages = [
-        ScrapeConfig(
-            form_api_url(cursor=cursor), **BASE_CONFIG, headers={"content-type": "application/json"}, session=session_id
-        )
-        for cursor in range(search_count, max_search + search_count, search_count)
-    ]
-    async for response in SCRAPFLY.concurrent_scrape(_other_pages):
-        data = parse_search(response)
-        if data is not None:
-            search_data.extend(data)
-
-    log.success(f"scraped {len(search_data)} results from the search API for the keyword {keyword}")
-    return search_data
+    data = parse_search(response)
+    log.success(f"scraped {len(data)} search results for keyword: {keyword}")
+    return data
 
 
 def parse_channel(response: ScrapeApiResponse):
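
The same minified scroll script is now added in both scrape_search and scrape_channel. Purely as a readability aid (this is not part of the commit itself), here is the identical logic laid out across lines, kept as a Python string the way the module embeds it: wait 5 seconds, then scroll to the bottom up to 15 times with 10 seconds between scrolls, stopping early once the viewport reaches the end of the page.

# the same scroll script, reformatted for readability (behavior unchanged)
js = """
const scrollToEnd = (i = 0) =>
  (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15)
    ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."),
       setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000))
    : (window.scrollTo(0, document.body.scrollHeight),
       setTimeout(() => scrollToEnd(i + 1), 10000));
setTimeout(() => scrollToEnd(), 5000);
"""
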
@@ -326,7 +291,7 @@ def parse_channel(response: ScrapeApiResponse):
 async def scrape_channel(url: str) -> List[Dict]:
     """scrape video data from a channel (profile with videos)"""
     # JS for scrolling down, at most 15 scrolls; it stops early once the page bottom is reached
-    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 5000)); scrollToEnd();"""
+    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 10000)); setTimeout(() => scrollToEnd(), 5000);"""
     log.info(f"scraping channel page with the URL {url} for post data")
     response = await SCRAPFLY.async_scrape(
         ScrapeConfig(
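
A minimal usage sketch for the rewritten search scraper. The asyncio entry point and output filename are illustrative assumptions; scrape_search and the module-level SCRAPFLY client come from the file itself:

# hypothetical entry point: run the search scraper and persist the results
import asyncio
import json

async def main():
    # new signature takes only the keyword; results come from the scrolled page's XHR calls
    search_data = await scrape_search(keyword="python")
    with open("search.json", "w", encoding="utf-8") as f:
        json.dump(search_data, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    asyncio.run(main())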