-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtimeline-snarf.sh
executable file
·91 lines (77 loc) · 7.04 KB
/
timeline-snarf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/bin/bash
# Grab all raw tweets and replies (plus the last 10 days of RTs) from a user's timeline.
# Multiple auth_tokens can be used - realistically only 1-3 are needed, depending on account size.
# x-rate-limit counters are shown (1200 requests every 15 mins per account).
#
# usage:   ./timeline-snarf screen_name since until|[now] [from date - to date, in YYYY-MM-DD format]
# example: ./timeline-snarf nasa 2025-01-01 now
#          ./timeline-snarf nasa 2021-06-01 2022-02-01
# Account credentials: each entry is sent as the auth_token cookie of a request.
# Add or remove entries to change how many accounts the requests rotate over.
auth_tokens=()
auth_tokens+=("abcdef0123456789abcdef0123456789abcdef01")
auth_tokens+=("01abcdef0123456789abcdef0123456789abcdef")
#def…456
#ace…789

# Random 32-character lowercase-hex value; it is sent both as the X-Csrf-Token
# header and as the ct0 cookie.
x_csrf_token="$(tr -dc '0-9a-f' </dev/urandom | head -c 32)"
####
# Print the command-line synopsis and terminate the script.
# Outputs: the synopsis on stderr (it is a diagnostic, so stdout stays clean).
# Returns: never returns; exits with status 1.
usage() {
  printf '%s screen_name since until|[now] [from date - to date, in YYYY-MM-DD format]\n' "$0" >&2
  exit 1
}
# --- Argument handling -------------------------------------------------------
# Exactly three positional arguments are required: screen_name since until|now
[[ $# -eq 3 ]] || usage
user="$1"
since="$2"
until="$3"
# "now" is replaced with tomorrow's date (UTC).
# NOTE(review): presumably because the search "until:" bound is exclusive - confirm.
if [[ "$until" == "now" ]]; then
  until=$(date -u -d '+1 day' +%Y-%m-%d)
fi

# --- Tunables ----------------------------------------------------------------
product="Latest" # Latest | Top
interval=0       # sleep n seconds between requests
dest="${user}-${since}_${until}" # dump json output in this directory

# search query format
# include:nativeretweets or filter:nativeretweets for just RT (only can retrieve the last 10 days of RTs?)
query="include:nativeretweets from:${user} since:${since} until:${until}"
start=$EPOCHSECONDS # to calculate scrape time (EPOCHSECONDS needs bash 5.0+)

# --- Request building blocks -------------------------------------------------
bearer_token='AAAAAAAAAAAAAAAAAAAAAFQODgEAAAAAVHTp76lzh3rFzcHbmHVvQxYYpTw%3DckAlMINMjmCwxUcaXbAN4XqJVdgMJaHqNOFgPMK0zN1qLqLQCF'
# Initial header set using the first auth_token; the request loop reassigns
# header (and variables) on every iteration before they are used.
header=(
  -H "Authorization: Bearer ${bearer_token}"
  -H "User-Agent: TwitterAndroid/10.21.1"
  -H "X-Csrf-Token: ${x_csrf_token}"
  -H "Cookie: ct0=${x_csrf_token}; auth_token=${auth_tokens[0]}"
)
#url='https://api.twitter.com/graphql/gkjsKepM6gl_HmFWoWKfgg/SearchTimeline'
url='https://x.com/i/api/graphql/uGjEfWQSYF3MLxu5TVEiRA/SearchTimeline'
variables='{"rawQuery":"'"${query}"'","count":20,"querySource":"typed_query","product":"'"${product}"'"}'
# Feature flags sent verbatim as the "features" query parameter. This must stay
# on ONE line: a line break inside the literal would put a raw newline inside a
# JSON key and make the payload invalid JSON.
features='{"android_graphql_skip_api_media_color_palette":false,"blue_business_profile_image_shape_enabled":false,"creator_subscriptions_subscription_count_enabled":false,"creator_subscriptions_tweet_preview_api_enabled":true,"freedom_of_speech_not_reach_fetch_enabled":false,"graphql_is_translatable_rweb_tweet_is_translatable_enabled":false,"hidden_profile_likes_enabled":false,"highlights_tweets_tab_ui_enabled":false,"interactive_text_enabled":false,"longform_notetweets_consumption_enabled":true,"longform_notetweets_inline_media_enabled":false,"longform_notetweets_richtext_consumption_enabled":true,"longform_notetweets_rich_text_read_enabled":false,"responsive_web_edit_tweet_api_enabled":false,"responsive_web_enhance_cards_enabled":false,"responsive_web_graphql_exclude_directive_enabled":true,"responsive_web_graphql_skip_user_profile_image_extensions_enabled":false,"responsive_web_graphql_timeline_navigation_enabled":false,"responsive_web_media_download_video_enabled":false,"responsive_web_text_conversations_enabled":false,"responsive_web_twitter_article_tweet_consumption_enabled":false,"responsive_web_twitter_blue_verified_badge_is_enabled":true,"rweb_lists_timeline_redesign_enabled":true,"spaces_2022_h2_clipping":true,"spaces_2022_h2_spaces_communities":true,"standardized_nudges_misinfo":false,"subscriptions_verification_info_enabled":true,"subscriptions_verification_info_reason_enabled":true,"subscriptions_verification_info_verified_since_enabled":true,"super_follow_badge_privacy_enabled":false,"super_follow_exclusive_tweet_notifications_enabled":false,"super_follow_tweet_api_enabled":false,"super_follow_user_api_enabled":false,"tweet_awards_web_tipping_enabled":false,"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled":false,"tweetypie_unmention_optimization_enabled":false,"unified_cards_ad_metadata_container_dynamic_card_content_query_enabled":false,"verified_phone_label_enabled":false,"vibe_api_enabled":false,"view_counts_everywhere_api_enabled":true,"responsive_web_grok_analyze_button_fetch_trends_enabled":true,"creator_subscriptions_quote_tweet_preview_enabled":false,"profile_label_improvements_pcf_label_in_post_enabled":false,"rweb_tipjar_consumption_enabled":true,"rweb_video_timestamps_enabled":true,"c9s_tweet_anatomy_moderator_badge_enabled":true,"communities_web_enable_tweet_community_results_fetch":true,"premium_content_api_read_enabled":false,"articles_preview_enabled":true,"responsive_web_grok_analyze_post_followups_enabled":false}'

# --- Output directory and loop state -----------------------------------------
# Quoted and guarded with "--" so a name containing whitespace or starting with
# "-" is still treated as a single directory operand.
mkdir -p -- "$dest" || {
  echo "cannot create output directory: ${dest}" >&2
  exit 1
}
count=0 # number of pages saved so far; page N is stored as $dest/N.json
token=0 # index into auth_tokens of the account used for the next request
tokens_max=$((${#auth_tokens[@]} - 1)) # highest valid index into auth_tokens
#################
# Main scrape loop. Each iteration requests one page of the search timeline,
# stores the response body as $dest/<page>.json, and takes the pagination cursor
# for the next request from the page saved by the previous iteration.
# The loop ends when a page contains no entries (summary printed, exit 0), when
# the extracted cursor is "null" (message printed, loop left), or when a response
# cannot be parsed (error on stderr, exit 1).
while :; do
  next=$((count + 1))
  auth_token="${auth_tokens[$token]}"
  header=(
    -H "Authorization: Bearer ${bearer_token}"
    -H "User-Agent: TwitterAndroid/10.21.1"
    -H "X-Csrf-Token: ${x_csrf_token}"
    -H "Cookie: ct0=${x_csrf_token}; auth_token=${auth_token}"
  )

  # Cursor from the previously saved page. jq errors are silenced on purpose:
  # on the first iteration $dest/0.json does not exist and cursor stays empty.
  cursor=$(jq -r '.data.search_by_raw_query.search_timeline.timeline.instructions[-1] | if(.entries[-1].content.value) then .entries[-1].content.value else .entry.content.value end' "$dest/$count.json" 2>/dev/null)
  after="\"cursor\":\"${cursor}\","

  # XXX getting cursors on empty results with search api
  if [[ $cursor == "null" ]]; then
    echo "Limit reached or interrupted" # shrug
    break
  fi

  variables='{"rawQuery":"'"${query}"'","count":20,'"${after}"'"querySource":"typed_query","product":"'"${product}"'"}'
  if [[ -z "${cursor}" ]]; then cursor="First page"; fi
  # [${cursor}] for verbosity
  echo -e "page \x1b[40m $next \x1b[0m | token: \e[$((40+token));5;1m ${token} \e[0m …${auth_token: -4}"

  # grab headers (-i) to get x-rate-limits
  fetch=$(curl -si -G "${header[@]}" "${url}" --data-urlencode "variables=${variables}" --data-urlencode "features=${features}")
  # The last line of the response (headers + body) is kept as the page content.
  tail -1 <<< "${fetch}" > "$dest/$next.json"

  # Number of entries in the first timeline instruction of this page. A jq
  # failure or empty output means the response was not usable JSON (failed
  # request, non-JSON error body, ...); report it instead of treating it as the
  # normal end of the timeline.
  if ! entry_count=$(jq -r '.data.search_by_raw_query.search_timeline.timeline.instructions[0] | if(.entries) then([.entries[]]|length) else 0 end' "$dest/$next.json" 2>/dev/null) || [[ -z "$entry_count" ]]; then
    echo "Page ${next}: response could not be parsed (kept in ${dest}/${next}.json)" >&2
    exit 1
  fi

  if [[ "$entry_count" -eq 0 ]]; then
    end=$EPOCHSECONDS
    echo "✨ All done - completed in $((end - start)) seconds"
    # Count the "tweet-" entries of every saved page and add them up.
    tweet_count=$(jq '.data.search_by_raw_query.search_timeline.timeline.instructions[0] | [if(.entries) then .entries[] else empty end | if(select(.entryId | startswith("tweet-"))) then [.entryId] else 0 end] | length' "$dest"/*.json | awk '{ sum += $1 } END { printf "%d", sum }')
    echo "Downloaded ${tweet_count} tweets from @${user} between ${since} - ${until} to ${dest}/"
    exit 0
  fi

  # print date range of first-last tweet of query
  date_range=$(jq -r '[.data.search_by_raw_query.search_timeline.timeline.instructions[0].entries[] | select(.entryId | startswith("tweet-")) | .content.itemContent.tweet_results.result.legacy.created_at][0,-1] | sub(" \\+0000";"")' "$dest/$next.json" 2>/dev/null | sed -z 's/\n/ <----> /')
  # The x-rate-limit-* header values, sorted numerically, are read as
  # remaining / limit / reset (reset is passed to date as an epoch timestamp).
  rate_limits=$(sed -En -e 's/^x-rate-limit-.*: (.*)\r/\1/p' <<< "${fetch}" | sort -n | xargs | while read -r remaining limit reset; do echo -e "\x1b[32m$remaining/$limit\x1b[0m reset: \x1b[94m$(date -d "@$reset" '+%a %T')\x1b[0m"; done)
  echo -e "\x1b[1;96m$date_range\x1b[0m | x-rate-limit: ${rate_limits}"

  # Rotate to the next auth_token, wrapping around after the last one.
  if [[ $token -lt $tokens_max ]]; then
    token=$((token + 1))
  else
    token=0
  fi
  count=$((count + 1))
  sleep "$interval"
done