-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
111 lines (95 loc) · 3.89 KB
/
Copy pathmain.py
File metadata and controls
111 lines (95 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import csv
from time import sleep
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import regexp
from textblob import TextBlob
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english", ignore_stopwords=True)
lemmatizer = WordNetLemmatizer()
word_tokenizer = regexp.WhitespaceTokenizer()
def get_tweet_data(card):
username = card.find_element_by_xpath('.//span').text
handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text
try:
postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')
except NoSuchElementException:
return
comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text
responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text
text = comment + responding
reply_cnt = card.find_element_by_xpath('.//div[@data-testid="reply"]').text
retweet_cnt = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text
like_cnt = card.find_element_by_xpath('.//div[@data-testid="like"]').text
tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)
return tweet
driver = Firefox()
driver.get('https://www.twitter.com/login')
driver.maximize_window()
sleep(2)
username = driver.find_element_by_xpath('//input[@name="session[username_or_email]"]')
username.send_keys('INSERT-USERNAME/EMAIL-HERE')
my_password = 'INSERT-PASSWORD-HERE'
password = driver.find_element_by_xpath('//input[@name="session[password]"]')
password.send_keys(my_password)
password.send_keys(Keys.RETURN)
sleep(3)
search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
#Change $btc to any word/hashtag you want to search
search_input.send_keys("$btc")
search_input.send_keys(Keys.RETURN)
sleep(3)
#Uncomment this line if you want to get tweets from Latest page, leave if you want the top tweets
#driver.find_element_by_link_text('Latest').click()
data = []
tweet_ids = set()
last_position = driver.execute_script("return window.pageYOffset;")
scrolling = True
count=0
while True:
count += 1
sleep(3)
page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]')
for card in page_cards:
tweet = get_tweet_data(card)
if tweet:
tweet_id = ''.join(tweet)
if tweet_id not in tweet_ids:
tweet_ids.add(tweet_id)
data.append(tweet)
scroll_attempt = 0
while True:
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
sleep(1)
curr_position = driver.execute_script('return window.pageYOffset;')
if last_position == curr_position:
scroll_attempt += 1
if scroll_attempt >= 3:
scrolling = False
break
else:
sleep(2)
else:
last_position = curr_position
break
if count == 4:
break
with open("btc_tweets.csv",'w',newline='', encoding='utf-8') as f:
header = ['UserName', 'Handle', 'Timestamp', 'Comments', 'Likes', 'Retweets', 'Text', 'Polarity', 'Subjectivity']
writer = csv.writer(f)
writer.writerow(header)
writer.writerows(data)
df = pd.read_csv('btc_tweets.csv')
for index, row in df.iterrows():
text = row['Comments']
tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
words = [lemmatizer.lemmatize(w) for w in tokenized_text if w not in stop_words]
stem_text = " ".join([stemmer.stem(i) for i in words])
analysis = TextBlob(text)
df.loc[index, 'Polarity'] = analysis.sentiment.polarity
df.loc[index, 'Subjectivity'] = analysis.sentiment.subjectivity
print("Overall sentiment of the analyzed tweets: " + df['Polarity'].mean())