-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdata_preprocessing.py
88 lines (65 loc) · 3.18 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
import pickle
import json
import os
import seaborn as sns
from nltk.corpus import stopwords
import sys
sys.path.append('../')
import utilities.twitter_custom_functions as tcf
from sentiment_class import TwitterSentiment
# --- Configuration and raw-data loading -------------------------------------
# Absolute path to the project's JSON config.
# NOTE(review): hard-coded Windows path — this script only runs on the
# original author's machine as-is; consider an env var or CLI argument.
json_loc = r'D:\GitHub\Projects\Twitter_Project\Twitter_Topic_Modelling\twitter_config.json'
with open(json_loc) as json_file:
    config = json.load(json_file)
# Project folder location and keys; chdir so the relative file paths below
# resolve inside the project directory.
os.chdir(config["project_directory"])
sns.set_style("darkgrid")
# Input: pickled list of raw tweet dicts; output: name of the CSV written later.
all_tweets_list_file_loc = r'all_tweets_list.txt'
translated_tweets_filename = 'tweets_translated.csv'
# SECURITY NOTE: pickle.load executes arbitrary code if the file is untrusted;
# acceptable here only because the pickle is produced by this same project.
with open(all_tweets_list_file_loc, 'rb') as file:
    allTweetsList = pickle.load(file)
# Create a dataframe based on the relevant data from the full list of the
# received tweets.
# Collect one parallel list per column of the eventual DataFrame.
# (Removed `replyto_ls` and `geo_loc_ls`: they were created but never
# populated or read anywhere in this script.)
user_ls, userid_ls, tweet_ls = [], [], []
location_ls, datetime_ls = [], []
for tweet_dict in allTweetsList:
    user_ls.append(tweet_dict['user']['screen_name'])
    userid_ls.append(tweet_dict['user']['id'])
    # Strip hyperlinks from the raw tweet text before any analysis.
    tweet_ls.append(tcf.remove_url(tweet_dict['text']))
    location_ls.append(tweet_dict['user']['location'])
    datetime_ls.append(tweet_dict['created_at'])
print('removing hyperlink finished..')
# Dataframe that contains the data for analysis.
# Note: The twitter API functionality is very broad in what data we can analyse;
# this project focuses on tweets with their respective location/date.
tweets_df = pd.DataFrame(
    list(zip(user_ls, userid_ls, tweet_ls, location_ls, datetime_ls)),
    columns=['Username', 'UserID', 'Tweet', 'Location', 'Date'])
# Remove tweets that did not have any text. drop=True replaces the previous
# reset_index() + drop(columns=['index']) two-step: it discards the old index
# directly instead of materialising it as a column first.
tweets_df = tweets_df[tweets_df['Tweet'].notnull()].reset_index(drop=True)
# Add Year and Month columns derived from each tweet's creation timestamp.
tweets_df['Year'] = pd.DatetimeIndex(tweets_df['Date']).year
tweets_df['Month'] = pd.DatetimeIndex(tweets_df['Date']).month
# Detect language and translate if necessary (per-tweet project helper).
# Passing the function directly replaces the redundant lambda wrapper.
tweets_df['Tweet_Translated'] = tweets_df['Tweet'].apply(tcf.translate_tweet)
# Remove punctuation and stop words.
# Stop-word list = English + Spanish NLTK corpora plus Twitter-specific tokens
# and the query terms themselves (covid/coronavirus variants), which would
# otherwise dominate every topic.
stopwords_full = list(stopwords.words('english'))
spanish_stopwords = list(stopwords.words('spanish'))
commonTwitterStopwords = ['rt', 'RT', 'retweet', 'new', 'via', 'us', 'u',
                          'covid', 'coronavirus', '2019', 'coronavírus',
                          '#coronavirus', '19', '#covid', '#covid19',
                          '#covid2019', '…', '...', '“', '”', '‘', '’']
stopwords_full.extend(commonTwitterStopwords + spanish_stopwords)
num_list = '0123456789'
# Strip punctuation, digits and stop words from the translated tweets.
tweets_df['Tweets_Clean'] = tweets_df['Tweet_Translated'].apply(
    lambda x: tcf.remove_punct_and_stopwords(x, stopwords_full, num_list))
# BUG FIX: use reset_index(drop=True) — the bare reset_index() here leaked the
# old index as a spurious 'index' column into the saved CSV (the earlier
# Tweet-notnull filter already discards it, so this now matches).
tweets_df = tweets_df[tweets_df['Tweets_Clean'].notnull()].reset_index(drop=True)
# Save the data as tab-separated UTF-8 without the row index.
tweets_df.to_csv(translated_tweets_filename, sep='\t', encoding='utf-8',
                 index=False)
print('File created in: {0}'.format(os.path.join(os.getcwd(), translated_tweets_filename)))