-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtweet_cleanup.py
More file actions
104 lines (91 loc) · 3.56 KB
/
tweet_cleanup.py
File metadata and controls
104 lines (91 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import html
import re
import unicodedata
import argparse
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Command-line interface.
# NOTE: the arguments are parsed at module level, so importing this file
# consumes sys.argv; it is meant to be run as a script.
parser = argparse.ArgumentParser(
    description='Clean a tab-separated file of tweets: replace URLs, '
                'usernames, hashtags and numbers with placeholder tokens '
                'and normalise typographical variation.')
parser.add_argument('--i', dest='infile', default='data/tweets.tsv',
                    help='tab-separated input file (default: %(default)s)')
parser.add_argument('--o', dest='outfile',
                    default='data/tweets_cleaned.tsv',
                    help='where to write the cleaned tweets '
                         '(default: %(default)s)')
# store_true already implies default=False.
parser.add_argument('--url', dest='fetch_website_titles',
                    action='store_true',
                    help='replace each URL with the title of the page it '
                         'points to (requires network access) instead of '
                         'the <URL> placeholder')
parser.add_argument('--h', dest='leave_hashtags', action='store_true',
                    help='keep hashtags as they are instead of replacing '
                         'them with <HASHTAG>')
args = parser.parse_args()
# Pattern for URLs embedded in tweet text.
#   * must start at the beginning of the string or after a non-word character;
#   * host part: either "http(s)://" / "www" (optionally followed by up to
#     three digits) + "." + domain, or a bare "domain.tld/" that is
#     recognised by its trailing slash;
#   * followed by an optional run of path/query characters.
# Raw strings are used so that regex escapes such as \W and \d are not
# treated as (invalid) Python string escapes; the resulting pattern text is
# unchanged.
url_regex = (
    r'((?<=^)|(?<=\W))((https?://|www\d{0,3}\.)'
    r'[a-zA-Z0-9.\-]+\.[a-z]{2,}|[a-zA-Z0-9.\-]+\.[a-z]{2,}/)'
    r"([a-zA-Z0-9/\?%\+#~\.\-@\*!\(\)\[\]=:;,&\$/']*)?"
)
def replace_url(tweet, timeout=10):
    """Replace every URL in *tweet* with the title of the page it points to.

    Each match of the module-level ``url_regex`` is fetched and parsed; the
    URL is replaced by the page's ``<title>`` text (newlines and tabs turned
    into spaces). If the page cannot be fetched or has no usable title, the
    URL is replaced by the empty string.

    The tweet is scanned in a single pass, so text inserted as a title is
    never itself re-interpreted as a URL (which could otherwise trigger
    further fetches or loop indefinitely).

    Args:
        tweet: The tweet text to process.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The tweet with all URLs substituted.

    Side effects:
        Performs network requests, and prints the resulting tweet to stdout
        if at least one title was retrieved.
    """
    retrieved = False

    def _title_for(match):
        """Return the page title for the matched URL, or '' on failure."""
        nonlocal retrieved
        url = match.group(0)
        # url_regex also matches scheme-less URLs ("www.x.org", "x.org/"),
        # which urlopen() rejects outright; give them a scheme.
        if not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        try:
            # The context manager closes the connection once parsing is done.
            with urlopen(url, timeout=timeout) as response:
                soup = BeautifulSoup(response, features="lxml")
        except Exception:
            # Deliberately best-effort: any network, HTTP, decoding or
            # parser failure just means "no title available".
            return ''
        title_tag = soup.title
        title = title_tag.string if title_tag is not None else None
        if not title:
            return ''
        retrieved = True
        # Keep the tweet on one line and safe for the tab-separated output.
        return title.replace('\n', ' ').replace('\t', ' ')

    tweet = re.sub(url_regex, _title_for, tweet)
    if retrieved:
        print(tweet)
    return tweet
# ---------------------------------------------------------------------------
# Main cleaning pass.
#
# Reads args.infile line by line. A line that splits into three tab-separated
# fields whose first two fields parse as integers starts a new tweet record;
# any other line is treated as a continuation of the previous tweet's text
# and is appended to it with a space. Each cleaned record is written to
# args.outfile, and every character seen in the cleaned text is collected and
# dumped to data/tweets_chars.tsv afterwards.
# ---------------------------------------------------------------------------

# Patterns are compiled once instead of being looked up for every line.
# Zero-width space / non-joiner / joiner and the soft hyphen.
_ZERO_WIDTH_RE = re.compile('[\u200b\u200c\u200d\xad]')
# Double-quote variants. The last two alternatives are pairs of
# "space + combining accent" (e.g. what NFKC produces for a spacing acute
# accent).
_DOUBLE_QUOTE_RE = re.compile('[«»“”„]|(´´)|(``)|( ́ ́)|( ̀ ̀)')
# Single-quote variants (same remark for the last two alternatives).
_SINGLE_QUOTE_RE = re.compile("[\x92‘’´`]|('')|( ́)|( ̀)")
# @username at the start of the text or after a non-word character.
_USERNAME_RE = re.compile(r'((?<=^)|(?<=\W))@[a-zA-Z0-9_]+')
_HASHTAG_RE = re.compile(r'#\w+')
# Stand-alone runs of digits delimited by whitespace or the string edges.
_NUMBER_RE = re.compile(r'(?:(?<=\s)|(?<=^))[0-9]+(?:(?=\s)|(?=$))')

chars = set()
with open(args.infile, 'r', encoding='utf8') as f_in, \
        open(args.outfile, 'w', encoding='utf8') as f_out:
    prev_tweet = ''
    for line in f_in:
        line = line.strip()
        fields = line.split('\t', maxsplit=2)

        # A new record has exactly three fields, the first two being integers.
        new_tweet = False
        if len(fields) == 3:
            try:
                int(fields[0])
                int(fields[1])
                new_tweet = True
            except ValueError:
                pass
        tweet = fields[2] if new_tweet else line

        # URLs: of the form abc.de; start with http(s):// or www or contain a /
        if args.fetch_website_titles:
            tweet = replace_url(tweet)
        else:
            tweet = re.sub(url_regex, '<URL>', tweet)

        # Replace non-breaking spaces etc.
        tweet = unicodedata.normalize("NFKC", tweet)
        # Replace, e.g., &gt; with >
        # NOTE(review): characters introduced by unescaping (e.g. &nbsp;)
        # are not NFKC-normalised because this runs after normalize();
        # confirm whether that ordering is intended.
        tweet = html.unescape(tweet)
        # Zero-width characters
        tweet = _ZERO_WIDTH_RE.sub('', tweet)
        # Typographical variation
        tweet = _DOUBLE_QUOTE_RE.sub('"', tweet)
        tweet = _SINGLE_QUOTE_RE.sub("'", tweet)
        tweet = tweet.replace("''", '"')
        # Twitter usernames
        tweet = _USERNAME_RE.sub('<USERNAME>', tweet)
        # Hashtags, numbers
        if not args.leave_hashtags:
            tweet = _HASHTAG_RE.sub('<HASHTAG>', tweet)
        tweet = _NUMBER_RE.sub('<NUMBER>', tweet)

        if new_tweet:
            # Flush the record that has just been completed. Nothing has
            # been accumulated before the first record, so skip the write
            # then -- otherwise the output would start with an empty line.
            if prev_tweet:
                f_out.write(prev_tweet + '\n')
            prev_tweet = fields[0] + '\t' + fields[1] + '\t' + tweet
        else:
            # Continuation line: glue it onto the current record.
            prev_tweet += ' ' + tweet
        chars.update(tweet)

    # Flush the last accumulated record.
    if prev_tweet:
        f_out.write(prev_tweet + '\n')

# Inventory of every distinct character left in the cleaned tweets.
chars = sorted(chars)
with open('data/tweets_chars.tsv', 'w', encoding='utf8') as f:
    f.write('{} characters\n{}\n'.format(len(chars), chars))