diff --git a/twitter.py b/twitter.py index d2aade9..31a9a27 100644 --- a/twitter.py +++ b/twitter.py @@ -1,6 +1,12 @@ #import numpy as np +import re +import nltk import pandas as pd from textblob import TextBlob +from nltk import word_tokenize,sent_tokenize +from typing import Counter, List +from nltk.corpus import stopwords +from nltk.util import filestring, pr """ def processTweet(self,tweet): @@ -20,6 +26,39 @@ def processTweet(self,tweet): dataset=pd.read_csv('twitter.csv' , engine='python') +#if @dataset is only of text and with no additional data + +""" +and if data set consists of more columns +tweet_id,sentiment,content +1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[ +1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call... +1956967696,sadness,Funeral ceremony...gloomy friday... +1956967789,enthusiasm,wants to hang out with friends SOON! +1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will." +1956968477,worry,Re-pinging @ghostridah14: why didn't you go to prom? BC my bf didn't like my friends +1956968487,sadness,"I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, & he wants me 2! scandalous!" +1956968636,worry,Hmmm. http://www.djhero.com/ is down +1956969035,sadness,@charviray Charlene my love. I miss you +1956969172,sadness,@kelcouch I'm sorry at least it's Friday? +""" +#then parse it by dataset>content + +csv_text=[] +for i in dataset:#dataset['content'] + csv_text.append(i) + +csv_words=[] +countsm=1 +for i in csv_text: + i = re.sub('[^a-zA-Z]',' ', i) + i=i.replace(' ','') + i=i.split() + for f in i: + if not f in set(stopwords.words('english')): + csv_words.append(f) + +print(csv_words) x=dataset.iloc[:,5] #df=pd.DataFrame(columns=[6]) @@ -58,4 +97,4 @@ def processTweet(self,tweet): print(neg*100/y) print("percentage of neutral tweets") -print(neu*100/y) \ No newline at end of file +print(neu*100/y)