You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
How to Prepare Movie Review Data for Sentiment Analysis
import string
import re
from os import listdir
from collections import Counter
from nltk.corpus import stopwords


def load_doc(filename):
    """Read a text file and return its entire contents as one string."""
    # context manager closes the file even if read() raises,
    # unlike the manual open/close pair it replaces
    with open(filename, 'r') as file:
        return file.read()


def clean_doc(doc):
    """Turn a raw review document into a list of cleaned tokens.

    Splits on whitespace, strips punctuation, then drops tokens that
    are non-alphabetic, English stop words, or a single character.
    """
    # split into tokens by whitespace
    tokens = doc.split()
    # prepare regex for punctuation filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove the punctuation from each token
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short (single-character) tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


def add_doc_to_vocab(filename, vocab):
    """Load one document, clean it, and add its token counts to `vocab`.

    `vocab` is a collections.Counter updated in place.
    """
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)
# load all docs in a directory
def process_docs(directory, vocab):
    """Add the tokens of every .txt review in `directory` to `vocab`.

    `vocab` is a collections.Counter updated in place via
    add_doc_to_vocab; files without a .txt extension are skipped.
    """
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip files that do not have the right extension.
        # BUG FIX: the original used a bare `next`, which is just an
        # expression referencing the builtin and does nothing — the
        # non-.txt file was NOT skipped. `continue` actually skips it.
        if not filename.endswith(".txt"):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)
# save list to file
def save_list(lines, filename):
    """Write `lines` to `filename`, one item per line, no trailing newline."""
    data = '\n'.join(lines)
    # context manager guarantees the handle is closed (and the data
    # flushed) even if write() raises, unlike manual open/close
    with open(filename, 'w') as file:
        file.write(data)
# define vocab
vocab = Counter()
# add all docs (negative and positive reviews) to vocab
process_docs('txt_sentoken/neg', vocab)
process_docs('txt_sentoken/pos', vocab)
# print the size of the vocab
print(len(vocab))
# print the top words in the vocab
print(vocab.most_common(50))
# keep tokens occurring at least 5 times
# (comment previously said "> 5" while the code tests >=; the code wins)
min_occurrence = 5
tokens = [k for k, c in vocab.items() if c >= min_occurrence]
print(len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')
import string
import re
from os import listdir
from nltk.corpus import stopwords


def load_doc(filename):
    """Read a text file and return its entire contents as one string."""
    # context manager closes the file even if read() raises,
    # unlike the manual open/close pair it replaces
    with open(filename, 'r') as file:
        return file.read()


def clean_doc(doc):
    """Turn a raw review document into a list of cleaned tokens.

    Splits on whitespace, strips punctuation, then drops tokens that
    are non-alphabetic, English stop words, or a single character.
    """
    # split into tokens by whitespace
    tokens = doc.split()
    # prepare regex for punctuation filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    # remove the punctuation from each token
    tokens = [re_punc.sub('', w) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    # filter out short (single-character) tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


def save_list(lines, filename):
    """Write `lines` to `filename`, one item per line, no trailing newline."""
    data = '\n'.join(lines)
    # context manager guarantees the handle is closed even on error
    with open(filename, 'w') as file:
        file.write(data)
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    """Load one document, clean it, keep only in-vocab tokens,
    and return them joined into a single space-separated line."""
    cleaned = clean_doc(load_doc(filename))
    kept = [token for token in cleaned if token in vocab]
    return ' '.join(kept)
# load all docs in a directory
def process_docs(directory, vocab):
    """Return one cleaned, vocab-filtered line per .txt review in `directory`.

    Files without a .txt extension are skipped; each kept file is
    converted by doc_to_line and collected into the returned list.
    """
    lines = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # skip files that don't have the right extension.
        # BUG FIX: the original used a bare `next`, which is just an
        # expression referencing the builtin and does nothing — the
        # non-.txt file was NOT skipped. `continue` actually skips it.
        if not filename.endswith(".txt"):
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines


# load the vocabulary saved by the first script
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
# set gives O(1) membership tests during filtering
vocab = set(vocab)
# prepare negative reviews
negative_lines = process_docs('txt_sentoken/neg', vocab)
save_list(negative_lines, 'negative.txt')
# prepare positive reviews
positive_lines = process_docs('txt_sentoken/pos', vocab)
save_list(positive_lines, 'positive.txt')