This repository was archived by the owner on Oct 18, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathngram.py
57 lines (51 loc) · 1.69 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
# Stop-word removal: stop words are very common words (e.g. "if", "but",
# "we", "he", "she") that are filtered out of the text.
# The English stop-word list is fetched from NLTK once, when this module is
# loaded.
# NOTE(review): presumably this needs the NLTK "stopwords" corpus to be
# available locally -- confirm before running on a fresh machine.
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    """Strip English stop words from every document in *corpus*.

    Each document is split on whitespace, every word that appears in the
    module-level ``english_stop_words`` collection is dropped, and the
    remaining words are re-joined with single spaces.  Matching is exact
    (case-sensitive, no punctuation stripping).

    Args:
        corpus: Iterable of document strings.  Passing a bare string makes
            the loop run over its individual characters, so wrap a single
            document in a list.

    Returns:
        A list of cleaned document strings, one per input document, in
        input order.
    """
    # Build the lookup set once per call: testing membership against the
    # original list would rescan it for every single word.
    stop_words = set(english_stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_words)
        for review in corpus
    ]
#print(remove_stop_words('bromwell high is a cartoon comedy'))
def stemming():
    """Placeholder for a stemming step; no stemming is implemented yet.

    Returns:
        Always ``0``.
    """
    return 0
def split(inputstring):
    """Break *inputstring* into a list of whitespace-separated words.

    This is a thin wrapper around ``inputstring.split()`` called with no
    arguments, so runs of whitespace count as one separator and leading or
    trailing whitespace produces no empty entries.

    Args:
        inputstring: The text to tokenize.

    Returns:
        The list of words, in order of appearance.
    """
    return inputstring.split()
def getNGrams(wordlist, n):
    """Return every run of *n* consecutive items taken from *wordlist*.

    Args:
        wordlist: The already-split words (for example the result of
            ``str.split()``).  Any iterable is accepted; it is copied into
            a list before slicing.
        n: The "n" in n-gram, i.e. how many items each window holds.
            Must be at least 1.

    Returns:
        A list of n-grams in order of appearance, each one a list of ``n``
        items.  The result is empty when *wordlist* holds fewer than ``n``
        items.

    Raises:
        ValueError: If ``n`` is smaller than 1.  Without this check the
            window arithmetic silently produced empty or wrapped-around
            slices instead of n-grams.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    tokens = list(wordlist)
    # range() is empty when the count is <= 0, which covers inputs shorter
    # than one full window.
    return [tokens[start:start + n] for start in range(len(tokens) - n + 1)]
#teststring = 'hello it is a very nice day'
#print(getNGrams(list(teststring.split()),3))
# Vectorizer restricted to 2-grams (ngram_range lower and upper bound are
# both 2).  In the code visible here it is referenced only by the disabled
# demo below.
cv = CountVectorizer(ngram_range = (2,2))
# Tiny hand-written corpus of three sentences, used for quick experiments.
corpus = ['this is a sentence is', 'this is another sentence',
'this is the third sentence']
# Disabled scratch demo: fit `cv` on `corpus`, transform it into a count
# matrix, and show the matrix both raw and as a DataFrame with one column
# per feature name.
# X = cv.fit(corpus)
# X = cv.transform(corpus)
# print(X.shape)
# print(X)
# print(X.toarray())
# df = pd.DataFrame(X.toarray(),columns = cv.get_feature_names())
# print(df)
# Read sample.txt in one go and wrap its text in a one-element list, so
# `data` has the same "list of document strings" shape as `corpus`.
with open('sample.txt', 'r') as file:
    data = [file.read()]
def run(datasample):  # datasample = array of strings
    """Count 2-grams in a collection of documents.

    A fresh ``CountVectorizer`` limited to 2-grams is fitted on
    *datasample*, and the resulting count matrix is returned as a
    DataFrame.

    Args:
        datasample: Array (list) of document strings.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        2-gram found by the vectorizer; each cell is that 2-gram's count in
        that document.
    """
    cv2 = CountVectorizer(ngram_range=(2, 2))
    X = cv2.fit_transform(datasample)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back on older installs.
    if hasattr(cv2, 'get_feature_names_out'):
        feature_names = cv2.get_feature_names_out()
    else:
        feature_names = cv2.get_feature_names()
    return pd.DataFrame(X.toarray(), columns=feature_names)
# Run the 2-gram count on the built-in toy corpus at module load; the
# returned DataFrame is not stored or printed.
run(corpus)