Narrative-Nexus/preprocessing.py at main · Keerthana-webdev/Narrative-Nexus · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import nltk #natural lang toolkit(used fr text processing n tokenization)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from cleaning import clean_text

#Downloads
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    #Original stats
    original_words = len(text.split())
    original_chars = len(text)

    #Cleaning
    cleaned = clean_text(text)

    #Tokenization
    tokens = nltk.word_tokenize(cleaned)

    #Stopwords removal
    tokens = [t for t in tokens if t not in stop_words]

    #Lemmatization
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    #Final processed text
    processed_text = " ".join(tokens)

    #Cleaned stats
    cleaned_words = len(tokens)
    cleaned_chars = len(processed_text)

    #Reduction %
    word_reduction = round(((original_words - cleaned_words)/original_words)*100,2) if original_words else 0
    char_reduction = round(((original_chars - cleaned_chars)/original_chars)*100,2) if original_chars else 0

    return {
        "original_text": text,
        "processed_text": processed_text,
        "original_words": original_words,
        "original_chars": original_chars,
        "cleaned_words": cleaned_words,
        "cleaned_chars": cleaned_chars,
        "word_reduction": word_reduction,
        "char_reduction": char_reduction,
        "tokens": tokens
    }