# index.py
# (Repository-viewer page chrome and the line-number gutter that preceded the
# code were extraction residue and have been removed.)
# Libraries ***********************************************************
# *********************************************************************
import os, re, sys, lucene
from subprocess import *
from java.nio.file import Paths
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.store import SimpleFSDirectory
import pandas as pd
import numpy as np
import csv as csv_api
# Init Lucene *********************************************************
# *********************************************************************
# The JVM has to be started before any Lucene class is used; it is launched
# with the java.awt.headless system property set to true.
_JVM_ARGS = ['-Djava.awt.headless=true']
lucene.initVM(vmargs=_JVM_ARGS)
# Main variables ******************************************************
# *********************************************************************
# Both folder names begin with "/" because the program section appends them
# to os.getcwd() by plain string concatenation.
rumor_folder = "/rumors"
index_folder = "%s_index" % rumor_folder
# Functions ***********************************************************
# *********************************************************************
def indexDirectory(dir):
    """Index every regular file located directly inside ``dir``.

    Entries that are not regular files (for example sub-directories) are
    ignored; the listing is not recursive. Each file is handed to
    ``indexFile`` together with the directory it lives in.
    """
    for entry in os.listdir(dir):
        if not os.path.isfile(os.path.join(dir, entry)):
            continue
        indexFile(dir, entry)
def indexFile(dir, filename):
    """Index one cluster CSV file as a single Lucene document.

    The file is read as a tab-separated table. The document stores the
    first ``Cluster_Lab`` value (as both ``label`` and ``name``), a score
    computed from the ``T`` column, and the absolute path of the file. It is
    added to the index through the module-level ``writer``.

    Files whose name does not end in ``.csv`` are skipped silently, and
    tables without any data row are skipped with a short notice. Before this
    change the first case raised ``NameError`` and the second
    ``ZeroDivisionError``.

    Args:
        dir: Directory that contains the file.
        filename: Name of the file inside ``dir``.

    Raises:
        KeyError: If the table lacks a ``Cluster_Lab`` or ``T`` column.
    """
    if not filename.endswith('.csv'):
        return

    path = os.path.join(dir, filename)
    print(" Cluster:  %s" % filename)

    # Python 2 needs 'U' for universal-newline reading; Python 3 text mode
    # already translates newlines and rejects 'U' from 3.11 on.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    # The context manager closes the handle even if parsing fails.
    with open(path, mode) as handle:
        tweetV = pd.read_csv(handle, delimiter="\t", engine='python')

    lentv = len(tweetV["T"])
    if lentv == 0:
        # No rows: there is no label to read and the score would divide by 0.
        print("  Skipped: no data rows")
        return

    # Label of the cluster: taken from the first row.
    label = tweetV["Cluster_Lab"].iloc[0]

    # Score of the document: (rows + sum(T)) * 100 / (2 * rows).
    # NOTE(review): presumably T lies in [-1, 1], which would put the score
    # in [0, 100] -- confirm against the data. With integer operands on
    # Python 2 this division truncates.
    score = (lentv + tweetV["T"].sum()) * 100 / (2 * lentv)

    doc = Document()
    doc.add(Field("label", label, TextField.TYPE_STORED))
    doc.add(Field("score", str(score), StringField.TYPE_STORED))
    doc.add(Field("name", label.strip(), TextField.TYPE_STORED))
    doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED))
    print("")
    writer.addDocument(doc)
# Program *************************************************************
# *********************************************************************
# create index directory
index_dir = os.getcwd() + index_folder
if not os.path.exists(index_dir):
    os.makedirs(index_dir)

# rumour directory (should be already created; an empty one is made otherwise)
rumor_dir = os.getcwd() + rumor_folder
if not os.path.exists(rumor_dir):
    os.makedirs(rumor_dir)

# get index storage
directory = SimpleFSDirectory(Paths.get(index_dir))

# get the analyzer, wrapped so that it is limited to 10000 tokens
analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 10000)
config = IndexWriterConfig(analyzer)

# get the index writer (used as a module-level global by indexFile)
writer = IndexWriter(directory, config)

# The rumour directory is a single path. It used to be split on os.pathsep,
# which broke whenever the working directory contained that character.
rumor_path = [rumor_dir]

try:
    for rumor_root in rumor_path:
        print("")
        print("Crawling on folder... %s" % rumor_root)
        print("")
        # Only sub-directories are crawled; files placed directly in the
        # rumour directory are not indexed.
        for name in os.listdir(rumor_root):
            path = os.path.join(rumor_root, name)
            if os.path.isdir(path):
                indexDirectory(path)
        print("")
    # finalize execution
    writer.commit()
finally:
    # Always close the writer so it is not left open when indexing fails.
    writer.close()