-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlocalIndexCreation.cpp
96 lines (89 loc) · 2.39 KB
/
localIndexCreation.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#include "header.h"
// Ordering predicate used when sorting posting lists: returns true when
// `a` has a strictly larger frequency than `b`, so the most frequent
// entries sort first.
bool compareDocFreq(docFreq &a,docFreq &b)
{
    const bool aIsMoreFrequent = b.frequency < a.frequency;
    return aIsMoreFrequent;
}
map<string,vector<docFreq> > createLocalIndex(char* dirPath, map<string, bool> stopWordsIndex,int my_rank)
{
map<string,vector<docFreq> > localIndex;
DIR* dir;
struct dirent *docInfo;
fstream doc;
// unsigned int fileID = my_rank*1000000;
if((dir = opendir(dirPath)) != NULL)
{
while((docInfo = readdir(dir)) != NULL)
{
if(strcmp(docInfo->d_name,".") == 0 || strcmp(docInfo->d_name,"..") == 0)
{
continue;
}
unsigned int docID = atoi(docInfo->d_name); // file should not have an extension
// unsigned int docID = fileID++;
char* file_path = (char*)malloc(500*sizeof(char));
strcpy(file_path,dirPath);
strcat(file_path,"/");
strcat(file_path,docInfo->d_name);
doc.open(file_path);
map<string,bool> isAlreadyPresent; // if word has occured at least once in the document opened
string word;
char temp_word[512];
while(doc >> temp_word)
{
char* token = strtok(temp_word,"\n ,-.:;?!\"");
while(token != NULL)
{
word = string(token);
transform(word.begin(),word.end(),word.begin(),::tolower);
if(stopWordsIndex[word])
{
token = strtok(NULL,"\n ,-.:;?!");
continue;
}
if(localIndex[word].size() == 0)
{
vector<docFreq> docWordFreq;
localIndex[word] = docWordFreq;
}
if(!isAlreadyPresent[word])
{
isAlreadyPresent[word] = true;
docFreq d1;
d1.docID = docID;
d1.frequency = 1;
localIndex[word].push_back(d1);
}
else
{
vector<docFreq>::iterator itr = localIndex[word].end();
itr--;
itr->frequency++;
}
token = strtok(NULL,"\n ,-.:;?!\"");
}
}
doc.close();
}
closedir(dir);
}
map<string,vector<docFreq> >::iterator mapItr;
for(mapItr = localIndex.begin(); mapItr != localIndex.end(); mapItr++)
{
sort((mapItr->second).begin(),(mapItr->second).end(),compareDocFreq);
}
return localIndex;
}
void printLocalIndex(map<string,vector<docFreq> > localIndex)
{
map<string,vector<docFreq> >::iterator mapItr;
for(mapItr = localIndex.begin(); mapItr != localIndex.end(); mapItr++)
{
cout << mapItr->first << ":\t";
vector<docFreq>::iterator itr;
for(itr = (mapItr->second).begin(); itr != (mapItr->second).end(); itr++)
{
cout << itr->docID << ":" << itr->frequency << " ";
}
cout << "\n";
}
}