-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathposting.py
126 lines (100 loc) · 3.37 KB
/
posting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from utils import log_tf
class Posting:
"""
Provides the basic information of a posting inside a document such as document id, term frequency.
Other descriptive attributes can be added such as positioning indexes to allow proximity queries.
The class act is just a node for a linked list class.
"""
def __init__(self, document_id, term_frequency):
self.document_id = document_id
self.term_frequency = term_frequency
self.next = None
self.previous = None
def __repr__(self):
return str(self.info)
@property
def log_tf(self):
"""
:return: log_tf: Logarithmic Term Frequency
"""
return log_tf(self.term_frequency)
@property
def raw_tf(self):
"""
:return: term frequency
"""
return self.term_frequency
@property
def info(self):
"""
:return: Python Dictionary that describe the Posting.
"""
return {
'doc_id': self.document_id,
'raw_tf': self.raw_tf,
'log_tf': self.log_tf,
}
class PostingList:
"""
Represents the Posting List of a term in the index. The structure used is a doubly linked list.
Keep in mind that each posting is inserted in ascending order by document id to help in query processing.
Such queries uses INTERSECTION, UNION.
"""
def __init__(self):
self.head = None
self.tail = None
self.length = 0
def __repr__(self):
return str(self.postings)
def insert(self, document_id: int, term_frequency: int):
"""
Insert a posting inside the postings list.
:param document_id: Integer
:param term_frequency: Integer
"""
posting = Posting(document_id=document_id, term_frequency=term_frequency)
if self.head is None:
self.head = posting
self.tail = posting
self.head.previous = None
elif self.head.document_id > posting.document_id:
self.head.previous = posting
posting.next = self.head
self.head = posting
elif self.tail.document_id < posting.document_id:
posting.previous = self.tail
self.tail.next = posting
self.tail = posting
else:
current = self.head.next
while current.document_id < posting.document_id:
current = current.next
current.previous.next = posting
posting.previous = current.previous
current.previous = posting
posting.next = current
self.length += 1
def exists(self, document_id: int):
"""
Check if a posting exists in a posting list given its document id.
:param document_id: Integer
:return: True if exists; False otherwise.
"""
current = self.head
while current:
if current.document_id == document_id:
return True
current = current.next
return False
@property
def postings(self):
"""
All postings inside a posting list.
:return: (key, value) pairs of document id and posting info.
"""
postings = {}
current = self.head
while current:
postings[current.document_id] = current.info
current = current.next
return postings