1 | 1 | import re |
2 | | -from collections import defaultdict, Counter |
3 | | -from itertools import chain |
4 | | -from math import log |
5 | | - |
6 | | -def preprocess(text): |
7 | | - return re.sub(r'[^\w\s]|(?<!\w)_', '', text).lower() |
8 | | - |
9 | | - |
10 | | -def search(docs: list, words: str) -> list: |
11 | | - """ |
12 | | - Searches for words in the documents and returns their IDs, |
13 | | - ranked by the total count of word occurrences in the document text. |
14 | | - """ |
15 | | - words = preprocess(words).split() |
16 | | - result = [] |
17 | | - |
18 | | - for doc in docs: |
19 | | - cleaned_text = preprocess(doc['text']) |
20 | | - word_count = sum(Counter(cleaned_text.split())[word] for word in words) |
21 | | - if word_count: |
22 | | - result.append({'id': doc['id'], 'word_count': word_count}) |
23 | | - |
24 | | - result.sort(key=lambda x: x['word_count'], reverse=True) |
25 | | - |
26 | | - return [item['id'] for item in result] |
27 | | - |
28 | | - |
29 | | -def get_inverted_index(docs: list) -> dict: |
30 | | - """ |
31 | | - Builds an inverted index from a list of documents. |
32 | | - """ |
33 | | - inverted_index = defaultdict(list) |
34 | | - |
35 | | - # Create an index for each word |
36 | | - for doc in docs: |
37 | | - words = preprocess(doc['text']).split() |
38 | | - word_counts = Counter(words) |
39 | | - |
40 | | - for word, count in word_counts.items(): |
41 | | - if word not in inverted_index: |
42 | | - inverted_index[word] = [doc['id']] |
43 | | - else: |
44 | | - inverted_index[word].append(doc['id']) |
45 | | - |
46 | | - return dict(inverted_index) |
47 | | - |
48 | | - |
49 | | -def get_tf(text: str) -> dict: |
50 | | - """ Compute TF """ |
51 | | - cleaned_text = preprocess(text).split() |
52 | | - total_words = len(cleaned_text) |
53 | | - tf = defaultdict(float) |
54 | | - |
55 | | - for word in cleaned_text: |
56 | | - tf[word] += 1 |
57 | | - |
58 | | - for word in tf: |
59 | | - tf[word] /= total_words |
60 | | - |
61 | | - return tf |
62 | | - |
63 | | - |
64 | | -def get_idf(docs): |
65 | | - """ Compute IDF """ |
66 | | - N = len(docs) |
67 | | - idf = {} |
68 | | - |
69 | | - for doc in docs: |
70 | | - words_in_doc = preprocess(doc['text']).split() |
71 | | - for word in words_in_doc: |
72 | | - q_of_docs_with_word = len(search(docs, word)) |
73 | | - idf[word] = log(N / (1 + q_of_docs_with_word)) + 1 |
74 | | - |
75 | | - return idf |
76 | | - |
77 | | - |
78 | | -def get_tf_idf(docs): |
79 | | - """ Compute TF-IDF """ |
80 | | - idf = get_idf(docs) |
81 | | - tf_idf = {} |
82 | | - |
83 | | - for doc in docs: |
84 | | - doc_id = doc['id'] |
85 | | - tf = get_tf(doc['text']) |
86 | | - tf_idf[doc_id] = { |
87 | | - word: round(tf[word] * idf[word], 4) |
88 | | - for word in tf |
| 2 | +from math import log2 |
| 3 | + |
| 4 | + |
| 5 | +def search(documents, query): |
| 6 | + inverted_index = get_inverted_index(documents) |
| 7 | + result_with_relevance = {} |
| 8 | + query_tokens = tokenize(query) |
| 9 | + for query_token in query_tokens: |
| 10 | + documents_has_token = inverted_index.get(query_token) |
| 11 | + if not documents_has_token: |
| 12 | + continue |
| 13 | + for document in documents_has_token: |
| 14 | + result_with_relevance.setdefault(document['id'], 0) |
| 15 | + token_tf_idf = get_tf_idf(document['id'], documents_has_token) |
| 16 | + result_with_relevance[document['id']] += token_tf_idf |
| 17 | + result = sorted( |
| 18 | + result_with_relevance, |
| 19 | + key=result_with_relevance.get, |
| 20 | + reverse=True) |
| 21 | + return result |
| 22 | + |
| 23 | + |
| 24 | +def get_inverted_index(documents): |
| 25 | + inverted_index = {} |
| 26 | + tokens_all = set() |
| 27 | + documents_as_tokens = [] |
| 28 | + for document in documents: |
| 29 | + document_tokens = tokenize(document['text']) |
| 30 | + current_document_tokenized = { |
| 31 | + 'id': document['id'], |
| 32 | + 'tokens': document_tokens |
89 | 33 | } |
90 | | - |
| 34 | + documents_as_tokens.append(current_document_tokenized) |
| 35 | + tokens_all.update(document_tokens) |
| 36 | + for token in tokens_all: |
| 37 | + inverted_index[token] = [] |
| 38 | + idf = get_idf(documents_as_tokens, token) |
| 39 | + for document in documents_as_tokens: |
| 40 | + if token in document['tokens']: |
| 41 | + tf = get_tf(document['tokens'], token) |
| 42 | + current_document_with_relevance = { |
| 43 | + 'id': document['id'], |
| 44 | + 'tf-idf': round(tf * idf, 4) |
| 45 | + } |
| 46 | + inverted_index[token].append(current_document_with_relevance) |
| 47 | + return inverted_index |
| 48 | + |
| 49 | + |
| 50 | +def tokenize(text): |
| 51 | + tokens = [] |
| 52 | + text_lines = text.split('\n') |
| 53 | + for text_line in text_lines: |
| 54 | + text_line_tokenized = [ |
| 55 | + get_term(token) |
| 56 | + for token in text_line.split(' ') if token |
| 57 | + ] |
| 58 | + tokens.extend(text_line_tokenized) |
| 59 | + return tokens |
| 60 | + |
| 61 | + |
| 62 | +def get_term(token): |
| 63 | + return re.sub(r'[^\w\s]', '', token).lower() |
| 64 | + |
| 65 | + |
| 66 | +def get_tf_idf(document_id, documents_has_token): |
| 67 | + filter_document_has_token = filter( |
| 68 | + lambda document: document['id'] == document_id, documents_has_token |
| 69 | + ) |
| 70 | + document_has_token = list(filter_document_has_token)[0] |
| 71 | + tf_idf = document_has_token['tf-idf'] |
91 | 72 | return tf_idf |
92 | 73 |
93 | 74 |
94 | | -# doc1 = "I can't shoot straight unless I've had a pint!" |
95 | | -# doc2 = "Don't shoot shoot shoot that thing at me." |
96 | | -# doc3 = "I'm your shooter." |
| 75 | +def get_tf(document_as_tokens, token): |
| 76 | + document_tokens_count = len(document_as_tokens) |
| 77 | + token_in_document_count = document_as_tokens.count(token) |
| 78 | + tf = token_in_document_count / document_tokens_count |
| 79 | + return tf |
97 | 80 |
98 | | -# docs = [ |
99 | | -# {'id': 'doc1', 'text': doc1}, |
100 | | -# {'id': 'doc2', 'text': doc2}, |
101 | | -# {'id': 'doc3', 'text': doc3}, |
102 | | -# ] |
103 | 81 |
104 | | -# pprint(get_tf_idf(docs)) |
| 82 | +def get_idf(documents_as_tokens, token): |
| 83 | + documents_count = len(documents_as_tokens) |
| 84 | + filter_documents_has_token = filter( |
| 85 | + lambda document: token in document['tokens'], documents_as_tokens |
| 86 | + ) |
| 87 | + documents_has_token = list(filter_documents_has_token) |
| 88 | + documents_has_token_count = len(documents_has_token) |
| 89 | + idf = log2((documents_count + 1) / (documents_has_token_count + 0.5)) |
| 90 | + return idf |
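
The removed file ended with a commented-out usage example; the sketch below reuses those same documents against the new API. It is a minimal smoke test, not part of the diff, and it assumes it runs in (or imports from) the module this change modifies:

```python
from pprint import pprint

# Example documents reused from the old commented-out block.
doc1 = "I can't shoot straight unless I've had a pint!"
doc2 = "Don't shoot shoot shoot that thing at me."
doc3 = "I'm your shooter."

docs = [
    {'id': 'doc1', 'text': doc1},
    {'id': 'doc2', 'text': doc2},
    {'id': 'doc3', 'text': doc3},
]

# Each posting list holds the document id and its rounded TF-IDF weight.
# For 'shoot': idf = log2((3 + 1) / (2 + 0.5)) ≈ 0.678, giving doc2 a weight
# of 0.2543 (tf = 3/8) and doc1 a weight of 0.0753 (tf = 1/9).
pprint(get_inverted_index(docs)['shoot'])

# doc2 ranks first: 'shoot' occurs three times there, and it is the only
# document containing 'at' and 'me'.
print(search(docs, 'shoot at me'))  # ['doc2', 'doc1']
```

The smoothed IDF in get_idf, log2((documents_count + 1) / (documents_has_token_count + 0.5)), stays positive even when a token occurs in every document, so common words still contribute a small weight instead of zeroing out.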