import re
-from math import log2
+import math


-def search(documents, query):
-    inverted_index = get_inverted_index(documents)
-    result_with_relevance = {}
-    query_tokens = tokenize(query)
-    for query_token in query_tokens:
-        documents_has_token = inverted_index.get(query_token)
-        if not documents_has_token:
-            continue
-        for document in documents_has_token:
-            result_with_relevance.setdefault(document['id'], 0)
-            token_tf_idf = get_tf_idf(document['id'], documents_has_token)
-            result_with_relevance[document['id']] += token_tf_idf
-    result = sorted(
-        result_with_relevance,
-        key=result_with_relevance.get,
-        reverse=True)
-    return result
+def compare_by_tf_idf(item1, item2):
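+    """Three-way compare of two result dicts by their 'TFIDF' score."""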
+    if item1['TFIDF'] < item2['TFIDF']:
+        return -1
+    elif item1['TFIDF'] == item2['TFIDF']:
+        return 0
+    else:
+        return 1


-def get_inverted_index(documents):
-    inverted_index = {}
-    tokens_all = set()
-    documents_as_tokens = []
-    for document in documents:
-        document_tokens = tokenize(document['text'])
-        current_document_tokenized = {
-            'id': document['id'],
-            'tokens': document_tokens
-        }
-        documents_as_tokens.append(current_document_tokenized)
-        tokens_all.update(document_tokens)
-    for token in tokens_all:
-        inverted_index[token] = []
-        idf = get_idf(documents_as_tokens, token)
-        for document in documents_as_tokens:
-            if token in document['tokens']:
-                tf = get_tf(document['tokens'], token)
-                current_document_with_relevance = {
-                    'id': document['id'],
-                    'tf-idf': round(tf * idf, 4)
-                }
-                inverted_index[token].append(current_document_with_relevance)
-    return inverted_index
+def quick_sort(items, comparator, direction='asc'):
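+    """Recursively quicksort items using a three-way comparator.
+
+    direction='asc' gives ascending order, anything else descending:
+    quick_sort([3, 1, 2], lambda a, b: a - b) == [1, 2, 3]
+    """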
+    items_length = len(items)

+    if items_length == 0:
+        return []

-def tokenize(text):
-    tokens = []
-    text_lines = text.split('\n')
-    for text_line in text_lines:
-        text_line_tokenized = [
-            get_term(token)
-            for token in text_line.split(' ') if token
-        ]
-        tokens.extend(text_line_tokenized)
-    return tokens
+    index = items_length // 2
+    element = items[index]

+    smaller_items = [
+        items[i]
+        for i in range(items_length)
+        if comparator(items[i], element) < 0 and i != index
+    ]

-def get_term(token):
-    return re.sub(r'[^\w\s]', '', token).lower()
+    bigger_items = [
+        items[i]
+        for i in range(items_length)
+        if comparator(items[i], element) >= 0 and i != index
+    ]

+    sorted_smaller_items = quick_sort(smaller_items, comparator, direction)
+    sorted_bigger_items = quick_sort(bigger_items, comparator, direction)

-def get_tf_idf(document_id, documents_has_token):
-    filter_document_has_token = filter(
-        lambda document: document['id'] == document_id, documents_has_token
-    )
-    document_has_token = list(filter_document_has_token)[0]
-    tf_idf = document_has_token['tf-idf']
-    return tf_idf
+    if direction == 'asc':
+        return [*sorted_smaller_items, element, *sorted_bigger_items]
+    return [*sorted_bigger_items, element, *sorted_smaller_items]


-def get_tf(document_as_tokens, token):
-    document_tokens_count = len(document_as_tokens)
-    token_in_document_count = document_as_tokens.count(token)
-    tf = token_in_document_count / document_tokens_count
-    return tf
+def get_index(docs):
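+    """Build an inverted index: term -> [{'id', 'TF', 'TFIDF'}, ...]."""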
+    index = {}
+    docs_count = len(docs)
+    for doc in docs:
+        temp_dict = {}
+        number_words = 0
+        for token in doc['text'].split():
+            term = re.findall(r'\w+', token)
+            index_key = ''.join(term).lower()
+            if not index_key:
+                continue
+            if index_key not in temp_dict:
+                temp_dict[index_key] = 0
+            temp_dict[index_key] += 1
+            number_words += 1
+        for key, tf in temp_dict.items():
+            if key not in index:
+                index[key] = [{'id': doc['id'], 'TF': tf / number_words}]
+            else:
+                index[key].append({'id': doc['id'], 'TF': tf / number_words})
+    for key, list_doc in index.items():
+        docs_with_term = len(list_doc)
+        # Plain IDF would be: math.log10(docs_count / docs_with_term)
+        # Instead we use a somewhat "smoothed" variant of the basic formula:
+        # log2(1 + (docs_count - docs_with_term + 1) / (docs_with_term + 0.5))
+        # docs_count - total number of documents
+        # docs_with_term - number of documents that contain the search term
+        # part_idf is split out so the line break satisfies both W503 and W504.
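+        # For example, with 4 docs and the term in 1 of them:
+        # log2(1 + (4 - 1 + 1) / (1 + 0.5)) = log2(3.67) ≈ 1.87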
+        part_idf = (docs_count - docs_with_term + 1) / (docs_with_term + 0.5)
+        idf = math.log2(1 + part_idf)
+        for doc in list_doc:
+            doc['TFIDF'] = doc['TF'] * idf
+    return index


-def get_idf(documents_as_tokens, token):
-    documents_count = len(documents_as_tokens)
-    filter_documents_has_token = filter(
-        lambda document: token in document['tokens'], documents_as_tokens
-    )
-    documents_has_token = list(filter_documents_has_token)
-    documents_has_token_count = len(documents_has_token)
-    idf = log2((documents_count + 1) / (documents_has_token_count + 0.5))
-    return idf
+def search(docs: list, search_pattern: str):
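+    """Return document ids ranked by the summed TF-IDF of the query terms.
+
+    docs is a list of {'id': ..., 'text': ...} dicts; the best match
+    comes first, e.g. search([{'id': 1, 'text': 'a cat'}], 'cat') == [1].
+    """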
+    keys = []
+    index = get_index(docs)
+    search_results = {}
+    search_words = search_pattern.split()
+    for search_word in search_words:
+        term = re.findall(r'\w+', search_word)
+        search_word_lower = ''.join(term).lower()
+        if search_word_lower in index:
+            for doc in index[search_word_lower]:
+                if doc['id'] not in search_results:
+                    search_results[doc['id']] = doc['TFIDF']
+                else:
+                    search_results[doc['id']] += doc['TFIDF']
+    search_results_list = []
+    for doc_id, tf_idf in search_results.items():
+        search_results_list.append({'id': doc_id, 'TFIDF': tf_idf})
+    sorted_results = quick_sort(search_results_list,
+                                compare_by_tf_idf,
+                                'desc')
+    for result in sorted_results:
+        keys.append(result['id'])
+    return keys