-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain2.py
More file actions
98 lines (82 loc) · 4.5 KB
/
main2.py
File metadata and controls
98 lines (82 loc) · 4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
import numpy as np
from bs4 import BeautifulSoup
# Function to get embeddings from Ollama's 'nomic-embed-text' model
def get_embeddings(texts, timeout=60):
    """Embed each text with Ollama's 'nomic-embed-text' model.

    One POST request is sent per text to the local Ollama ``/api/embed``
    endpoint, and the first entry of the response's ``embeddings`` list is
    used as that text's vector.

    Failures are best-effort: a non-200 status or a transport error
    (connection refused, timeout, ...) is reported on stdout and the text is
    represented by a zero vector instead of aborting the whole batch.

    Args:
        texts: Iterable of strings to embed.
        timeout: Seconds to wait for each HTTP request before treating it as
            a failure. Passed straight to ``requests.post``.

    Returns:
        A NumPy array with one row per input text, in input order. An empty
        input yields an empty array.
    """
    # Only used when *no* request succeeds, so there is no real embedding
    # to take the dimensionality from (keeps the historical default).
    fallback_dim = 256

    vectors = []
    for text in texts:
        vector = None  # stays None when this text could not be embedded
        try:
            response = requests.post(
                "http://localhost:11434/api/embed",
                json={"model": "nomic-embed-text", "input": text},
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print(f"Error in getting embedding for: {text} ({exc})")
        else:
            if response.status_code == 200:
                vector = np.array(response.json()['embeddings'][0])
            else:
                print(f"Error in getting embedding for: {text}")
        vectors.append(vector)

    # Size the zero-vector placeholders from a real embedding so every row
    # has the same length; a fixed 256 makes np.array() choke on a ragged
    # list whenever the model returns vectors of any other size.
    dim = next((v.shape[0] for v in vectors if v is not None), fallback_dim)
    return np.array([np.zeros(dim) if v is None else v for v in vectors])
# Function to extract text from an HTML file or webpage
def extract_text_from_html(html_content):
    """Return the text content of an HTML document.

    The markup is parsed with BeautifulSoup's built-in ``html.parser`` and
    the result of ``get_text()`` is returned unchanged.
    """
    return BeautifulSoup(html_content, "html.parser").get_text()
# Function to calculate cosine similarity using numpy
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when the product
        of the norms is zero (i.e. at least one vector has no direction).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector would otherwise divide by zero and yield NaN, which
    # poisons any later comparison or sort on the similarity scores.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product
# Function to perform similarity search with confidence score and threshold
def find_most_similar_tags(tag_descriptions, html_content, threshold=0.7, top_n=3):
    """Pick the tag groups whose descriptions best match an HTML document.

    Each tag description and the text extracted from ``html_content`` are
    embedded, and the tags are ranked by cosine similarity to the document.

    Args:
        tag_descriptions: List of dicts, each with a ``"tags"`` entry and a
            ``"tags_description"`` string (the text that gets embedded).
        html_content: HTML markup of the document to classify.
        threshold: Minimum similarity for a tag to count as a match.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` dicts with keys ``"tags"``, ``"tags_description"``
        and ``"similarity_score"``, ordered from most to least similar.
        Tags scoring at least ``threshold`` are returned when there are any;
        otherwise the best-scoring tags overall are returned as a fallback.
        An empty ``tag_descriptions`` yields an empty list.
    """
    # Nothing to rank: skip the embedding requests entirely.
    if not tag_descriptions:
        return []

    # Embed the tag descriptions and the document text.
    tag_embeddings = get_embeddings([desc['tags_description'] for desc in tag_descriptions])
    extracted_text = extract_text_from_html(html_content)
    html_text_embedding = get_embeddings([extracted_text])[0]

    similarities = [
        cosine_similarity(html_text_embedding, tag_embedding)
        for tag_embedding in tag_embeddings
    ]

    def rank_key(idx):
        # NaN compares false against everything, which would make the sort
        # order arbitrary; push such scores to the bottom of the ranking.
        score = similarities[idx]
        return -np.inf if np.isnan(score) else score

    # Best match first, so both the thresholded result and the fallback
    # come out ordered by similarity and can be cut to top_n.
    ranked = sorted(range(len(similarities)), key=rank_key, reverse=True)
    above_threshold = [idx for idx in ranked if similarities[idx] >= threshold]

    # Fall back to the overall best tags when nothing clears the threshold.
    top_indices = (above_threshold or ranked)[:max(top_n, 0)]

    return [
        {
            "tags": tag_descriptions[idx]["tags"],
            "tags_description": tag_descriptions[idx]["tags_description"],
            "similarity_score": similarities[idx],
        }
        for idx in top_indices
    ]
# Sample tag groups; each description is the text that gets embedded
tag_descriptions = [
    {
        "tags": ["entertainment", "novels"],
        "tags_description": "Entertainment and novels include books, movies, TV shows, and other forms of storytelling.",
    },
    {
        "tags": ["news", "social media"],
        "tags_description": "News and social media cover updates, discussions, and trends in the digital space.",
    },
    {
        "tags": ["learning", "education", "science"],
        "tags_description": "Learning and education focus on the acquisition of knowledge, skills, and scientific understanding.",
    },
    {
        "tags": ["anime", "entertainment"],
        "tags_description": "Anime is a genre of animation originating from Japan, encompassing various artistic styles and themes.",
    },
    {
        "tags": ["wiki", "information"],
        "tags_description": "Wiki refers to websites that allow collaborative editing of content, like Wikipedia.",
    },
    {
        "tags": ["math", "education"],
        "tags_description": "Mathematics is the study of numbers, shapes, patterns, and their relationships.",
    },
    {
        "tags": ["science", "education"],
        "tags_description": "Science focuses on the study of the physical and natural world through observation and experimentation.",
    },
]

# Sample HTML document to match against the tag descriptions
html_content = """
<html>
<head><title>Introduction to Anime</title></head>
<body>
<h1>What is Anime?</h1>
<p>Anime refers to Japanese animation that has become a global phenomenon. It covers various genres such as action, drama, fantasy, and more.</p>
<p>Many popular anime series have expanded into manga, video games, and other forms of media.</p>
</body>
</html>
"""

# Rank the sample tags against the sample document
similar_tags = find_most_similar_tags(
    tag_descriptions,
    html_content,
    threshold=0.7,
)

# Report every returned tag group with its score to three decimals
print("Most similar tags:")
for tag in similar_tags:
    print(f"Tags: {', '.join(tag['tags'])}, Similarity Score: {tag['similarity_score']:.3f}")