Skip to content

Commit f3f1021

Browse files
committed
Version 1: one-website scraping with Telegram bot
1 parent a3725f0 commit f3f1021

14 files changed

Lines changed: 429 additions & 0 deletions

File tree

.agents/rules/spider-safety.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Spider Safety & Stealth Protocol
2+
When managing or refactoring scripts in `/spiders`, follow these constraints:
3+
4+
1. **UC Mode Persistence**: Never remove `uc=True` or `headless2=True`. These are required to bypass Webnovel Cloudflare.
5+
2. **Process Management**: Always include `os.system("pkill -f chrome")` before driver initialization to prevent zombie processes on the ASUS TUF hardware.
6+
3. **Binary Paths**: Strictly use `/usr/bin/google-chrome` and `/usr/local/bin/chromedriver`.
7+
4. **Error Handling**: If a scrape returns 0 results, do not hallucinate code changes; check if the site's CSS classes have changed in the Antigravity Browser view.

.agents/tasks/scrape-fandom.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Task: Multi-Source Scrape
2+
**Objective**: Run spiders to collect fanfiction metadata for a specific keyword.
3+
4+
**Steps**:
5+
1. Run `python3 spiders/webnovel_spider.py --keyword "{keyword}"`
6+
2. Run `python3 spiders/ao3_spider.py --keyword "{keyword}"`
7+
3. Verify JSON files exist in `data/raw/`.
8+
4. Notify user of total novel count.

.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Environment files
2+
.env
3+
4+
# Cache and compiled files
5+
__pycache__/
6+
*.py[cod]
7+
*$py.class
8+
.pytest_cache/
9+
10+
# Data directory
11+
data/

config.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import os
from dotenv import load_dotenv

# Populate os.environ from a local .env file (if present) BEFORE the Config
# class body below runs its os.getenv() lookups.
load_dotenv()
5+
6+
class Config:
    """Project-wide settings, sourced from environment variables with defaults.

    Values are read once at import time (after load_dotenv() has run), so
    changing the environment later has no effect on this class.
    """

    # Telegram
    BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")  # no default: bot refuses to start without it

    # Paths — see .agents/rules/spider-safety.md: these binary locations are mandatory
    CHROME_BINARY = os.getenv("CHROME_BINARY_PATH", "/usr/bin/google-chrome")
    CHROME_DRIVER = os.getenv("CHROME_DRIVER_PATH", "/usr/local/bin/chromedriver")
    RAW_DATA_DIR = "data/raw"            # spiders write JSON here
    PROCESSED_DATA_DIR = "data/processed"

    # Models (served by a local Ollama instance)
    EMBED_MODEL = os.getenv("EMBED_MODEL_NAME", "nomic-embed-text")
    OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

    # Scraper Settings — number of page scrolls per scrape session
    DEFAULT_SCROLLS = 4
22+
23+
# Ensure both data directories exist on startup so spiders and the processor
# can write without per-call existence checks.
for _data_dir in (Config.RAW_DATA_DIR, Config.PROCESSED_DATA_DIR):
    os.makedirs(_data_dir, exist_ok=True)

core/embeddings.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import ollama
2+
import numpy as np
3+
from sklearn.metrics.pairwise import cosine_similarity
4+
from config import Config
5+
6+
def get_embeddings(texts):
    """Embed a batch of document strings with the configured Ollama model.

    Returns a numpy array of shape (len(texts), embedding_dim).
    """
    print(f"🧠 [FicSense] Generating embeddings for {len(texts)} items...")

    # nomic-embed-text performs better with the 'search_document:' prefix
    # on stored content (the matching query side uses 'search_query:').
    prefixed = ["search_document: " + text for text in texts]

    result = ollama.embed(model=Config.EMBED_MODEL, input=prefixed)
    return np.array(result['embeddings'])
16+
17+
def get_query_vector(query):
    """Embed the user's intent with the 'search_query:' prefix.

    Returns a numpy array of shape (1, embedding_dim), ready for
    cosine_similarity against a document matrix.
    """
    result = ollama.embed(model=Config.EMBED_MODEL, input="search_query: " + query)
    vector = np.array(result['embeddings'][0])
    return vector.reshape(1, -1)
21+
22+
def rank_novels(query, novels, top_n=15):
    """Rank *novels* by semantic cosine similarity to the user's *query* intent.

    Mutates each novel dict by attaching a float 'score', then returns the
    top_n highest-scoring entries (or [] on empty input / embedding failure).
    """
    if not novels:
        print("⚠️ No novels provided for ranking.")
        return []

    # Combine title and synopsis so the model gets context from both fields.
    descriptions = [f"{item['title']} {item['synopsis']}" for item in novels]

    # Vectorize documents and query; any Ollama failure degrades to no results.
    try:
        document_matrix = get_embeddings(descriptions)
        intent_vector = get_query_vector(query)
    except Exception as e:
        print(f"❌ Ollama Embedding Error: {e}")
        return []

    # Cosine similarity of the query against every document
    # (typically lands in the 0.5–0.9 range for related text).
    similarity_scores = cosine_similarity(intent_vector, document_matrix)[0]

    # Attach a score to each novel, then sort highest-relevance first.
    for novel, raw_score in zip(novels, similarity_scores):
        novel["score"] = float(raw_score)

    ranked = sorted(novels, key=lambda entry: entry["score"], reverse=True)

    print(f"✅ Ranked {len(ranked)} items. Returning top {top_n}.")
    return ranked[:top_n]

core/processor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import json
2+
3+
# Logic for embeddings and merging raw scraped data
def process_data(keyword):
    """Placeholder for Phase 2: merge raw spider output and embed it.

    Currently only logs the keyword and returns None; the redundant trailing
    `pass` after the print statement has been removed.
    """
    print(f"Processing data for {keyword}...")

downloaded_files/driver_fixing.lock

Whitespace-only changes.

interface/telegram_bot.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import logging
2+
import asyncio
3+
import sys
4+
import os
5+
import html
6+
from telegram import Update
7+
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters
8+
9+
# --- PATH CONFIGURATION ---
# Make the project root importable so `config` and `main` resolve when this
# file is launched directly from interface/ (fixes ModuleNotFoundError).
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if root_path not in sys.path:
    sys.path.append(root_path)
14+
15+
from config import Config
16+
from main import run_ficsense_pipeline
17+
18+
# Setup logging — configured once at import time; python-telegram-bot's
# internal loggers propagate here, so INFO gives polling/request visibility.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
23+
24+
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handles /start and /restart commands with a usage banner."""
    welcome = (
        "✨ <b>FicSense Antigravity Bot</b> ✨\n\n"
        "Ready to find your next read. Send your search as:\n"
        "<code>Fandom | Intent</code>\n\n"
        "Example: <code>onepiece | time travel ace</code>"
    )
    await update.message.reply_html(welcome)
32+
33+
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Parses 'Fandom | Intent' messages, runs the scrape->rank pipeline in a
    worker thread, and replies with a plain-text ranked list.

    Plain text (no parse_mode) is deliberate: scraped titles/synopses often
    contain characters that break Telegram HTML/Markdown parsing.
    """
    text = update.message.text

    if "|" not in text:
        await update.message.reply_html("❌ <b>Format Error!</b> Please use: <code>Fandom | Intent</code>")
        return

    # 1. Parse Inputs.
    # BUG FIX: split on the FIRST '|' only — a second '|' inside the intent
    # used to raise "too many values to unpack" and crash the handler.
    fandom, intent = [x.strip() for x in text.split("|", 1)]

    await update.message.reply_html(
        f"📡 <b>Request Received!</b>\n"
        f"🔍 Scoping: <code>{html.escape(fandom)}</code>\n"
        f"🎯 Target: <i>{html.escape(intent)}</i>\n\n"
        f"<i>Launching Chrome on ASUS TUF...</i>"
    )

    try:
        # 2. Run Pipeline — offloaded to a thread so polling stays alive.
        # get_running_loop() is the non-deprecated form inside a coroutine.
        loop = asyncio.get_running_loop()
        top_matches = await loop.run_in_executor(None, run_ficsense_pipeline, fandom, intent)

        if not top_matches:
            await update.message.reply_html("⚠️ <b>No results found.</b> The scraper might be blocked or no stories matched.")
            return

        # 3. Build PLAIN TEXT response (the "unbreakable" version).
        response = f"🎯 Top {len(top_matches)} Semantic Matches for {fandom.upper()}\n"
        response += "-------------------------------------\n\n"

        for i, res in enumerate(top_matches, 1):
            title = res.get('title', 'Unknown Title')
            link = res.get('link', 'No Link')
            score = res.get('score', 0.0)
            # Truncate long synopses to keep the message clean; FIX: only
            # append the ellipsis when text was actually cut.
            synopsis = res.get('synopsis', 'No synopsis available.')
            if len(synopsis) > 150:
                synopsis = synopsis[:150] + "..."

            # Simple string building with NO special formatting.
            line = f"{i}. {title}\n"
            line += f" 🔗 {link}\n"
            line += f" ⭐ Score: {score:.2f}\n"
            line += f" 📝 {synopsis}\n\n"

            # Flush before hitting Telegram's ~4096-char message limit.
            if len(response + line) > 4000:
                await update.message.reply_text(response)  # Plain text, no parse_mode
                response = ""
            response += line

        if response.strip():
            # Crucial: no parse_mode='HTML'/'Markdown' — plain text cannot fail to parse.
            await update.message.reply_text(response)

    except Exception as e:
        # logging.exception keeps the traceback; f-string logging dropped it.
        logging.exception("Pipeline error while handling message")
        # Even the error message should be plain text.
        await update.message.reply_text(f"❌ System Error: {str(e)}")
90+
91+
if __name__ == "__main__":
    if not Config.BOT_TOKEN:
        print("🛑 FATAL: No TELEGRAM_BOT_TOKEN found in .env file!")
    else:
        # Build the application with extended timeouts: a scrape can hold the
        # worker thread for minutes, and Telegram must not give up meanwhile.
        builder = ApplicationBuilder().token(Config.BOT_TOKEN)
        builder = builder.read_timeout(300).write_timeout(300)
        builder = builder.connect_timeout(300).pool_timeout(300)
        application = builder.build()

        # Register handlers: commands first, then the catch-all text handler.
        application.add_handler(CommandHandler(['start', 'restart'], start))
        application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message))

        print("🤖 FicSense Bot is alive with extended timeouts...")
        application.run_polling()

main.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import sys
2+
import os
3+
import json
4+
from spiders.webnovel_spider import scrape_webnovel
5+
from core.embeddings import rank_novels
6+
from config import Config
7+
8+
def run_ficsense_pipeline(keyword, intent):
    """The master sequence: Scrape -> Rank -> Return.

    Args:
        keyword: Fandom/search keyword handed to the Webnovel spider.
        intent: Free-text description of what the user wants to read.

    Returns:
        list[dict]: Top-ranked novel dicts (each carrying a 'score'),
        or [] when the scrape captured nothing.
    """
    print(f"🚀 [FicSense] Processing: {keyword} | Intent: {intent}")

    # 1. Trigger the visible spider; it returns the novel list directly.
    novels = scrape_webnovel(keyword, scrolls=Config.DEFAULT_SCROLLS)

    # FIX: `not novels` already covers the empty list — the extra
    # `len(novels) == 0` clause was redundant.
    if not novels:
        print("⚠️ No novels captured. Check Chrome window for blocks.")
        return []

    # 2. Semantic ranking via Ollama embeddings.
    print(f"🧠 Ranking {len(novels)} novels against intent: '{intent}'")
    top_matches = rank_novels(intent, novels, top_n=15)

    # 3. Return the ranked results to the caller (Bot or CLI).
    return top_matches
26+
27+
if __name__ == "__main__":
    # Fallback for CLI testing: python3 main.py onepiece "time travel"
    cli_args = sys.argv[1:]
    fandom = cli_args[0] if cli_args else "onepiece"
    user_intent = cli_args[1] if len(cli_args) > 1 else "time travel"

    for i, res in enumerate(run_ficsense_pipeline(fandom, user_intent), 1):
        print(f"{i}. {res['title']} - Score: {res['score']:.4f}")

requirements.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
seleniumbase==4.24.5
selenium==4.18.1
argparse  # NOTE: argparse is in the standard library since Python 3.2 — this line is a no-op
# Used by the committed code but previously unlisted:
python-dotenv        # config.py: load_dotenv()
python-telegram-bot  # interface/telegram_bot.py
ollama               # core/embeddings.py
numpy                # core/embeddings.py
scikit-learn         # core/embeddings.py: cosine_similarity
# Future additions for Phase 2:
# requests

0 commit comments

Comments
 (0)