Skip to content

Commit f3f1021

Browse files
committed
Version 1: one-website scraping with Telegram bot
1 parent a3725f0 commit f3f1021

14 files changed

Lines changed: 429 additions & 0 deletions

File tree

.agents/rules/spider-safety.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Spider Safety & Stealth Protocol
2+
When managing or refactoring scripts in `/spiders`, follow these constraints:
3+
4+
1. **UC Mode Persistence**: Never remove `uc=True` or `headless2=True`. These are required to bypass Webnovel Cloudflare.
5+
2. **Process Management**: Always include `os.system("pkill -f chrome")` before driver initialization to prevent zombie processes on the ASUS TUF hardware.
6+
3. **Binary Paths**: Strictly use `/usr/bin/google-chrome` and `/usr/local/bin/chromedriver`.
7+
4. **Error Handling**: If a scrape returns 0 results, do not hallucinate code changes; check if the site's CSS classes have changed in the Antigravity Browser view.

.agents/tasks/scrape-fandom.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Task: Multi-Source Scrape
2+
**Objective**: Run spiders to collect fanfiction metadata for a specific keyword.
3+
4+
**Steps**:
5+
1. Run `python3 spiders/webnovel_spider.py --keyword "{keyword}"`
6+
2. Run `python3 spiders/ao3_spider.py --keyword "{keyword}"`
7+
3. Verify JSON files exist in `data/raw/`.
8+
4. Notify user of total novel count.

.gitignore

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Environment files
2+
.env
3+
4+
# Cache and compiled files
5+
__pycache__/
6+
*.py[cod]
7+
*$py.class
8+
.pytest_cache/
9+
10+
# Data directory
11+
data/

config.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import os
from dotenv import load_dotenv

# Populate os.environ from a local .env file (if present) BEFORE the Config
# class body below runs its os.getenv() lookups.
load_dotenv()
5+
6+
class Config:
    """Project-wide settings, sourced from environment variables with defaults.

    Values are read once at import time (after load_dotenv() has run), so
    changing the environment later has no effect on this class.
    """

    # Telegram
    BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN")  # no default: bot refuses to start without it

    # Paths — see .agents/rules/spider-safety.md: these binary locations are mandatory
    CHROME_BINARY = os.getenv("CHROME_BINARY_PATH", "/usr/bin/google-chrome")
    CHROME_DRIVER = os.getenv("CHROME_DRIVER_PATH", "/usr/local/bin/chromedriver")
    RAW_DATA_DIR = "data/raw"            # spiders write JSON here
    PROCESSED_DATA_DIR = "data/processed"

    # Models (served by a local Ollama instance)
    EMBED_MODEL = os.getenv("EMBED_MODEL_NAME", "nomic-embed-text")
    OLLAMA_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

    # Scraper Settings — number of page scrolls per scrape session
    DEFAULT_SCROLLS = 4
22+
23+
# Ensure both data directories exist on startup so spiders and the processor
# can write without per-call existence checks.
for _data_dir in (Config.RAW_DATA_DIR, Config.PROCESSED_DATA_DIR):
    os.makedirs(_data_dir, exist_ok=True)

core/embeddings.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import ollama
2+
import numpy as np
3+
from sklearn.metrics.pairwise import cosine_similarity
4+
from config import Config
5+
6+
def get_embeddings(texts):
    """Embed a batch of document strings with the configured Ollama model.

    Returns a numpy array of shape (len(texts), embedding_dim).
    """
    print(f"🧠 [FicSense] Generating embeddings for {len(texts)} items...")

    # nomic-embed-text performs better with the 'search_document:' prefix
    # on stored content (the matching query side uses 'search_query:').
    prefixed = ["search_document: " + text for text in texts]

    result = ollama.embed(model=Config.EMBED_MODEL, input=prefixed)
    return np.array(result['embeddings'])
16+
17+
def get_query_vector(query):
    """Embed the user's intent with the 'search_query:' prefix.

    Returns a numpy array of shape (1, embedding_dim), ready for
    cosine_similarity against a document matrix.
    """
    result = ollama.embed(model=Config.EMBED_MODEL, input="search_query: " + query)
    vector = np.array(result['embeddings'][0])
    return vector.reshape(1, -1)
21+
22+
def rank_novels(query, novels, top_n=15):
    """Rank *novels* by semantic cosine similarity to the user's *query* intent.

    Mutates each novel dict by attaching a float 'score', then returns the
    top_n highest-scoring entries (or [] on empty input / embedding failure).
    """
    if not novels:
        print("⚠️ No novels provided for ranking.")
        return []

    # Combine title and synopsis so the model gets context from both fields.
    descriptions = [f"{item['title']} {item['synopsis']}" for item in novels]

    # Vectorize documents and query; any Ollama failure degrades to no results.
    try:
        document_matrix = get_embeddings(descriptions)
        intent_vector = get_query_vector(query)
    except Exception as e:
        print(f"❌ Ollama Embedding Error: {e}")
        return []

    # Cosine similarity of the query against every document
    # (typically lands in the 0.5–0.9 range for related text).
    similarity_scores = cosine_similarity(intent_vector, document_matrix)[0]

    # Attach a score to each novel, then sort highest-relevance first.
    for novel, raw_score in zip(novels, similarity_scores):
        novel["score"] = float(raw_score)

    ranked = sorted(novels, key=lambda entry: entry["score"], reverse=True)

    print(f"✅ Ranked {len(ranked)} items. Returning top {top_n}.")
    return ranked[:top_n]

core/processor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import json
2+
3+
# Logic for embeddings and merging raw scraped data
def process_data(keyword):
    """Placeholder for Phase 2: merge raw spider output and embed it.

    Currently only logs the keyword and returns None; the redundant trailing
    `pass` after the print statement has been removed.
    """
    print(f"Processing data for {keyword}...")

downloaded_files/driver_fixing.lock

Whitespace-only changes.

interface/telegram_bot.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import logging
2+
import asyncio
3+
import sys
4+
import os
5+
import html
6+
from telegram import Update
7+
from telegram.ext import ApplicationBuilder, ContextTypes, CommandHandler, MessageHandler, filters
8+
9+
# --- PATH CONFIGURATION ---
# Make the project root importable so `config` and `main` resolve when this
# file is launched directly from interface/ (fixes ModuleNotFoundError).
root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
if root_path not in sys.path:
    sys.path.append(root_path)
14+
15+
from config import Config
16+
from main import run_ficsense_pipeline
17+
18+
# Setup logging — configured once at import time; python-telegram-bot's
# internal loggers propagate here, so INFO gives polling/request visibility.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
23+
24+
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handles /start and /restart commands with a usage banner."""
    welcome = (
        "✨ <b>FicSense Antigravity Bot</b> ✨\n\n"
        "Ready to find your next read. Send your search as:\n"
        "<code>Fandom | Intent</code>\n\n"
        "Example: <code>onepiece | time travel ace</code>"
    )
    await update.message.reply_html(welcome)
32+
33+
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Parses 'Fandom | Intent' messages, runs the scrape->rank pipeline in a
    worker thread, and replies with a plain-text ranked list.

    Plain text (no parse_mode) is deliberate: scraped titles/synopses often
    contain characters that break Telegram HTML/Markdown parsing.
    """
    text = update.message.text

    if "|" not in text:
        await update.message.reply_html("❌ <b>Format Error!</b> Please use: <code>Fandom | Intent</code>")
        return

    # 1. Parse Inputs.
    # BUG FIX: split on the FIRST '|' only — a second '|' inside the intent
    # used to raise "too many values to unpack" and crash the handler.
    fandom, intent = [x.strip() for x in text.split("|", 1)]

    await update.message.reply_html(
        f"📡 <b>Request Received!</b>\n"
        f"🔍 Scoping: <code>{html.escape(fandom)}</code>\n"
        f"🎯 Target: <i>{html.escape(intent)}</i>\n\n"
        f"<i>Launching Chrome on ASUS TUF...</i>"
    )

    try:
        # 2. Run Pipeline — offloaded to a thread so polling stays alive.
        # get_running_loop() is the non-deprecated form inside a coroutine.
        loop = asyncio.get_running_loop()
        top_matches = await loop.run_in_executor(None, run_ficsense_pipeline, fandom, intent)

        if not top_matches:
            await update.message.reply_html("⚠️ <b>No results found.</b> The scraper might be blocked or no stories matched.")
            return

        # 3. Build PLAIN TEXT response (the "unbreakable" version).
        response = f"🎯 Top {len(top_matches)} Semantic Matches for {fandom.upper()}\n"
        response += "-------------------------------------\n\n"

        for i, res in enumerate(top_matches, 1):
            title = res.get('title', 'Unknown Title')
            link = res.get('link', 'No Link')
            score = res.get('score', 0.0)
            # Truncate long synopses to keep the message clean; FIX: only
            # append the ellipsis when text was actually cut.
            synopsis = res.get('synopsis', 'No synopsis available.')
            if len(synopsis) > 150:
                synopsis = synopsis[:150] + "..."

            # Simple string building with NO special formatting.
            line = f"{i}. {title}\n"
            line += f" 🔗 {link}\n"
            line += f" ⭐ Score: {score:.2f}\n"
            line += f" 📝 {synopsis}\n\n"

            # Flush before hitting Telegram's ~4096-char message limit.
            if len(response + line) > 4000:
                await update.message.reply_text(response)  # Plain text, no parse_mode
                response = ""
            response += line

        if response.strip():
            # Crucial: no parse_mode='HTML'/'Markdown' — plain text cannot fail to parse.
            await update.message.reply_text(response)

    except Exception as e:
        # logging.exception keeps the traceback; f-string logging dropped it.
        logging.exception("Pipeline error while handling message")
        # Even the error message should be plain text.
        await update.message.reply_text(f"❌ System Error: {str(e)}")
90+
91+
if __name__ == "__main__":
    if not Config.BOT_TOKEN:
        print("🛑 FATAL: No TELEGRAM_BOT_TOKEN found in .env file!")
    else:
        # Build the application with extended timeouts: a scrape can hold the
        # worker thread for minutes, and Telegram must not give up meanwhile.
        builder = ApplicationBuilder().token(Config.BOT_TOKEN)
        builder = builder.read_timeout(300).write_timeout(300)
        builder = builder.connect_timeout(300).pool_timeout(300)
        application = builder.build()

        # Register handlers: commands first, then the catch-all text handler.
        application.add_handler(CommandHandler(['start', 'restart'], start))
        application.add_handler(MessageHandler(filters.TEXT & (~filters.COMMAND), handle_message))

        print("🤖 FicSense Bot is alive with extended timeouts...")
        application.run_polling()

main.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import sys
2+
import os
3+
import json
4+
from spiders.webnovel_spider import scrape_webnovel
5+
from core.embeddings import rank_novels
6+
from config import Config
7+
8+
def run_ficsense_pipeline(keyword, intent):
    """The master sequence: Scrape -> Rank -> Return.

    Args:
        keyword: Fandom/search keyword handed to the Webnovel spider.
        intent: Free-text description of what the user wants to read.

    Returns:
        list[dict]: Top-ranked novel dicts (each carrying a 'score'),
        or [] when the scrape captured nothing.
    """
    print(f"🚀 [FicSense] Processing: {keyword} | Intent: {intent}")

    # 1. Trigger the visible spider; it returns the novel list directly.
    novels = scrape_webnovel(keyword, scrolls=Config.DEFAULT_SCROLLS)

    # FIX: `not novels` already covers the empty list — the extra
    # `len(novels) == 0` clause was redundant.
    if not novels:
        print("⚠️ No novels captured. Check Chrome window for blocks.")
        return []

    # 2. Semantic ranking via Ollama embeddings.
    print(f"🧠 Ranking {len(novels)} novels against intent: '{intent}'")
    top_matches = rank_novels(intent, novels, top_n=15)

    # 3. Return the ranked results to the caller (Bot or CLI).
    return top_matches
26+
27+
if __name__ == "__main__":
    # Fallback for CLI testing: python3 main.py onepiece "time travel"
    cli_args = sys.argv[1:]
    fandom = cli_args[0] if cli_args else "onepiece"
    user_intent = cli_args[1] if len(cli_args) > 1 else "time travel"

    for i, res in enumerate(run_ficsense_pipeline(fandom, user_intent), 1):
        print(f"{i}. {res['title']} - Score: {res['score']:.4f}")

requirements.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
seleniumbase==4.24.5
selenium==4.18.1
argparse  # NOTE: argparse is in the standard library since Python 3.2 — this line is a no-op
# Used by the committed code but previously unlisted:
python-dotenv        # config.py: load_dotenv()
python-telegram-bot  # interface/telegram_bot.py
ollama               # core/embeddings.py
numpy                # core/embeddings.py
scikit-learn         # core/embeddings.py: cosine_similarity
# Future additions for Phase 2:
# requests

0 commit comments

Comments
 (0)