// Source: chrome-extension/content.js at master · Sauravkolte/chrome-extension · GitHub
/**
 * SatyaCheck — Content Script
 * सत्य की जाँच | India's AI Fake News Detector
 *
 * This script runs on EVERY web page you visit (invisibly, in the background).
 * When you click "Check This Page" in the SatyaCheck popup, this script:
 *   1. Extracts the article title, body text, images, and URL
 *   2. Sends it back to the popup
 *   3. The popup sends it to the SatyaCheck backend for AI analysis
 *
 * File: dist/content.js
 * Manifest V3 — runs at document_idle
 */

'use strict';

// ─── Article Content Extraction ───────────────────────────────────────────────

/**
 * Decides whether an image URL looks like editorial content rather than
 * site chrome (icons, logos, avatars), tracking pixels, or advertising.
 *
 * Ad detection matches "ad"/"ads"/"advert…"/"adserver"/"adservice" only as a
 * standalone token (delimited by non-alphanumerics), so ordinary paths such
 * as "/uploads/", "headline" or "download" are not rejected by accident.
 *
 * @param {string} src - Candidate image URL.
 * @returns {boolean} True when the URL should be kept.
 */
function isLikelyContentImage(src) {
  const adToken = /(?:^|[^a-z0-9])(?:ads?|advert[a-z]*|adserver|adservice)(?:[^a-z0-9]|$)/i;
  const noiseMarkers = ['icon', 'logo', 'avatar', '1x1', 'pixel'];
  const imageExtensions = ['.jpg', '.jpeg', '.png', '.webp'];

  if (!src.startsWith('http')) return false;
  if (noiseMarkers.some(marker => src.includes(marker))) return false;
  if (adToken.test(src)) return false;
  return imageExtensions.some(ext => src.includes(ext));
}

/**
 * Extracts article content from the current page.
 * Tries multiple selectors to find the article body and falls back to the
 * whole page body (minus navigation/ads) when none of them yields enough text.
 *
 * @returns {Object} { url, title, body_text, image_urls, domain }
 *   - title is capped at 300 characters
 *   - body_text is capped at 10,000 characters
 *   - image_urls holds at most 5 unique URLs, og:image first when present
 */
function extractArticleContent() {
  // ── Title ──────────────────────────────────────────────────────────────────
  // `||` is intentional: an empty/whitespace-only candidate falls through to
  // the next source.
  const title =
    document.querySelector('h1')?.innerText?.trim() ||
    document.querySelector('meta[property="og:title"]')?.content?.trim() ||
    document.querySelector('meta[name="twitter:title"]')?.content?.trim() ||
    document.querySelector('title')?.innerText?.trim() ||
    'Unknown Title';

  // ── Article Body ───────────────────────────────────────────────────────────
  // Candidate selectors, tried in this order; the first with > 200 chars wins.
  const bodySelectors = [
    'article',
    '[class*="article-body"]',
    '[class*="story-body"]',
    '[class*="story-content"]',
    '[class*="post-content"]',
    '[class*="entry-content"]',
    '[class*="article-content"]',
    '[class*="news-content"]',
    '[class*="content-body"]',
    '[itemprop="articleBody"]',
    '[data-testid="article-body"]',
    '.article__body',
    '.story__content',
    '.article-detail',
    'main',
    '.main-content',
    '#article-body',
    '#story-body',
  ];

  let bodyText = '';
  for (const selector of bodySelectors) {
    // Read innerText once per candidate: it can force a layout pass.
    const text = document.querySelector(selector)?.innerText?.trim() ?? '';
    if (text.length > 200) {
      bodyText = text;
      break;
    }
  }

  // Fallback: use the page body but strip nav/header/footer.
  // document.body can be null (e.g. non-HTML documents), so guard the clone.
  if (!bodyText) {
    const clone = document.body?.cloneNode(true);
    if (clone) {
      ['nav', 'header', 'footer', 'aside', 'script', 'style', '.ad', '.advertisement'].forEach(sel => {
        clone.querySelectorAll(sel).forEach(el => el.remove());
      });
      bodyText = clone.innerText?.trim() || '';
    }
  }

  // Trim to 10,000 chars (enough for AI analysis, not too large for the API)
  bodyText = bodyText.substring(0, 10000);

  // ── Images ─────────────────────────────────────────────────────────────────
  const candidateUrls = Array.from(document.querySelectorAll('img'))
    .map(img => img.src || img.dataset?.src || img.dataset?.lazySrc || '')
    .filter(isLikelyContentImage);
  // De-duplicate so repeated thumbnails do not use up the 5-image budget.
  const imageUrls = [...new Set(candidateUrls)].slice(0, 5);

  // ── Open Graph image (usually the hero/featured image) ────────────────────
  const ogImage = document.querySelector('meta[property="og:image"]')?.content;
  if (ogImage && !imageUrls.includes(ogImage)) {
    imageUrls.unshift(ogImage);
  }

  // ── Domain ─────────────────────────────────────────────────────────────────
  const domain = window.location.hostname.replace(/^www\./, '');

  return {
    url: window.location.href,
    title: title.substring(0, 300),
    body_text: bodyText,
    // Re-apply the cap: prepending og:image may have pushed the list to 6.
    image_urls: imageUrls.slice(0, 5),
    domain,
  };
}


// ─── Message Listener ─────────────────────────────────────────────────────────

/**
 * Handles requests coming from the SatyaCheck popup.
 *
 *  - 'extractContent': runs extractArticleContent() and replies with
 *    { success: true, data }. If extraction throws, replies with
 *    { success: false, error, data } where data is a minimal fallback built
 *    from document.title and the first 5000 characters of the page text.
 *  - 'ping': replies with a fixed "active" message.
 *
 * Both handled actions return true from the listener; any other action
 * returns undefined without responding.
 */
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
  switch (request.action) {
    case 'extractContent': {
      try {
        const content = extractArticleContent();
        sendResponse({ success: true, data: content });
      } catch (error) {
        // Best-effort payload so the popup still has something to analyse.
        const fallback = {
          url: window.location.href,
          title: document.title || 'Unknown',
          body_text: document.body?.innerText?.substring(0, 5000) || '',
          image_urls: [],
          domain: window.location.hostname.replace(/^www\./, ''),
        };
        sendResponse({ success: false, error: error.message, data: fallback });
      }
      return true;
    }

    case 'ping': {
      sendResponse({ success: true, message: 'SatyaCheck content script active' });
      return true;
    }

    default:
      return undefined;
  }
});


// ─── Page Type Detection ──────────────────────────────────────────────────────

/**
 * Heuristically decides whether the current page is a single news article
 * rather than a homepage or listing page.
 *
 * The page counts as an article when its URL matches one of the known
 * article-style patterns, or, failing that, when it contains both an
 * <article> element and an <h1> and its body text is longer than 500
 * characters.
 *
 * @returns {boolean} True when the page looks like an article.
 */
function isArticlePage() {
  const pageUrl = window.location.href;

  // 1) URL shape: section names and date-based paths such as /2024/01/
  const urlHints = [
    /\/news\//i,
    /\/article\//i,
    /\/story\//i,
    /\/post\//i,
    /\/\d{4}\/\d{2}\//i,
    /\/politics\//i,
    /\/sports\//i,
    /\/business\//i,
  ];
  for (const hint of urlHints) {
    if (hint.test(pageUrl)) {
      return true;
    }
  }

  // 2) Page structure: require an <article>, an <h1>, and enough text.
  if (!document.querySelector('article')) {
    return false;
  }
  if (!document.querySelector('h1')) {
    return false;
  }
  const textLength = document.body?.innerText?.length || 0;
  return textLength > 500;
}

// On load, tell the extension whether this page looks like an article,
// along with its www-stripped hostname and full URL.
{
  const { hostname, href } = window.location;
  chrome.runtime
    .sendMessage({
      action: 'pageInfo',
      isArticle: isArticlePage(),
      domain: hostname.replace(/^www\./, ''),
      url: href,
    })
    .then(undefined, () => {
      // Popup not open — ignore
    });
}