import requests
from bs4 import BeautifulSoup
from .base import BaseScraper
from logger_config import get_logger
from dateutil import parser
from datetime import timezone

BASE_URL = 'https://engineering.linkedin.com/blog'
HEADERS = {'User-Agent': 'Mozilla/5.0'}

logger = get_logger("linkedin-handler")

class LinkedinScraper(BaseScraper):
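    """Scraper for the public LinkedIn Engineering blog.

    The CSS selectors below target the blog's markup as observed when this
    scraper was written and may need updating if the site changes.
    ``scrape`` is currently a no-op placeholder; ``search_blog_posts`` does
    the actual work.
    """
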
    def scrape(self):
        pass

    def get_posts_from_group_url(self, url, last_scan_time):
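        """Return new posts from a single blog category (group) page.

        Only posts published after ``last_scan_time`` are kept, capped at two
        per page; returns ``None`` if the page cannot be fetched.
        """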
        logger.debug(f"Getting posts from group url: {url}")
        resp = requests.get(url, headers=HEADERS, timeout=5)
        if resp.status_code != 200:
            logger.warning(f"Non-200 response for {url}: {resp.status_code}")
            return None

        soup = BeautifulSoup(resp.text, "html.parser")

        posts = []

        post_items = soup.find_all("li", class_="post-list__item grid-post")

        for post in post_items:
            try:
                # Title
                title_tag = post.find("div", class_="grid-post__title")
                if title_tag and title_tag.a:
                    title = title_tag.a.get_text(strip=True)
                    post_url = title_tag.a["href"]
                else:
                    logger.warning("Title not found")
                    continue

                # Topic
                topic_tag = post.find("p", class_="grid-post__topic")
                topic = [topic_tag.a.get_text(strip=True)] if topic_tag and topic_tag.a else []

                # Published date
                date_tag = post.find("p", class_="grid-post__date")
                if date_tag:
                    published = parser.parse(date_tag.get_text(strip=True))
                    if published.tzinfo is None:
                        published = published.replace(tzinfo=timezone.utc)
                else:
                    published = None

                if not published:
                    logger.warning("Published date not found")
                    continue

                if last_scan_time.tzinfo is None:
                    last_scan_time = last_scan_time.replace(tzinfo=timezone.utc)

                # The listing is assumed newest-first, so stop at the first
                # post that is not newer than the last scan time.
                if published <= last_scan_time:
                    logger.debug(f"Skipping post: {title} as it was published on {published}, before {last_scan_time}")
                    break

                posts.append({
                    "title": title,
                    "url": post_url,
                    "tags": topic,
                    "published": published.isoformat()
                })

                # Stop early after collecting 2 items
                if len(posts) >= 2:
                    break

            except Exception:
                logger.exception("Error parsing group post")
                continue

        return posts

    def search_blog_posts(self, category, last_scan_time):
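        """Collect recent posts from every category linked in the blog's navigation dropdown.

        ``category`` is currently unused; every dropdown link is scanned and
        the results are combined into a single list.
        """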
        res = requests.get(BASE_URL, headers=HEADERS, timeout=5)
        soup = BeautifulSoup(res.text, "html.parser")

        posts = []
        groups = soup.select(".artdeco-dropdown__content")

        for group in groups:
            links = group.select(".artdeco-dropdown__item a.header-nav__link")

            for grouplink in links:
                try:
                    group_url = grouplink.get("href")

                    group_posts = self.get_posts_from_group_url(group_url, last_scan_time)
                    if group_posts:
                        posts.extend(group_posts)
                except Exception:
                    logger.exception("Failed while scraping LinkedIn")

        return posts
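
# Minimal usage sketch (an illustration, not part of the scraper API): assumes
# BaseScraper needs no constructor arguments and that callers pass a
# timezone-aware datetime as last_scan_time.
#
#     from datetime import datetime, timedelta, timezone
#
#     scraper = LinkedinScraper()
#     since = datetime.now(timezone.utc) - timedelta(days=7)
#     new_posts = scraper.search_blog_posts(category=None, last_scan_time=since)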