
Commit 97d740d
fixed bugs
1 parent fd3a411

File tree

10 files changed: +167 −32 lines changed

app.py

Lines changed: 4 additions & 1 deletion
@@ -12,7 +12,7 @@
 
 app = Flask(__name__, static_folder="static", template_folder="templates")
 app.db = get_database()
-SECRET_KEY = os.getenv("POSTS_SECRET_KEY", "******")
+SECRET_KEY = os.getenv("POSTS_SECRET_KEY", "123")
 
 # Logging
 app.logger = get_logger("app")
@@ -194,6 +194,9 @@ def get_posts():
             "url": post["url"],
             "title": post["title"],
             "topic": post["topic"],
+            "publisher": post["publisher_name"],
+            "published_at": post["published_at"],
+            "tags": post["tags"],
             "labelled": post['labelled']
         })
     return jsonify(result)
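
With these fields added, each element of the get_posts() JSON response now carries publisher metadata. A hypothetical example of one serialized post (values invented for illustration; the publisher_name key is presumably exposed by the new JOIN in db/sqlite.py below):

    # Hypothetical shape of one element in the /posts response (values invented):
    {
        "url": "https://engineering.linkedin.com/blog/example-post",
        "title": "Example post",
        "topic": "software engineering",
        "publisher": "linkedin",
        "published_at": "2025-01-01T00:00:00+00:00",
        "tags": ["data"],
        "labelled": False
    }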

db/sqlite.py

Lines changed: 4 additions & 3 deletions
@@ -231,7 +231,7 @@ def get_notifications_by_email(self, conn, email):
         rows = c.fetchall()
         return [dict(row) for row in rows]
 
-    def get_active_notifications_by_email_and_url(self, conn, email, url):
+    def get_notifications_by_email_and_url(self, conn, email, url):
         c = conn.cursor()
         c.execute("""
             SELECT *
@@ -244,7 +244,7 @@ def get_active_notifications_by_email_and_url(self, conn, email, url):
     def add_notification(self, conn, email, heading, style_version, post_url, post_title, maturity_date):
         logger.info(f"Adding notification: {email}, type: {post_title}")
 
-        notf = self.get_active_notifications_by_email_and_url(conn, email, post_url)
+        notf = self.get_notifications_by_email_and_url(conn, email, post_url)
 
         if not notf:
             c = conn.cursor()
@@ -368,7 +368,8 @@ def get_posts(self, conn):
         c = conn.cursor()
         c.execute("""
             SELECT *
-            FROM posts
+            FROM posts po
+            JOIN publishers p ON po.publisher_id = p.id
         """)
         rows = c.fetchall()
         return [dict(row) for row in rows]
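
One thing to watch with SELECT * over a join: sqlite3.Row exposes columns by name, and posts and publishers most likely both have an id column, so dict(row) silently collapses the duplicates. A sketch that aliases explicitly instead (hypothetical schema; it assumes publishers has a name column, which would also supply the publisher_name key that app.py above reads):

    # Sketch: select columns explicitly to avoid name collisions across the join
    c.execute("""
        SELECT po.*, p.name AS publisher_name
        FROM posts po
        JOIN publishers p ON po.publisher_id = p.id
    """)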

handlers/base.py

Lines changed: 3 additions & 1 deletion
@@ -49,9 +49,11 @@ def search_blog_posts(self, category, last_scan_time):
 
             if last_scan_time.tzinfo is None:
                 last_scan_time = last_scan_time.replace(tzinfo=timezone.utc)
+
+            # break out at the first stale article
             if published <= last_scan_time:
                 logger.debug(f"Skipping {entry.title}: article published on {published} before last scan time: {last_scan_time}")
-                continue
+                break
 
             matching_posts.append({
                 "title": entry.title,

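Switching continue to break stops scanning at the first already-seen entry, which is only equivalent when the feed lists entries newest-first (typical for RSS/Atom, but worth keeping in mind). A minimal sketch of the pattern, with hypothetical names:

    # Early exit assumes entries are sorted newest-first:
    for entry in entries:
        if entry_published(entry) <= last_scan_time:
            break                      # every later entry is older; stop scanning
        matching_posts.append(entry)
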
handlers/facebook.py

Lines changed: 5 additions & 9 deletions
@@ -66,6 +66,7 @@ def search_blog_posts(self, category, last_scan_time):
             try:
                 title_tag = article.select_one(".entry-title a")
                 if not title_tag:
+                    logger.exception("Title not found")
                     continue
                 title = title_tag.get_text(strip=True)
                 post_url = title_tag["href"]
@@ -76,7 +77,9 @@ def search_blog_posts(self, category, last_scan_time):
                 # Date
                 date_tag = article.select_one("time.entry-date")
                 if not date_tag:
+                    logger.exception("Published Date not found")
                     continue
+
                 date_str = date_tag.get_text(strip=True)
 
                 try:
@@ -91,16 +94,9 @@
                 if last_scan_time.tzinfo is None:
                     last_scan_time = last_scan_time.replace(tzinfo=timezone.utc)
 
+                # break out at the first stale article
                 if published <= last_scan_time:
-                    continue
-
-                # content_block = article.select_one(".entry-content")
-                # if content_block:
-                #     paragraphs = content_block.find_all("p")
-                #     full_content = " ".join(p.get_text(strip=True) for p in paragraphs)
-                #     content = full_content[:100]  # first 100 characters
-                # else:
-                #     content = ""
+                    break
 
                 posts.append({
                     "title": title,

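A side note on the new log calls: logger.exception is meant for use inside an except block, where it attaches the active traceback. Called here with no exception in flight it still logs, but appends "NoneType: None" where the traceback would go. A sketch of the conventional choice:

    if not title_tag:
        logger.warning("Title not found")   # plain warning; no traceback exists here
        continue
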
handlers/factory.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
-from handlers import aws, github, netflix, airbnb, dropbox, facebook, slack, spotify, cloudfare, nvidea, salesforce, google, databricks
+from handlers import aws, github, linkedin, netflix, airbnb, dropbox, facebook, slack, spotify, cloudfare, nvidea, salesforce, google, databricks
 from logger_config import get_logger;
 
 logger = get_logger("HANDLERS")
@@ -31,6 +31,8 @@ def get_scraper(comapny):
         return google.GoogleScraper()
     elif comapny.lower() == "databricks":
         return databricks.DatabricksScraper()
+    elif comapny.lower() == "linkedin":
+        return linkedin.LinkedinScraper()
     else:
         logger.error(f"No handler found for {comapny}")
         return None
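
For context, a sketch of how the factory hands out the new scraper; the category argument is an assumption based on the e2e test further down:

    scraper = get_scraper("linkedin")      # case-insensitive match -> LinkedinScraper
    if scraper is not None:
        posts = scraper.search_blog_posts("techteam", last_scan_time)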

handlers/linkedin.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+from .base import BaseScraper
+from logger_config import get_logger
+from email.utils import parsedate_to_datetime
+from dateutil import parser
+from datetime import timezone
+
+BASE_URL = 'https://engineering.linkedin.com/blog'
+HEADERS = {'User-Agent': 'Mozilla/5.0'}
+
+logger = get_logger("linkedin-heandler")
+class LinkedinScraper(BaseScraper):
+    def scrape(self):
+        pass
+
+    def get_posts_from_group_url(self, url, last_scan_time):
+        logger.debug(f"Getting posts from group url: {url}")
+        resp = requests.get(url, timeout=5)
+        if resp.status_code != 200:
+            logger.warning(f"Non-200 response for {url}: {resp.status_code}")
+            return None
+
+        soup = BeautifulSoup(resp.text, "html.parser")
+
+        posts = []
+
+        post_items = soup.find_all("li", class_="post-list__item grid-post")
+
+        for post in post_items:
+            try:
+                # Title
+                title_tag = post.find("div", class_="grid-post__title")
+                if title_tag and title_tag.a:
+                    title = title_tag.a.get_text(strip=True)
+                    url = title_tag.a["href"]
+                else:
+                    logger.exception("Title not found")
+                    continue
+
+                # Topic
+                topic_tag = post.find("p", class_="grid-post__topic")
+                topic = [topic_tag.a.get_text(strip=True) if topic_tag and topic_tag.a else ""]
+
+                # Published date
+                date_tag = post.find("p", class_="grid-post__date")
+                if date_tag:
+                    published = parser.parse(date_tag.get_text(strip=True))
+                    if published.tzinfo is None:
+                        published = published.replace(tzinfo=timezone.utc)
+                else:
+                    published = None
+
+                if not published:
+                    logger.exception("Published date not found")
+                    continue
+
+                if last_scan_time.tzinfo is None:
+                    last_scan_time = last_scan_time.replace(tzinfo=timezone.utc)
+
+                # break out at the first stale article
+                if published <= last_scan_time:
+                    logger.debug(f"Skipping post: {title} as it is published on {published} before {last_scan_time}")
+                    break
+
+                posts.append({
+                    "title": title,
+                    "url": url,
+                    "tags": topic,
+                    "published": published.isoformat()
+                })
+
+                # Stop early after collecting 2 items
+                if len(posts) >= 2:
+                    break
+
+            except Exception as e:
+                logger.exception(f"Error parsing group post")
+                continue
+
+        return posts
+
+
+    def search_blog_posts(self, category, last_scan_time):
+        res = requests.get(BASE_URL)
+        soup = BeautifulSoup(res.text, "html.parser")
+
+        posts = []
+        groups = soup.select(".artdeco-dropdown__content")
+
+        for group in groups:
+            links = group.select(".artdeco-dropdown__item a.header-nav__link")
+
+            links = links
+
+            for grouplink in links:
+                try:
+                    group_url = grouplink.get("href")
+
+                    group_posts = self.get_posts_from_group_url(group_url, last_scan_time)
+
+                    for post in group_posts:
+                        posts.append(post)
+                except:
+                    logger.exception("Failed while scraping Linkedin")
+
+        return posts
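
Two things worth flagging in the new scraper: get_posts_from_group_url returns None on a non-200 response, and search_blog_posts then iterates that value, so a failed fetch raises a TypeError that the bare except only logs as a generic scrape failure. Also, requests.get(BASE_URL) passes neither HEADERS nor a timeout even though both are defined at the top of the file. A guard sketch for the first point:

    group_posts = self.get_posts_from_group_url(group_url, last_scan_time)
    for post in group_posts or []:     # treat a failed fetch (None) as "no posts"
        posts.append(post)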

send_notifications.py

Lines changed: 2 additions & 2 deletions
@@ -16,8 +16,8 @@
 # === CONFIG ===
 SMTP_SERVER = "smtp.zoho.in"
 SMTP_PORT = 587
-SMTP_USERNAME = os.getenv('SMTP_USERNAME', 'xxxx@domain.com')
-SMTP_PASSWORD = os.getenv('SMTP_PASSWORD', 'xxxxx')
+SMTP_USERNAME = os.getenv('SMTP_USERNAME', 'noreply@onesearch.blog')
+SMTP_PASSWORD = os.getenv('SMTP_PASSWORD', 'MHBXNAeAY90M')
 
 logger = get_logger("send_notification_worker")
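
Replacing the placeholder defaults with a real mailbox address and password bakes a live credential into git history, where it remains even after a later revert. A sketch of the usual fail-fast alternative, with no secret committed:

    SMTP_USERNAME = os.environ["SMTP_USERNAME"]   # raises KeyError when unset,
    SMTP_PASSWORD = os.environ["SMTP_PASSWORD"]   # instead of falling back to a baked-in secret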

templates/posts.html

Lines changed: 7 additions & 1 deletion
@@ -93,7 +93,10 @@ <h2 style="text-align:center; margin-bottom: 1.5rem;">Manage Posts</h2>
                 <th>ID</th>
                 <th>Title</th>
                 <th>Topic</th>
-                <th>Labelled</th>
+                <th>Publisher</th>
+                <th>Published At</th>
+                <th>Tags</th>
+                <th>Classified</th>
                 <th>Action</th>
             </tr>
         </thead>
@@ -135,6 +138,9 @@ <h2 style="text-align:center; margin-bottom: 1.5rem;">Manage Posts</h2>
                 ${allowedTopics.map(t => `<option value="${t}" ${t === post.topic ? 'selected' : ''}>${t}</option>`).join('')}
             </select>
         </td>
+        <td>${post.publisher}</td>
+        <td>${post.published_at}</td>
+        <td>${post.tags}</td>
         <td>${post.labelled ? '✅' : '❌'}</td>
         <td><button class="update-btn">Update</button></td>
     `;

tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -14,13 +14,13 @@
 
 logger.debug("=================== Test Session Starts =================== ")
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def db():
     db_path = "data/tests.db"
     if os.path.exists(db_path):
         os.remove(db_path)
     # use in-memory DB for testing
-    db_instance = SQLiteDatabase.get_instance(db_path)
+    db_instance = SQLiteDatabase(db_path)
     logger.debug("Test Database Initialised")
     yield db_instance
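
The scope change means the fixture now runs once per test rather than once per session, so every test starts from a freshly deleted data/tests.db. Calling the constructor directly fits that: a singleton accessor like get_instance would keep handing back the first instance across tests, defeating the per-test reset. (The surviving "# use in-memory DB for testing" comment is stale; the fixture uses an on-disk file.) A compressed sketch of the new lifecycle:

    @pytest.fixture(scope="function")   # "function" is also pytest's default scope
    def db():
        db_instance = SQLiteDatabase("data/tests.db")   # fresh instance per test
        yield db_instance                               # teardown code would follow the yield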

tests/e2e/test_scrape_pubs.py

Lines changed: 28 additions & 12 deletions
@@ -4,27 +4,42 @@
 from datetime import datetime
 from scrape_pubs import scrape_pubs
 
-@pytest.mark.pubs
-def test_scrape_pubs_techteams1(db):
-    conn = db.get_connection()
+# @pytest.mark.pubs
+# def test_scrape_pubs_techteams1(db):
+#     conn = db.get_connection()
 
-    pubid = db.add_publisher(conn, "facebook", "techteam")
-    db.add_subscription(conn, "newemail5@gmail.com", enums.PublisherCategory.SOFTWARE_ENGINEERING.value, pubid)
+#     pubid = db.add_publisher(conn, "facebook", "techteam")
+#     db.add_subscription(conn, "newemail5@gmail.com", enums.PublisherCategory.SOFTWARE_ENGINEERING.value, pubid)
 
-    scrape_pubs(db, conn)
+#     scrape_pubs(db, conn)
 
-    posts = db.get_posts_by_publisher_id(conn, pubid)
+#     posts = db.get_posts_by_publisher_id(conn, pubid)
 
-    assert len(posts) > 10
+#     assert len(posts) > 10
 
-    conn.close()
+#     conn.close()
+
+# @pytest.mark.pubs
+# def test_scrape_pubs_techteams2(db):
+#     conn = db.get_connection()
+
+#     pubid = db.add_publisher(conn, "databricks", "techteam")
+#     db.add_subscription(conn, "newemail6@gmail.com", enums.PublisherCategory.SOFTWARE_ENGINEERING.value, pubid)
+
+#     scrape_pubs(db, conn)
+
+#     posts = db.get_posts_by_publisher_id(conn, pubid)
+
+#     assert len(posts) > 9
+
+#     conn.close()
 
 @pytest.mark.pubs
-def test_scrape_pubs_techteams2(db):
+def test_scrape_pubs_techteams3(db):
     conn = db.get_connection()
 
-    pubid = db.add_publisher(conn, "databricks", "techteam")
-    db.add_subscription(conn, "newemail6@gmail.com", enums.PublisherCategory.SOFTWARE_ENGINEERING.value, pubid)
+    pubid = db.add_publisher(conn, "linkedin", "techteam")
+    db.add_subscription(conn, "newemail7@gmail.com", enums.PublisherCategory.SOFTWARE_ENGINEERING.value, pubid)
 
     scrape_pubs(db, conn)
 
@@ -34,6 +49,7 @@ def test_scrape_pubs_techteams2(db):
 
     conn.close()
 
+
 
 
 
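The surviving test is selected via its pubs marker. If the marker is not registered, recent pytest versions emit PytestUnknownMarkWarning; a hypothetical registration plus the run command (the repo may already carry this):

    # pytest.ini (hypothetical)
    [pytest]
    markers =
        pubs: end-to-end publisher scraping tests

    # run only the marked tests:
    #   pytest -m pubs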
