#!/usr/bin/env python3
# GRCnewsAssistant.py
# Consolidated GRC News Assistant with AI Analysis

import csv
import requests
import datetime
import time
import logging
import json
import tempfile
import os
import subprocess
import platform
import sys
import urllib.parse

from newspaper import Article
from typing import List, Dict, Any

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def get_api_key() -> str:
    """Get NewsData.io API key from environment variable."""
    api_key = os.getenv('NEWSDATA_API_KEY')
    if not api_key:
        logger.error("""
NewsData.io API key not found!

Please set your API key as an environment variable:

For macOS/Linux:
    export NEWSDATA_API_KEY='your_api_key_here'

For Windows (Command Prompt):
    set NEWSDATA_API_KEY=your_api_key_here

For Windows (PowerShell):
    $env:NEWSDATA_API_KEY='your_api_key_here'

You can add this to your shell's startup file (.bashrc, .zshrc, etc.)
to make it permanent.
""")
        sys.exit(1)
    return api_key

def get_clipboard_command() -> List[str]:
    """Get the appropriate clipboard command based on OS."""
    system = platform.system().lower()
    if system == 'darwin':  # macOS
        return ['pbpaste']
    elif system == 'linux':
        # Check if xclip is installed
        try:
            subprocess.run(['xclip', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            return ['xclip', '-selection', 'clipboard', '-o']
        except FileNotFoundError:
            logger.error("""
xclip not found! On Linux, please install xclip:

For Ubuntu/Debian:
    sudo apt-get install xclip

For Fedora:
    sudo dnf install xclip

For other distributions, use your package manager to install xclip.
""")
            sys.exit(1)
    elif system == 'windows':
        return ['powershell.exe', '-command', 'Get-Clipboard']
    else:
        logger.error(f"Unsupported operating system: {system}")
        sys.exit(1)

def read_keywords(filename: str = "keywords.csv") -> List[str]:
    """Read and decode keywords from CSV file."""
    try:
        with open(filename, 'r') as file:
            reader = csv.reader(file)
            # Skip blank rows so an empty line does not raise an IndexError
            return [urllib.parse.unquote(row[0]) for row in reader if row]
    except Exception as e:
        logger.error(f"Error reading keywords file: {e}")
        return []

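# Illustrative keywords.csv layout (an assumption, not shipped with this script):
# one keyword per row in the first column, optionally URL-encoded since each
# value is passed through urllib.parse.unquote above. For example:
#
#   GRC%20compliance
#   risk%20management
#   ISO%2027001
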
def search_news(keyword: str, api_key: str, category: str = "technology", language: str = "en") -> List[Dict]:
    """Search NewsData.io API for articles matching keyword."""
    # Let requests build the query string so keywords containing spaces or
    # special characters are URL-encoded correctly.
    url = "https://newsdata.io/api/1/news"
    params = {
        "apikey": api_key,
        "q": keyword,
        "language": language,
        "category": category
    }
    try:
        response = requests.get(url, params=params)
        data = response.json()
        if data["status"] == "success":
            articles = []
            for article in data["results"]:
                articles.append({
                    "date": datetime.date.today().strftime("%Y-%m-%d"),
                    "keyword": keyword,
                    "headline": article["title"],
                    "description": article["description"],
                    "url": article["link"]
                })
            return articles
        else:
            logger.error(f"API request failed: {data.get('results', 'No error message')}")
            return []
    except Exception as e:
        logger.error(f"Error fetching news: {e}")
        return []

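# For reference, the fields this script relies on from a successful
# NewsData.io response (simplified sketch inferred from the key accesses
# above, not the full API schema):
#
#   {
#     "status": "success",
#     "results": [
#       {"title": "...", "description": "...", "link": "https://..."}
#     ]
#   }
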
def clean_and_validate_csv(filename: str, header: List[str]) -> List[Dict]:
    """Read, clean, and validate CSV data."""
    data = []
    if not os.path.exists(filename):
        return data
    try:
        with open(filename, 'r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            # Validate header matches expected structure
            if reader.fieldnames != header:
                logger.warning(f"Header mismatch in {filename}. Expected: {header}, Found: {reader.fieldnames}")
                return data
            # Read and clean data
            for row in reader:
                # Skip completely empty rows
                if not any(row.values()):
                    continue
                # Validate row has all required fields
                if all(row.get(field) for field in header):
                    data.append(row)
                else:
                    logger.warning(f"Skipping malformed row in {filename}: {row}")
        return data
    except Exception as e:
        logger.error(f"Error reading {filename}: {e}")
        return data

def save_urls(articles: List[Dict], filename: str = "urls.csv"):
    """Save URLs to separate CSV file."""
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            # Only write valid URLs
            for article in articles:
                if article and article.get("url"):
                    writer.writerow([article["url"]])
    except Exception as e:
        logger.error(f"Error writing URLs to {filename}: {e}")

def extract_article_content(url: str) -> Dict:
    """Extract article content using newspaper4k."""
    article = Article(url, fetch_images=False)
    try:
        article.download()
        article.parse()
        article.nlp()
        return {
            "title": article.title or "Not Found",
            "keywords": article.keywords if article.keywords else [],
            "authors": article.authors if article.authors else ["Not Found"],
            "summary": article.summary or "Not Found",
            "text": article.text or "Not Found",
            "publish_date": article.publish_date.isoformat() if article.publish_date else "Not Found",
            "url": url
        }
    except Exception as e:
        logger.error(f"Failed to process article from {url}: {e}")
        return None

def analyze_with_fabric(content: Dict) -> Dict:
    """Process article content with fabric label_and_rate."""
    try:
        # Create temporary file with formatted content
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_file:
            formatted_content = (
                f"Title: {content['title']}\n"
                f"Authors: {', '.join(content['authors'])}\n"
                f"Keywords: {', '.join(content['keywords'])}\n"
                f"Summary: {content['summary']}\n"
                f"URL: {content['url']}\n"
            )
            temp_file.write(formatted_content)
            temp_file_path = temp_file.name
        # Copy content to clipboard using an OS-specific command
        # (pbcopy on macOS, clip on Windows, xclip on Linux, mirroring
        # get_clipboard_command above).
        system = platform.system()
        if system == 'Darwin':
            copy_cmd = ['pbcopy']
        elif system == 'Windows':
            copy_cmd = ['clip']
        else:
            copy_cmd = ['xclip', '-selection', 'clipboard']
        with open(temp_file_path, 'r') as f:
            subprocess.run(copy_cmd, input=f.read().encode(), check=True)
        # Create temporary output file for fabric results
        with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as output_file:
            output_path = output_file.name
        # Run fabric command with OS-specific clipboard command
        clipboard_cmd = get_clipboard_command()
        cmd = f'{" ".join(clipboard_cmd)} | fabric -p label_and_rate -o "{output_path}"'
        subprocess.run(cmd, shell=True, timeout=15, check=True)
        # Read fabric results
        with open(output_path, 'r') as f:
            fabric_data = json.loads(f.read())
        # Cleanup temporary files
        os.unlink(temp_file_path)
        os.unlink(output_path)
        return fabric_data
    except Exception as e:
        logger.error(f"Error in fabric analysis: {e}")
        return None

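# Assumed shape of the fabric label_and_rate output consumed by
# create_rated_csv below (keys inferred from how the result is read;
# the exact structure depends on the installed fabric pattern):
#
#   {
#     "one-sentence-summary": "...",
#     "labels": "...",
#     "rating": "...",
#     "rating-explanation": ["..."],
#     "quality-score": 0,
#     "quality-score-explanation": ["..."]
#   }
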
def create_rated_csv(articles: List[Dict], analysis_results: List[Dict]):
    """Create or update grcdata_rated.csv with fabric analysis results while preserving historical data."""
    filename = 'grcdata_rated.csv'
    header = [
        'date', 'keyword', 'title', 'description', 'url',
        'one-sentence-summary', 'labels', 'rating',
        'rating-explanation', 'quality-score', 'quality-score-explanation'
    ]
    try:
        # Get existing clean data
        existing_data = clean_and_validate_csv(filename, header)
        # Create set of existing URLs to avoid duplicates
        existing_urls = {row['url'] for row in existing_data}
        # Write all data
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            # Write existing data
            for row in existing_data:
                writer.writerow([row[field] for field in header])
            # Write new data with analysis
            for article, analysis in zip(articles, analysis_results):
                if article and article.get('url'):
                    # Skip if URL already exists in historical data
                    if article['url'] in existing_urls:
                        continue
                    if analysis:
                        writer.writerow([
                            article['date'],
                            article['keyword'],
                            article['headline'],
                            article['description'],
                            article['url'],
                            analysis.get('one-sentence-summary', ''),
                            analysis.get('labels', ''),
                            analysis.get('rating', ''),
                            '; '.join(analysis.get('rating-explanation', [])),
                            str(analysis.get('quality-score', '')),
                            '; '.join(analysis.get('quality-score-explanation', []))
                        ])
                    else:
                        # Write without analysis if it failed, but still include article data
                        writer.writerow([
                            article['date'],
                            article['keyword'],
                            article['headline'],
                            article['description'],
                            article['url'],
                            '', '', '', '', '', ''
                        ])
    except Exception as e:
        logger.error(f"Error updating {filename} with analysis: {e}")

def main():
    """Main execution flow."""
    logger.info("Starting GRC News Assistant")
    # Get API key from environment variable
    api_key = get_api_key()
    # Read keywords
    keywords = read_keywords()
    if not keywords:
        logger.error("No keywords found in keywords.csv")
        return
    # Collect articles
    all_articles = []
    for keyword in keywords:
        logger.info(f"Searching for articles about: {keyword}")
        articles = search_news(keyword.strip(), api_key)
        if articles:
            all_articles.extend(articles)
            logger.info(f"Found {len(articles)} articles for '{keyword}'")
        else:
            logger.warning(f"No articles found for '{keyword}'")
    if not all_articles:
        logger.error("No articles found for any keywords")
        return
    # Save URLs
    save_urls(all_articles)
    logger.info(f"Found {len(all_articles)} articles")
    # Process articles with newspaper4k and fabric
    logger.info("Processing articles with newspaper4k and fabric")
    analysis_results = []
    for article in all_articles:
        url = article["url"]
        logger.info(f"Processing: {url}")
        # Extract content
        content = extract_article_content(url)
        if content:
            # Analyze with fabric
            analysis = analyze_with_fabric(content)
            analysis_results.append(analysis)
        else:
            analysis_results.append(None)
    # Create rated CSV with analysis results
    create_rated_csv(all_articles, analysis_results)
    logger.info("Completed processing with AI analysis. Results saved to grcdata_rated.csv")


if __name__ == "__main__":
    main()
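
# Typical run (assuming a NewsData.io API key and a keywords.csv in the
# working directory, as the functions above expect):
#
#   export NEWSDATA_API_KEY='your_api_key_here'
#   python3 GRCnewsAssistant.py
#
# Output: urls.csv with the collected article links and grcdata_rated.csv
# with the fabric analysis columns.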