-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_analyzer.py
277 lines (231 loc) · 9.03 KB
/
news_analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import csv
import requests
import bs4
import google.generativeai as palm
# Define constants
PALM_API_KEY = "YOUR_API_KEY"
DEFAULTS_FOR_PALM = {
'model': 'models/text-bison-001',
'temperature': 0.75,
'candidate_count': 1,
'top_k': 40,
'top_p': 0.95,
'max_output_tokens': 512,
'stop_sequences': [],
'safety_settings': [
{"category": "HARM_CATEGORY_DEROGATORY", "threshold": 4},
{"category": "HARM_CATEGORY_TOXICITY", "threshold": 4},
{"category": "HARM_CATEGORY_VIOLENCE", "threshold": 4},
{"category": "HARM_CATEGORY_SEXUAL", "threshold": 4},
{"category": "HARM_CATEGORY_MEDICAL", "threshold": 4},
{"category": "HARM_CATEGORY_DANGEROUS", "threshold": 4}
],
}
PROMPT = """
news title : {title}
news content : {content}
How above news will affect for sri lankan person in his mindset ?
{question}
"""
QUESTION_FOR_MAIN_CATEGORIES = """
Select positive or negative or neutral
"""
QUESTION_FOR_POSITIVE_CATEGORIES = """
Select the most appropriate positive sub-category to which the above news should belong from the sub-categories below.
Economic Updates
Business and Entrepreneurship
Technology and Innovation
Infrastructure and Development
Education and Workforce Development
Trade and International Relations
Energy and Sustainability
Innovation Ecosystem
Government Policies and Regulations
Infrastructure Investment
Healthcare and Public Health
Agriculture and Food Security
Tourism and Cultural Heritage
Financial Inclusion and Access
Foreign Direct Investment (FDI)
Inclusive Development
Data and Analytics
Sustainable Practices
None of Above
"""
QUESTION_FOR_NEGATIVE_CATEGORIES = """
Select the most appropriate negative sub-category to which the above news should belong from the sub-categories below.
Crime and Violence
Natural Disasters
Conflict and War
Political Instability
Economic Downturn
Health Crises
Environmental Degradation
Social Injustice
Terrorism
Public Health Alerts
Cybersecurity Threats
Humanitarian Crises
Scandals and Misconduct
Fake News and Misinformation
Negative Social Trends
Sensationalism
Celebrity Gossip and Tabloids
None of Above
"""
# Initialize PALM API
palm.configure(api_key=PALM_API_KEY)
def reading_csv_to_get_url() -> list:
"""
Read a CSV file to extract titles and URLs.
Args :
None
Returns:
list: A list of URLs extracted from the CSV file.
"""
with open("news_url_list.csv", "r", encoding="utf-8") as file:
url_list = []
csv_reader = csv.DictReader(file)
for row in csv_reader:
url = row.get("post_url")
if url:
url_list.append(url)
return url_list
def get_the_content(url: str) -> str:
"""
Get the content of a webpage given its URL.
Args:
url (str): The URL of the webpage.
Returns:
str: The content of the webpage.
"""
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, "html.parser")
content = soup.find("div", class_="new_details")
if content:
paragraphs = content.find_all("p")
content_text = ""
for paragraph in paragraphs:
content_text += paragraph.text
return content_text
else:
return "No content found on the page."
def extract_news_title(response_text: str) -> str:
"""
Extract the news title from HTML content.
Args:
response_text (str): The HTML content of the webpage.
Returns:
str: The extracted news title.
"""
soup = bs4.BeautifulSoup(response_text, "html.parser")
title = soup.find("h1", class_="top_stories_header_news")
if title:
return title.text
else:
return "No title found on the page."
def get_the_palm_response(title: str, content: str, defaults: dict, question: str) -> str:
"""
Get a response from the PALM API for given title, content, and a question.
Args:
title (str): The news title.
content (str): The news content.
defaults (dict): PALM API configuration settings.
question (str): The question to ask.
Example:
to get a response from the PALM API for given title, content, and a question:
>>> palm_response = get_the_palm_response(title, content, defaults, question)
Note:
The input 'defaults' should be a dictionary containing the following keys:
"model", "temperature", "candidate_count", "top_k", "top_p", "max_output_tokens","stop_sequences", and "safety_settings".
The input 'question' should be a string containing the question to ask.
The APIKEY should be set in the environment variable and palm object should be configured with the APIKEY.
Returns:
str: The PALM API response.
"""
completion = palm.generate_text(
prompt=PROMPT.format(title=title, content=content, question=question),
**defaults
)
return completion.result
def analyze_a_news(url: str, defaults: dict, question_for_main: str,question_for_positive:str,question_for_negative:str) -> dict:
"""
Analyze a news article from a given URL using PALM API.
Args:
url (str): The URL of the news article.
defaults (dict): PALM API configuration settings.
question_for_main (str): The question to ask in main categorization.
question_for_positive (str): The question to ask in positive sub-categorization.
question_for_negative (str): The question to ask in negative sub-categorization.
Example:
to analyze a news article from a URL:
>>> analysis = analyze_a_news(url)
Note:
The input 'defaults' should be a dictionary containing the following keys:
"model", "temperature", "candidate_count", "top_k", "top_p", "max_output_tokens","stop_sequences", and "safety_settings".
The input 'question_for_main' should be a string containing the question to ask in main categorization.
The input 'question_for_positive' should be a string containing the question to ask in positive sub-categorization.
The input 'question_for_negative' should be a string containing the question to ask in negative sub-categorization.
Returns:
dict: A dictionary containing the analysis results.
"""
response = requests.get(url)
title = extract_news_title(response.text)
content = get_the_content(url)
try:
palm_response = get_the_palm_response(title, content, defaults, question_for_main)
if "positive" in palm_response.lower():
palm_sub_response = get_the_palm_response(title, content, defaults, question_for_positive)
elif "negative" in palm_response.lower() or "negetive" in palm_response.lower():
palm_sub_response = get_the_palm_response(title, content, defaults, question_for_negative)
elif "neutral" in palm_response.lower():
palm_sub_response = "Neutral"
else:
palm_sub_response = "None of Above"
except Exception as e:
palm_response = "None"
palm_sub_response = "None"
print("----------------------------------------------")
print("Title : ", title)
print("Content : ", content)
print("Palm response : ", palm_response)
print("Palm sub response : ", palm_sub_response)
print("----------------------------------------------")
return {"title": title, "url": url, "palm_response": palm_response, "palm_sub_response": palm_sub_response}
def writing_to_result_csv(analysis: dict) -> None:
"""
Write analysis results to a CSV file.
Args:
analysis (dict): A dictionary containing analysis results.
Returns:
None
Example:
to write analysis results to a CSV file named "result.csv":
>>> writing_to_result_csv(analysis)
Note:
The input 'analysis' should be a dictionary containing the following keys:
"title", "palm_response", "palm_sub_response", and "url".
"""
with open("result.csv", "a", newline="", encoding="utf-8") as file:
csv_writer = csv.DictWriter(file, fieldnames=["title", "palm_response", "palm_sub_response", "url"])
csv_writer.writerow(analysis)
def analyzer():
"""
Analyze news articles from a CSV file and save results to a new CSV file.
Args:
None
Returns:
None
Example:
to analyze news articles from a CSV file named "news_url_list.csv" and save results to a new CSV file named "result.csv":
>>> analyzer()
Reads a CSV file containing news article URLs, analyzes each article using the PALM API,
and writes the analysis results to a new CSV file.
"""
url_list = reading_csv_to_get_url()
print("Total news : ", len(url_list))
for url in url_list:
print("**********************************************")
analysis = analyze_a_news(url, defaults=DEFAULTS_FOR_PALM, question_for_main=QUESTION_FOR_MAIN_CATEGORIES,question_for_positive=QUESTION_FOR_POSITIVE_CATEGORIES,question_for_negative=QUESTION_FOR_NEGATIVE_CATEGORIES)
writing_to_result_csv(analysis)
print("Currently completed : ", url_list.index(url) + 1, " out of ", len(url_list))