-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnews_url_extracter.py
187 lines (145 loc) · 8.39 KB
/
news_url_extracter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import requests,csv
# Page size handed to getting_data() for every API request made by extracter().
NUMBER_OF_NEWS_PER_PAGE = 50
# Prefix that extract_data() prepends to each article's "post_url" value.
REQUEST_HOST = "https://example.com/"
# CSV column names: written as the header row and used as the DictWriter field order.
FIELDS = ["title","post_url"]
NEWS_TYPE = 2  # Category ID sent to the API; the author's site files local news under category 2.
def getting_data(current_page: int, number_of_news_per_page: int, news_type: int, timeout: float = 10.0) -> list:
    """
    Fetch one page of news articles for a category from the pagination API.

    Args:
        current_page (int): The page number of news articles to retrieve.
        number_of_news_per_page (int): The maximum number of articles to fetch per page.
        news_type (int): The type or category ID for the news articles.
        timeout (float, optional): Seconds to wait for the server before giving up.
            Defaults to 10.0. Without a timeout a stalled server would block forever.

    Returns:
        list: The value stored under the "postResponseDto" key of the JSON response
        (expected to be a list of article dictionaries). An empty list is returned when
        the request fails, the body is not valid JSON, the payload is not a JSON object,
        or the key is missing/null.

    Raises:
        Nothing for network or decoding problems: those are reported on stdout and an
        empty list is returned instead, so a caller looping over pages keeps running.

    Example:
        >>> news_data = getting_data(current_page=1, number_of_news_per_page=20, news_type=2)
        Got 10 news

    Note:
        The endpoint URL is built as
        ``.../categoryPostPagination/{news_type}/{current_page}/{number_of_news_per_page}/``.
        Ensure that format is consistent with the API you are interacting with.
    """
    url = f"https://api.example.com/post/categoryPostPagination/{news_type}/{current_page}/{number_of_news_per_page}/"
    try:
        response = requests.get(url, timeout=timeout)  # Bounded wait instead of hanging indefinitely.
        response.raise_for_status()  # Raise an exception for non-2xx status codes.
        data = response.json()  # May raise a ValueError subclass on a non-JSON body.
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError is listed explicitly because older versions of requests raise a plain
        # JSON decode error that is not a RequestException.
        print(f"Error fetching data: {e}")
        return []

    # Guard against a payload that is not a JSON object and against an explicit null
    # under the key; both would otherwise crash on .get() / len().
    news = (data.get("postResponseDto") or []) if isinstance(data, dict) else []
    print(f"Got {len(news)} news")  # Progress output: number of articles fetched.
    return news
def extract_data(news: list, request_host: str) -> list:
    """
    Extract the title and absolute URL from a list of news articles.

    Args:
        news (list): A list of news articles, where each article is a dictionary.
            The "title" value is normally a dictionary holding the text under the
            "rendered" key; a plain string title is accepted as well.
        request_host (str): Prefix that is prepended verbatim to every "post_url"
            (no slash is inserted or removed).

    Returns:
        list: One ``{"title": ..., "post_url": ...}`` dictionary per article that has
        both a non-empty title and a non-empty "post_url". Articles missing either
        value are skipped.

    Example:
        >>> news_data = [
        ...     {"title": {"rendered": "News Article 1"}, "post_url": "news/1"},
        ...     {"title": {"rendered": "News Article 2"}, "post_url": "news/2"},
        ... ]
        >>> extract_data(news_data, "https://example.com/")
        Extracted : News Article 1
        Extracted : News Article 2
        [{'title': 'News Article 1', 'post_url': 'https://example.com/news/1'}, {'title': 'News Article 2', 'post_url': 'https://example.com/news/2'}]
    """
    extracted_data = []
    for news_item in news:
        raw_title = news_item.get("title")
        # A missing or null title used to crash on None.get(); resolve it defensively.
        if isinstance(raw_title, dict):
            title = raw_title.get("rendered")
        elif isinstance(raw_title, str):
            title = raw_title
        else:
            title = None
        post_url = news_item.get("post_url")
        if not (title and post_url):
            continue  # Skip incomplete articles instead of writing partial rows.
        extracted_data.append({"title": title, "post_url": request_host + post_url})
        print(f"Extracted : {title}")
    return extracted_data
def write_data_list_to_csv(news_list: list, file_path: str = "news_url_list.csv", fields: list = None) -> None:
    """
    Append a list of row dictionaries to a CSV file.

    Args:
        news_list (list): Row dictionaries; each one becomes a line in the CSV file.
        file_path (str, optional): Target CSV file. It is created when absent and
            appended to otherwise. Defaults to "news_url_list.csv".
        fields (list, optional): Column order. When omitted (or empty), the keys of the
            first dictionary in 'news_list' are used; with an empty 'news_list' no
            columns are used and nothing is written.

    Returns:
        None

    Example:
        >>> rows = [
        ...     {"title": "News Article 1", "post_url": "https://example.com/news/1"},
        ...     {"title": "News Article 2", "post_url": "https://example.com/news/2"},
        ... ]
        >>> write_data_list_to_csv(rows)

    Note:
        - No header row is written by this function; only the data rows are appended.
        - The file is written with the 'utf-8' encoding.
    """
    # The file is opened before the columns are resolved, so it is created even when
    # there is nothing to write.
    with open(file_path, "a", newline="", encoding="utf-8") as csv_file:
        if fields:
            column_names = fields
        elif news_list:
            column_names = news_list[0].keys()
        else:
            column_names = []
        row_writer = csv.DictWriter(csv_file, fieldnames=column_names)
        row_writer.writerows(news_list)
def extracter(number_of_pages: int = 2, file_path: str = "news_url_list.csv") -> None:
    """
    Fetch, extract, and write news titles and URLs to a CSV file.

    The output file is (re)created with a header row, then each page is fetched from the
    API, reduced to title/URL pairs, and appended to the file. Progress is printed after
    every page.

    Args:
        number_of_pages (int, optional): How many pages to fetch, starting at page
            index 0. Defaults to 2, the value that was previously hard-coded.
        file_path (str, optional): CSV file to write. Any existing file at this path
            is overwritten. Defaults to "news_url_list.csv".

    Returns:
        None

    Example:
        >>> extracter()
        >>> extracter(number_of_pages=5, file_path="local_news.csv")

    Note:
        Relies on the module-level constants NUMBER_OF_NEWS_PER_PAGE, NEWS_TYPE,
        REQUEST_HOST and FIELDS, and on getting_data, extract_data and
        write_data_list_to_csv defined in this module.
    """
    total_news = 0  # Running count of articles written so far.
    print(f"Starting the process with {NUMBER_OF_NEWS_PER_PAGE} news per time")

    # Start from a fresh file containing only the header; the rows are appended later.
    with open(file_path, "w", newline="", encoding="utf-8") as file:
        csv.writer(file).writerow(FIELDS)

    for current_page in range(number_of_pages):
        news = getting_data(current_page, NUMBER_OF_NEWS_PER_PAGE, NEWS_TYPE)
        news_list = extract_data(news, REQUEST_HOST)
        write_data_list_to_csv(news_list, file_path=file_path, fields=FIELDS)
        total_news += len(news_list)
        print("--------------------------------------------------")
        print("Current page:", current_page + 1)  # Shown 1-based for readability.
        print("Total news extracted:", total_news)